From 9ebd67d77c4478d2e0e74705706121462e3d6408 Mon Sep 17 00:00:00 2001 From: Yinzuo Jiang Date: Mon, 2 Dec 2024 11:34:32 +0800 Subject: [PATCH] enhance: update float16/bfloat16 examples (#2388) In the Python ecosystem, users may use basic libraries such as numpy, Pandas, TensorFlow, PyTorch... to process float16/bfloat16 vectors. However, users may have float32 vectors and may be unclear about how to handle float16/bfloat16 vectors in pymilvus. Currently, pymilvus supports numpy arrays as embedding vector inputs. However, numpy itself does not support the bfloat16 type. This PR demonstrates how to convert float arrays in the insert/search APIs. **insert (accepts numpy arrays as input)**: - float32 vector (owned by users) -> float16 vector (input param of insert API). numpy is enough, no extra dependency. - float32 vector (owned by users) -> bfloat16 vector (input param of insert API). Depends on `tf.bfloat16`. PyTorch cannot convert `torch.bfloat16` to a numpy array. **search (the API returns bytes as the float16/bfloat16 vector)**: - float16 vector (bytes). Users can convert it into a numpy array, a PyTorch Tensor, or a TensorFlow Tensor. - bfloat16 vector (bytes). Users can convert it into a PyTorch Tensor or a TensorFlow Tensor. There are many deep learning platforms available in Python, and we can't determine which ecosystem users want. Therefore, this PR doesn't add a float vector conversion method to pymilvus. 
References: - https://github.com/numpy/numpy/issues/19808 - https://github.com/pytorch/pytorch/issues/90574 issue: milvus-io/milvus#37448 Signed-off-by: Yinzuo Jiang Signed-off-by: Yinzuo Jiang --- examples/datatypes/bfloat16_example.py | 12 ++++++++++-- examples/datatypes/float16_example.py | 10 +++++++--- 2 files changed, 17 insertions(+), 5 deletions(-) diff --git a/examples/datatypes/bfloat16_example.py b/examples/datatypes/bfloat16_example.py index 06064794a..cf62ecd4e 100644 --- a/examples/datatypes/bfloat16_example.py +++ b/examples/datatypes/bfloat16_example.py @@ -2,6 +2,7 @@ import random import numpy as np import tensorflow as tf +import torch from pymilvus import ( connections, utility, @@ -20,6 +21,11 @@ def gen_bf16_vectors(num, dim): for _ in range(num): raw_vector = [random.random() for _ in range(dim)] raw_vectors.append(raw_vector) + # Numpy itself does not support bfloat16, use TensorFlow extension instead. + # PyTorch does not support converting bfloat16 vector to numpy array. 
+ # See: + # - https://github.com/numpy/numpy/issues/19808 + # - https://github.com/pytorch/pytorch/issues/90574 bf16_vector = tf.cast(raw_vector, dtype=tf.bfloat16).numpy() bf16_vectors.append(bf16_vector) return raw_vectors, bf16_vectors @@ -57,8 +63,10 @@ def bf16_vector_search(): index_params={"index_type": index_type, "params": index_params, "metric_type": "L2"}) hello_milvus.load() print("index_type = ", index_type) - res = hello_milvus.search(vectors[0:10], vector_field_name, {"metric_type": "L2"}, limit=1) - print(res) + res = hello_milvus.search(vectors[0:10], vector_field_name, {"metric_type": "L2"}, limit=1, output_fields=["bfloat16_vector"]) + print("raw bytes: ", res[0][0].get("bfloat16_vector")) + print("tensorflow Tensor: ", tf.io.decode_raw(res[0][0].get("bfloat16_vector"), tf.bfloat16, little_endian=True)) + print("pytorch Tensor: ", torch.frombuffer(res[0][0].get("bfloat16_vector"), dtype=torch.bfloat16)) hello_milvus.release() hello_milvus.drop_index() diff --git a/examples/datatypes/float16_example.py b/examples/datatypes/float16_example.py index d3cc519d6..cf9bde21a 100644 --- a/examples/datatypes/float16_example.py +++ b/examples/datatypes/float16_example.py @@ -13,13 +13,16 @@ default_fp16_index_params = [{"nlist": 128}] +# float16, little endian +fp16_little = np.dtype('e').newbyteorder('<') + def gen_fp16_vectors(num, dim): raw_vectors = [] fp16_vectors = [] for _ in range(num): raw_vector = [random.random() for _ in range(dim)] raw_vectors.append(raw_vector) - fp16_vector = np.array(raw_vector, dtype=np.float16) + fp16_vector = np.array(raw_vector, dtype=fp16_little) fp16_vectors.append(fp16_vector) return raw_vectors, fp16_vectors @@ -57,8 +60,9 @@ def fp16_vector_search(): index_params={"index_type": index_type, "params": index_params, "metric_type": "L2"}) hello_milvus.load() print("index_type = ", index_type) - res = hello_milvus.search(vectors[0:10], vector_field_name, {"metric_type": "L2"}, limit=1) - print(res) + res = 
hello_milvus.search(vectors[0:10], vector_field_name, {"metric_type": "L2"}, limit=1, output_fields=["float16_vector"]) + print("raw bytes: ", res[0][0].get("float16_vector")) + print("numpy ndarray: ", np.frombuffer(res[0][0].get("float16_vector"), dtype=fp16_little)) hello_milvus.release() hello_milvus.drop_index()