From 9ebd67d77c4478d2e0e74705706121462e3d6408 Mon Sep 17 00:00:00 2001 From: Yinzuo Jiang Date: Mon, 2 Dec 2024 11:34:32 +0800 Subject: [PATCH] enhance: update float16/bfloat16 examples (#2388) In the Python ecosystem, users may use basic libraries such as numpy, Pandas, TensorFlow, PyTorch... to process float16/bfloat16 vectors. However, users may have float32 vectors and may be unclear about how to handle float16/bfloat16 vectors in pymilvus. Currently, pymilvus supports numpy arrays as embedding vector inputs. However, numpy itself does not support the bfloat16 type. This PR demonstrates how to convert float arrays in the insert/search APIs. **insert (accepts numpy arrays as input)**: - float32 vector (owned by users) -> float16 vector (input param of insert API). numpy is enough, no extra dependency. - float32 vector (owned by users) -> bfloat16 vector (input param of insert API). Depends on `tf.bfloat16`. PyTorch cannot convert `torch.bfloat16` to a numpy array. **search (the API returns bytes as the float16/bfloat16 vector)**: - float16 vector (bytes). Users can convert it into a numpy array, a PyTorch Tensor, or a TensorFlow Tensor. - bfloat16 vector (bytes). Users can convert it into a PyTorch Tensor or a TensorFlow Tensor. There are many deep learning platforms available in Python, and we can't determine which ecosystem users want. Therefore, this PR doesn't add a float vector conversion method to pymilvus. 
References: - https://github.com/numpy/numpy/issues/19808 - https://github.com/pytorch/pytorch/issues/90574 issue: milvus-io/milvus#37448 Signed-off-by: Yinzuo Jiang Signed-off-by: Yinzuo Jiang --- examples/datatypes/bfloat16_example.py | 12 ++++++++++-- examples/datatypes/float16_example.py | 10 +++++++--- 2 files changed, 17 insertions(+), 5 deletions(-) diff --git a/examples/datatypes/bfloat16_example.py b/examples/datatypes/bfloat16_example.py index 06064794a..cf62ecd4e 100644 --- a/examples/datatypes/bfloat16_example.py +++ b/examples/datatypes/bfloat16_example.py @@ -2,6 +2,7 @@ import random import numpy as np import tensorflow as tf +import torch from pymilvus import ( connections, utility, @@ -20,6 +21,11 @@ def gen_bf16_vectors(num, dim): for _ in range(num): raw_vector = [random.random() for _ in range(dim)] raw_vectors.append(raw_vector) + # Numpy itself does not support bfloat16, use TensorFlow extension instead. + # PyTorch does not support converting bfloat16 vector to numpy array. 
+ # See: + # - https://github.com/numpy/numpy/issues/19808 + # - https://github.com/pytorch/pytorch/issues/90574 bf16_vector = tf.cast(raw_vector, dtype=tf.bfloat16).numpy() bf16_vectors.append(bf16_vector) return raw_vectors, bf16_vectors @@ -57,8 +63,10 @@ def bf16_vector_search(): index_params={"index_type": index_type, "params": index_params, "metric_type": "L2"}) hello_milvus.load() print("index_type = ", index_type) - res = hello_milvus.search(vectors[0:10], vector_field_name, {"metric_type": "L2"}, limit=1) - print(res) + res = hello_milvus.search(vectors[0:10], vector_field_name, {"metric_type": "L2"}, limit=1, output_fields=["bfloat16_vector"]) + print("raw bytes: ", res[0][0].get("bfloat16_vector")) + print("tensorflow Tensor: ", tf.io.decode_raw(res[0][0].get("bfloat16_vector"), tf.bfloat16, little_endian=True)) + print("pytorch Tensor: ", torch.frombuffer(res[0][0].get("bfloat16_vector"), dtype=torch.bfloat16)) hello_milvus.release() hello_milvus.drop_index() diff --git a/examples/datatypes/float16_example.py b/examples/datatypes/float16_example.py index d3cc519d6..cf9bde21a 100644 --- a/examples/datatypes/float16_example.py +++ b/examples/datatypes/float16_example.py @@ -13,13 +13,16 @@ default_fp16_index_params = [{"nlist": 128}] +# float16, little endian +fp16_little = np.dtype('e').newbyteorder('<') + def gen_fp16_vectors(num, dim): raw_vectors = [] fp16_vectors = [] for _ in range(num): raw_vector = [random.random() for _ in range(dim)] raw_vectors.append(raw_vector) - fp16_vector = np.array(raw_vector, dtype=np.float16) + fp16_vector = np.array(raw_vector, dtype=fp16_little) fp16_vectors.append(fp16_vector) return raw_vectors, fp16_vectors @@ -57,8 +60,9 @@ def fp16_vector_search(): index_params={"index_type": index_type, "params": index_params, "metric_type": "L2"}) hello_milvus.load() print("index_type = ", index_type) - res = hello_milvus.search(vectors[0:10], vector_field_name, {"metric_type": "L2"}, limit=1) - print(res) + res = 
hello_milvus.search(vectors[0:10], vector_field_name, {"metric_type": "L2"}, limit=1, output_fields=["float16_vector"]) + print("raw bytes: ", res[0][0].get("float16_vector")) + print("numpy ndarray: ", np.frombuffer(res[0][0].get("float16_vector"), dtype=fp16_little)) hello_milvus.release() hello_milvus.drop_index()