Merge pull request #1171 from weaviate/1.26/fix-updating-quantizers

Fix updating quantizers between `pq`, `bq`, and `sq` with `hnsw` index
weaviate · Jul 9, 2024 · dec088a
2 parents e1343b6 + 772c25e
commit dec088a
Show file tree

Hide file tree

Showing 5 changed files with 290 additions and 22 deletions.
diff --git a/integration/test_collection_config.py b/integration/test_collection_config.py
@@ -15,6 +15,7 @@
     _VectorIndexConfigDynamic,
     _VectorIndexConfigFlat,
     _VectorIndexConfigHNSW,
+    _VectorIndexConfigHNSWUpdate,
     Configure,
     Reconfigure,
     Property,
@@ -31,6 +32,7 @@
     _RerankerConfigCreate,
 )
 from weaviate.collections.classes.tenants import Tenant
+from weaviate.exceptions import UnexpectedStatusCodeError, WeaviateInvalidInputError
 
 
 @pytest.fixture(scope="module")
@@ -551,6 +553,28 @@ def test_hnsw_with_sq(collection_factory: CollectionFactory) -> None:
     assert isinstance(config.vector_index_config.quantizer, _SQConfig)
 
 
+@pytest.mark.parametrize(
+    "vector_index_config",
+    [
+        Reconfigure.VectorIndex.hnsw(quantizer=make_quantizer())
+        for make_quantizer in (
+            Reconfigure.VectorIndex.Quantizer.bq,
+            Reconfigure.VectorIndex.Quantizer.sq,
+        )
+    ],
+)
+def test_update_from_pq_with_hnsw(
+    collection_factory: CollectionFactory, vector_index_config: _VectorIndexConfigHNSWUpdate
+) -> None:
+    """Updating a PQ-configured HNSW collection to BQ or SQ must raise `WeaviateInvalidInputError`."""
+    # Create the collection with PQ configured on its HNSW index.
+    pq_quantizer = Configure.VectorIndex.Quantizer.pq(centroids=128)
+    hnsw_with_pq = Configure.VectorIndex.hnsw(
+        vector_cache_max_objects=5,
+        quantizer=pq_quantizer,
+    )
+    pq_collection = collection_factory(vector_index_config=hnsw_with_pq)
+
+    # Asking for a different quantizer afterwards is expected to be rejected.
+    with pytest.raises(WeaviateInvalidInputError):
+        pq_collection.config.update(vector_index_config=vector_index_config)
+
+
 def test_update_flat(collection_factory: CollectionFactory) -> None:
     collection = collection_factory(
         vector_index_config=Configure.VectorIndex.flat(
@@ -581,14 +605,14 @@ def test_update_flat(collection_factory: CollectionFactory) -> None:
     assert isinstance(config.vector_index_config.quantizer, _BQConfig)
     assert config.vector_index_config.quantizer.rescore_limit == 20
 
-    # Cannot currently disabled BQ after it has been enabled
-    # collection.config.update(
-    #     vectorizer_config=Reconfigure.VectorIndex.flat(
-    #         quantizer=Reconfigure.VectorIndex.Quantizer.bq(enabled=False),
-    #     )
-    # )
-    # config = collection.config.get()
-    # assert config.vector_index_config.quantizer is None
+    with pytest.raises(UnexpectedStatusCodeError):
+        # cannot enable/disable BQ after flat index was created
+        # must only do this on creation
+        collection.config.update(
+            vectorizer_config=Reconfigure.VectorIndex.flat(
+                quantizer=Reconfigure.VectorIndex.Quantizer.bq(enabled=False),
+            )
+        )
 
 
 def test_collection_config_get_shards(collection_factory: CollectionFactory) -> None:

diff --git a/test/collection/schema.py b/test/collection/schema.py
@@ -0,0 +1,102 @@
+from typing import Literal, Optional
+
+
+def _hnsw_named_vector(quantizer: Optional[Literal["pq", "bq", "sq"]]) -> dict:
+    """Build one named-vector entry backed by an HNSW index.
+
+    Every call returns a fresh dict, so entries built for different vector names never
+    share nested objects.
+
+    Arguments:
+        `quantizer`
+            Name of the quantizer whose `enabled` flag is set to `True`; `None` leaves
+            `pq`, `bq` and `sq` all disabled.
+    """
+    return {
+        "vectorIndexConfig": {
+            "skip": False,
+            "cleanupIntervalSeconds": 300,
+            "maxConnections": 32,
+            "efConstruction": 128,
+            "ef": -1,
+            "dynamicEfMin": 100,
+            "dynamicEfMax": 500,
+            "dynamicEfFactor": 8,
+            "vectorCacheMaxObjects": 1000000000000,
+            "flatSearchCutoff": 40000,
+            "distance": "cosine",
+            "pq": {
+                "enabled": quantizer == "pq",
+                "bitCompression": False,
+                "segments": 0,
+                "centroids": 256,
+                "trainingLimit": 100000,
+                "encoder": {"type": "kmeans", "distribution": "log-normal"},
+            },
+            "bq": {"enabled": quantizer == "bq"},
+            "sq": {
+                "enabled": quantizer == "sq",
+                "trainingLimit": 100000,
+                "rescoreLimit": 20,
+            },
+        },
+        "vectorIndexType": "hnsw",
+        "vectorizer": {"none": {}},
+    }
+
+
+def multi_vector_schema(quantizer: Optional[Literal["pq", "bq", "sq"]] = None) -> dict:
+    """Return a collection schema dict with two HNSW named vectors, "boi" and "yeh".
+
+    Both named vectors get the same configuration, built independently of each other.
+
+    Arguments:
+        `quantizer`
+            Name of the quantizer to mark as enabled on both named vectors; `None`
+            (the default) leaves every quantizer disabled.
+    """
+    return {
+        "class": "Something",
+        "invertedIndexConfig": {
+            "bm25": {"b": 0.75, "k1": 1.2},
+            "cleanupIntervalSeconds": 60,
+            "stopwords": {"additions": None, "preset": "en", "removals": None},
+        },
+        "multiTenancyConfig": {
+            "autoTenantActivation": False,
+            "autoTenantCreation": False,
+            "enabled": False,
+        },
+        "properties": [
+            {
+                "dataType": ["text"],
+                "indexFilterable": True,
+                "indexRangeFilters": False,
+                "indexSearchable": True,
+                "name": "name",
+                "tokenization": "word",
+            }
+        ],
+        "replicationConfig": {"asyncEnabled": False, "factor": 1},
+        "shardingConfig": {
+            "virtualPerPhysical": 128,
+            "desiredCount": 1,
+            "actualCount": 1,
+            "desiredVirtualCount": 128,
+            "actualVirtualCount": 128,
+            "key": "_id",
+            "strategy": "hash",
+            "function": "murmur3",
+        },
+        "vectorConfig": {
+            "boi": _hnsw_named_vector(quantizer),
+            "yeh": _hnsw_named_vector(quantizer),
+        },
+    }
diff --git a/test/collection/test_config_update.py b/test/collection/test_config_update.py
@@ -0,0 +1,104 @@
+import pytest
+
+from test.collection.schema import multi_vector_schema
+from weaviate.collections.classes.config import _CollectionConfigUpdate, Reconfigure
+from weaviate.exceptions import WeaviateInvalidInputError
+
+
+@pytest.mark.parametrize(
+    "schema,should_error",
+    [
+        (multi_vector_schema(), False),
+        (multi_vector_schema("bq"), True),
+        (multi_vector_schema("sq"), True),
+    ],
+)
+def test_enabling_pq_multi_vector(schema: dict, should_error: bool) -> None:
+    """Enabling PQ on named vector "boi" works only if BQ and SQ are not already enabled on it."""
+    from copy import deepcopy  # local import so this test does not rely on a module-level one
+
+    update = _CollectionConfigUpdate(
+        vectorizer_config=[
+            Reconfigure.NamedVectors.update(
+                name="boi",
+                vector_index_config=Reconfigure.VectorIndex.hnsw(
+                    quantizer=Reconfigure.VectorIndex.Quantizer.pq()
+                ),
+            )
+        ]
+    )
+    if should_error:
+        with pytest.raises(WeaviateInvalidInputError):
+            update.merge_with_existing(schema)
+        return
+
+    # merge_with_existing assigns into the dict it is handed, so snapshot the vector that
+    # must stay untouched *before* merging; otherwise the final comparison could end up
+    # comparing an object with itself and never fail.
+    yeh_before = deepcopy(schema["vectorConfig"]["yeh"])
+
+    new_schema = update.merge_with_existing(schema)
+
+    assert new_schema["vectorConfig"]["boi"]["vectorIndexConfig"]["pq"]["enabled"]
+    assert not new_schema["vectorConfig"]["boi"]["vectorIndexConfig"]["bq"]["enabled"]
+    assert not new_schema["vectorConfig"]["boi"]["vectorIndexConfig"]["sq"]["enabled"]
+
+    assert new_schema["vectorConfig"]["yeh"] == yeh_before
+
+
+@pytest.mark.parametrize(
+    "schema,should_error",
+    [
+        (multi_vector_schema(), False),
+        (multi_vector_schema("pq"), True),
+        (multi_vector_schema("sq"), True),
+    ],
+)
+def test_enabling_bq_multi_vector(schema: dict, should_error: bool) -> None:
+    """Enabling BQ on named vector "boi" works only if PQ and SQ are not already enabled on it."""
+    from copy import deepcopy  # local import so this test does not rely on a module-level one
+
+    update = _CollectionConfigUpdate(
+        vectorizer_config=[
+            Reconfigure.NamedVectors.update(
+                name="boi",
+                vector_index_config=Reconfigure.VectorIndex.hnsw(
+                    quantizer=Reconfigure.VectorIndex.Quantizer.bq()
+                ),
+            )
+        ]
+    )
+    if should_error:
+        with pytest.raises(WeaviateInvalidInputError):
+            update.merge_with_existing(schema)
+        return
+
+    # merge_with_existing assigns into the dict it is handed, so snapshot the vector that
+    # must stay untouched *before* merging; otherwise the final comparison could end up
+    # comparing an object with itself and never fail.
+    yeh_before = deepcopy(schema["vectorConfig"]["yeh"])
+
+    new_schema = update.merge_with_existing(schema)
+
+    assert not new_schema["vectorConfig"]["boi"]["vectorIndexConfig"]["pq"]["enabled"]
+    assert new_schema["vectorConfig"]["boi"]["vectorIndexConfig"]["bq"]["enabled"]
+    assert not new_schema["vectorConfig"]["boi"]["vectorIndexConfig"]["sq"]["enabled"]
+
+    assert new_schema["vectorConfig"]["yeh"] == yeh_before
+
+
+@pytest.mark.parametrize(
+    "schema,should_error",
+    [
+        (multi_vector_schema(), False),
+        (multi_vector_schema("pq"), True),
+        (multi_vector_schema("bq"), True),
+    ],
+)
+def test_enabling_sq_multi_vector(schema: dict, should_error: bool) -> None:
+    """Enabling SQ on named vector "boi" works only if PQ and BQ are not already enabled on it."""
+    from copy import deepcopy  # local import so this test does not rely on a module-level one
+
+    update = _CollectionConfigUpdate(
+        vectorizer_config=[
+            Reconfigure.NamedVectors.update(
+                name="boi",
+                vector_index_config=Reconfigure.VectorIndex.hnsw(
+                    quantizer=Reconfigure.VectorIndex.Quantizer.sq()
+                ),
+            )
+        ]
+    )
+    if should_error:
+        with pytest.raises(WeaviateInvalidInputError):
+            update.merge_with_existing(schema)
+        return
+
+    # merge_with_existing assigns into the dict it is handed, so snapshot the vector that
+    # must stay untouched *before* merging; otherwise the final comparison could end up
+    # comparing an object with itself and never fail.
+    yeh_before = deepcopy(schema["vectorConfig"]["yeh"])
+
+    new_schema = update.merge_with_existing(schema)
+
+    assert not new_schema["vectorConfig"]["boi"]["vectorIndexConfig"]["pq"]["enabled"]
+    assert not new_schema["vectorConfig"]["boi"]["vectorIndexConfig"]["bq"]["enabled"]
+    assert new_schema["vectorConfig"]["boi"]["vectorIndexConfig"]["sq"]["enabled"]
+
+    assert new_schema["vectorConfig"]["yeh"] == yeh_before
diff --git a/weaviate/collections/classes/config.py b/weaviate/collections/classes/config.py
@@ -308,6 +308,7 @@ def quantizer_name() -> str:
 
 
 class _BQConfigUpdate(_QuantizerConfigUpdate):
+    enabled: Optional[bool]
     rescoreLimit: Optional[int]
 
     @staticmethod
@@ -995,6 +996,39 @@ class _CollectionConfigUpdate(_ConfigUpdateModel):
         default=None, alias="multi_tenancy_config"
     )
 
+    def __check_quantizers(
+        self,
+        quantizer: Optional[_QuantizerConfigUpdate],
+        vector_index_config: dict,
+    ) -> None:
+        """Reject an update that would switch a vector index to a different quantizer.
+
+        Arguments:
+            `quantizer`
+                The quantizer update requested for the index, if any.
+            `vector_index_config`
+                The existing vector index config taken from the current schema.
+
+        Raises:
+            `WeaviateInvalidInputError`
+                If `quantizer` is a PQ, BQ or SQ update while one of the other two quantizers
+                is already enabled in `vector_index_config`.
+        """
+        if isinstance(quantizer, _PQConfigUpdate):
+            other_quantizers = ("bq", "sq")
+        elif isinstance(quantizer, _BQConfigUpdate):
+            other_quantizers = ("pq", "sq")
+        elif isinstance(quantizer, _SQConfigUpdate):
+            other_quantizers = ("pq", "bq")
+        else:
+            # No quantizer update (or not one of the three checked types): nothing to validate.
+            return
+
+        # A quantizer section that is absent from the existing config is treated as disabled.
+        # This now applies to "pq" as well, so a config without it no longer raises KeyError.
+        if any(
+            vector_index_config.get(name, {"enabled": False})["enabled"]
+            for name in other_quantizers
+        ):
+            raise WeaviateInvalidInputError(
+                f"Cannot update vector index config {vector_index_config} to change its quantizer. To do this, you must recreate the collection."
+            )
+
     def merge_with_existing(self, schema: Dict[str, Any]) -> Dict[str, Any]:
         if self.description is not None:
             schema["description"] = self.description
@@ -1011,11 +1045,15 @@ def merge_with_existing(self, schema: Dict[str, Any]) -> Dict[str, Any]:
                 schema["multiTenancyConfig"]
             )
         if self.vectorIndexConfig is not None:
+            self.__check_quantizers(self.vectorIndexConfig.quantizer, schema["vectorIndexConfig"])
             schema["vectorIndexConfig"] = self.vectorIndexConfig.merge_with_existing(
                 schema["vectorIndexConfig"]
             )
         if self.vectorizerConfig is not None:
             if isinstance(self.vectorizerConfig, _VectorIndexConfigUpdate):
+                self.__check_quantizers(
+                    self.vectorizerConfig.quantizer, schema["vectorIndexConfig"]
+                )
                 schema["vectorIndexConfig"] = self.vectorizerConfig.merge_with_existing(
                     schema["vectorIndexConfig"]
                 )
@@ -1025,18 +1063,10 @@ def merge_with_existing(self, schema: Dict[str, Any]) -> Dict[str, Any]:
                         raise WeaviateInvalidInputError(
                             f"Vector config with name {vc.name} does not exist in the existing vector config"
                         )
-                    if (
-                        isinstance(vc.vectorIndexConfig.quantizer, _PQConfigUpdate)
-                        and schema["vectorConfig"][vc.name]["vectorIndexConfig"]["bq"]["enabled"]
-                        is True
-                    ) or (
-                        isinstance(vc.vectorIndexConfig.quantizer, _BQConfigUpdate)
-                        and schema["vectorConfig"][vc.name]["vectorIndexConfig"]["pq"]["enabled"]
-                        is True
-                    ):
-                        raise WeaviateInvalidInputError(
-                            f"Cannot update vector index config with name {vc.name} to change its quantizer"
-                        )
+                    self.__check_quantizers(
+                        vc.vectorIndexConfig.quantizer,
+                        schema["vectorConfig"][vc.name]["vectorIndexConfig"],
+                    )
                     schema["vectorConfig"][vc.name][
                         "vectorIndexConfig"
                     ] = vc.vectorIndexConfig.merge_with_existing(
@@ -2035,15 +2065,15 @@ def pq(
         )
 
     @staticmethod
-    def bq(rescore_limit: Optional[int] = None) -> _BQConfigUpdate:
+    def bq(rescore_limit: Optional[int] = None, enabled: bool = True) -> _BQConfigUpdate:
         """Create a `_BQConfigUpdate` object to be used when updating the binary quantization (BQ) configuration of Weaviate.
 
         Use this method when defining the `quantizer` argument in the `vector_index` configuration in `collection.update()`.
 
         Arguments:
             See [the docs](https://weaviate.io/developers/weaviate/concepts/vector-index#hnsw-with-compression) for a more detailed view!
         """  # noqa: D417 (missing argument descriptions in the docstring)
-        return _BQConfigUpdate(rescoreLimit=rescore_limit)
+        return _BQConfigUpdate(rescoreLimit=rescore_limit, enabled=enabled)
 
     @staticmethod
     def sq(

diff --git a/weaviate/collections/classes/config_base.py b/weaviate/collections/classes/config_base.py
@@ -26,7 +26,15 @@ def merge_with_existing(self, schema: Dict[str, Any]) -> Dict[str, Any]:
             elif isinstance(val, (int, float, bool, str, list)):
                 schema[cls_field] = val
             elif isinstance(val, _QuantizerConfigUpdate):
+                quantizers = ["pq", "bq", "sq"]
                 schema[val.quantizer_name()] = val.merge_with_existing(schema[val.quantizer_name()])
+                for quantizer in quantizers:
+                    if quantizer == val.quantizer_name() or quantizer not in schema:
+                        continue
+                    assert (
+                        "enabled" in schema[quantizer]
+                    ), f"Quantizer {quantizer} does not have the enabled field: {schema}"
+                    schema[quantizer]["enabled"] = False
             elif isinstance(val, _ConfigUpdateModel):
                 schema[cls_field] = val.merge_with_existing(schema[cls_field])
             else: