Skip to content

Commit

Permalink
Merge pull request #1171 from weaviate/1.26/fix-updating-quantizers
Browse files Browse the repository at this point in the history
Fix updating quantizers between `pq`, `bq`, and `sq` with `hnsw` index
  • Loading branch information
tsmith023 authored Jul 9, 2024
2 parents e1343b6 + 772c25e commit dec088a
Show file tree
Hide file tree
Showing 5 changed files with 290 additions and 22 deletions.
40 changes: 32 additions & 8 deletions integration/test_collection_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
_VectorIndexConfigDynamic,
_VectorIndexConfigFlat,
_VectorIndexConfigHNSW,
_VectorIndexConfigHNSWUpdate,
Configure,
Reconfigure,
Property,
Expand All @@ -31,6 +32,7 @@
_RerankerConfigCreate,
)
from weaviate.collections.classes.tenants import Tenant
from weaviate.exceptions import UnexpectedStatusCodeError, WeaviateInvalidInputError


@pytest.fixture(scope="module")
Expand Down Expand Up @@ -551,6 +553,28 @@ def test_hnsw_with_sq(collection_factory: CollectionFactory) -> None:
assert isinstance(config.vector_index_config.quantizer, _SQConfig)


@pytest.mark.parametrize(
    "vector_index_config",
    [
        Reconfigure.VectorIndex.hnsw(quantizer=Reconfigure.VectorIndex.Quantizer.bq()),
        Reconfigure.VectorIndex.hnsw(quantizer=Reconfigure.VectorIndex.Quantizer.sq()),
    ],
)
def test_update_from_pq_with_hnsw(
    collection_factory: CollectionFactory, vector_index_config: _VectorIndexConfigHNSWUpdate
) -> None:
    """Check that an HNSW collection created with PQ rejects an update to BQ or SQ.

    The collection is created with a PQ quantizer; each parametrized update
    then asks for a different quantizer and must raise
    `WeaviateInvalidInputError`.
    """
    # Build the starting configuration step by step: PQ quantizer first,
    # then the HNSW index that carries it.
    pq_quantizer = Configure.VectorIndex.Quantizer.pq(centroids=128)
    initial_index_config = Configure.VectorIndex.hnsw(
        vector_cache_max_objects=5,
        quantizer=pq_quantizer,
    )
    collection = collection_factory(vector_index_config=initial_index_config)

    with pytest.raises(WeaviateInvalidInputError):
        collection.config.update(vector_index_config=vector_index_config)


def test_update_flat(collection_factory: CollectionFactory) -> None:
collection = collection_factory(
vector_index_config=Configure.VectorIndex.flat(
Expand Down Expand Up @@ -581,14 +605,14 @@ def test_update_flat(collection_factory: CollectionFactory) -> None:
assert isinstance(config.vector_index_config.quantizer, _BQConfig)
assert config.vector_index_config.quantizer.rescore_limit == 20

# Cannot currently disable BQ after it has been enabled
# collection.config.update(
# vectorizer_config=Reconfigure.VectorIndex.flat(
# quantizer=Reconfigure.VectorIndex.Quantizer.bq(enabled=False),
# )
# )
# config = collection.config.get()
# assert config.vector_index_config.quantizer is None
with pytest.raises(UnexpectedStatusCodeError):
# cannot enable/disable BQ after flat index was created
# must only do this on creation
collection.config.update(
vectorizer_config=Reconfigure.VectorIndex.flat(
quantizer=Reconfigure.VectorIndex.Quantizer.bq(enabled=False),
)
)


def test_collection_config_get_shards(collection_factory: CollectionFactory) -> None:
Expand Down
102 changes: 102 additions & 0 deletions test/collection/schema.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,102 @@
from typing import Literal, Optional


def _hnsw_named_vector(quantizer: Optional[Literal["pq", "bq", "sq"]]) -> dict:
    """Build the config of one HNSW named vector with at most one quantizer enabled.

    A new dict is created on every call so that the named vectors of a schema
    never share nested objects (an in-place change to one must not leak into
    another).
    """
    return {
        "vectorIndexConfig": {
            "skip": False,
            "cleanupIntervalSeconds": 300,
            "maxConnections": 32,
            "efConstruction": 128,
            "ef": -1,
            "dynamicEfMin": 100,
            "dynamicEfMax": 500,
            "dynamicEfFactor": 8,
            "vectorCacheMaxObjects": 1000000000000,
            "flatSearchCutoff": 40000,
            "distance": "cosine",
            "pq": {
                "enabled": quantizer == "pq",
                "bitCompression": False,
                "segments": 0,
                "centroids": 256,
                "trainingLimit": 100000,
                "encoder": {"type": "kmeans", "distribution": "log-normal"},
            },
            "bq": {"enabled": quantizer == "bq"},
            "sq": {
                "enabled": quantizer == "sq",
                "trainingLimit": 100000,
                "rescoreLimit": 20,
            },
        },
        "vectorIndexType": "hnsw",
        "vectorizer": {"none": {}},
    }


def multi_vector_schema(quantizer: Optional[Literal["pq", "bq", "sq"]] = None) -> dict:
    """Return a schema dict for class "Something" with two HNSW named vectors.

    Both named vectors ("boi" and "yeh") get the same configuration.

    Arguments:
        quantizer: Name of the quantizer whose `enabled` flag is set to `True`
            in both named vectors. With `None` (the default) every quantizer
            is disabled.

    Returns:
        A freshly built schema dict; no nested object is shared between calls
        or between the two named vectors.
    """
    return {
        "class": "Something",
        "invertedIndexConfig": {
            "bm25": {"b": 0.75, "k1": 1.2},
            "cleanupIntervalSeconds": 60,
            "stopwords": {"additions": None, "preset": "en", "removals": None},
        },
        "multiTenancyConfig": {
            "autoTenantActivation": False,
            "autoTenantCreation": False,
            "enabled": False,
        },
        "properties": [
            {
                "dataType": ["text"],
                "indexFilterable": True,
                "indexRangeFilters": False,
                "indexSearchable": True,
                "name": "name",
                "tokenization": "word",
            }
        ],
        "replicationConfig": {"asyncEnabled": False, "factor": 1},
        "shardingConfig": {
            "virtualPerPhysical": 128,
            "desiredCount": 1,
            "actualCount": 1,
            "desiredVirtualCount": 128,
            "actualVirtualCount": 128,
            "key": "_id",
            "strategy": "hash",
            "function": "murmur3",
        },
        "vectorConfig": {
            "boi": _hnsw_named_vector(quantizer),
            "yeh": _hnsw_named_vector(quantizer),
        },
    }
104 changes: 104 additions & 0 deletions test/collection/test_config_update.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,104 @@
import pytest

from test.collection.schema import multi_vector_schema
from weaviate.collections.classes.config import _CollectionConfigUpdate, Reconfigure
from weaviate.exceptions import WeaviateInvalidInputError


@pytest.mark.parametrize(
    "schema,should_error",
    [
        (multi_vector_schema(), False),
        (multi_vector_schema("bq"), True),
        (multi_vector_schema("sq"), True),
    ],
)
def test_enabling_pq_multi_vector(schema: dict, should_error: bool) -> None:
    """Enable PQ on named vector "boi" and check the merged schema.

    When another quantizer is already enabled in the existing schema the merge
    must raise `WeaviateInvalidInputError`. Otherwise only PQ ends up enabled
    on "boi" and the other named vector, "yeh", is left exactly as it was.
    """
    import copy  # local import keeps this test self-contained

    update = _CollectionConfigUpdate(
        vectorizer_config=[
            Reconfigure.NamedVectors.update(
                name="boi",
                vector_index_config=Reconfigure.VectorIndex.hnsw(
                    quantizer=Reconfigure.VectorIndex.Quantizer.pq()
                ),
            )
        ]
    )
    if should_error:
        with pytest.raises(WeaviateInvalidInputError):
            update.merge_with_existing(schema)
        return

    # merge_with_existing writes into the dict it is given, so comparing
    # against `schema` afterwards could never detect a change to "yeh".
    # Snapshot it first.
    yeh_before = copy.deepcopy(schema["vectorConfig"]["yeh"])

    new_schema = update.merge_with_existing(schema)

    boi_index_config = new_schema["vectorConfig"]["boi"]["vectorIndexConfig"]
    assert boi_index_config["pq"]["enabled"]
    assert not boi_index_config["bq"]["enabled"]
    assert not boi_index_config["sq"]["enabled"]

    assert new_schema["vectorConfig"]["yeh"] == yeh_before


@pytest.mark.parametrize(
    "schema,should_error",
    [
        (multi_vector_schema(), False),
        (multi_vector_schema("pq"), True),
        (multi_vector_schema("sq"), True),
    ],
)
def test_enabling_bq_multi_vector(schema: dict, should_error: bool) -> None:
    """Enable BQ on named vector "boi" and check the merged schema.

    When another quantizer is already enabled in the existing schema the merge
    must raise `WeaviateInvalidInputError`. Otherwise only BQ ends up enabled
    on "boi" and the other named vector, "yeh", is left exactly as it was.
    """
    import copy  # local import keeps this test self-contained

    update = _CollectionConfigUpdate(
        vectorizer_config=[
            Reconfigure.NamedVectors.update(
                name="boi",
                vector_index_config=Reconfigure.VectorIndex.hnsw(
                    quantizer=Reconfigure.VectorIndex.Quantizer.bq()
                ),
            )
        ]
    )
    if should_error:
        with pytest.raises(WeaviateInvalidInputError):
            update.merge_with_existing(schema)
        return

    # merge_with_existing writes into the dict it is given, so comparing
    # against `schema` afterwards could never detect a change to "yeh".
    # Snapshot it first.
    yeh_before = copy.deepcopy(schema["vectorConfig"]["yeh"])

    new_schema = update.merge_with_existing(schema)

    boi_index_config = new_schema["vectorConfig"]["boi"]["vectorIndexConfig"]
    assert not boi_index_config["pq"]["enabled"]
    assert boi_index_config["bq"]["enabled"]
    assert not boi_index_config["sq"]["enabled"]

    assert new_schema["vectorConfig"]["yeh"] == yeh_before


@pytest.mark.parametrize(
    "schema,should_error",
    [
        (multi_vector_schema(), False),
        (multi_vector_schema("pq"), True),
        (multi_vector_schema("bq"), True),
    ],
)
def test_enabling_sq_multi_vector(schema: dict, should_error: bool) -> None:
    """Enable SQ on named vector "boi" and check the merged schema.

    When another quantizer is already enabled in the existing schema the merge
    must raise `WeaviateInvalidInputError`. Otherwise only SQ ends up enabled
    on "boi" and the other named vector, "yeh", is left exactly as it was.
    """
    import copy  # local import keeps this test self-contained

    update = _CollectionConfigUpdate(
        vectorizer_config=[
            Reconfigure.NamedVectors.update(
                name="boi",
                vector_index_config=Reconfigure.VectorIndex.hnsw(
                    quantizer=Reconfigure.VectorIndex.Quantizer.sq()
                ),
            )
        ]
    )
    if should_error:
        with pytest.raises(WeaviateInvalidInputError):
            update.merge_with_existing(schema)
        return

    # merge_with_existing writes into the dict it is given, so comparing
    # against `schema` afterwards could never detect a change to "yeh".
    # Snapshot it first.
    yeh_before = copy.deepcopy(schema["vectorConfig"]["yeh"])

    new_schema = update.merge_with_existing(schema)

    boi_index_config = new_schema["vectorConfig"]["boi"]["vectorIndexConfig"]
    assert not boi_index_config["pq"]["enabled"]
    assert not boi_index_config["bq"]["enabled"]
    assert boi_index_config["sq"]["enabled"]

    assert new_schema["vectorConfig"]["yeh"] == yeh_before
58 changes: 44 additions & 14 deletions weaviate/collections/classes/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -308,6 +308,7 @@ def quantizer_name() -> str:


class _BQConfigUpdate(_QuantizerConfigUpdate):
enabled: Optional[bool]
rescoreLimit: Optional[int]

@staticmethod
Expand Down Expand Up @@ -995,6 +996,39 @@ class _CollectionConfigUpdate(_ConfigUpdateModel):
default=None, alias="multi_tenancy_config"
)

def __check_quantizers(
    self,
    quantizer: Optional[_QuantizerConfigUpdate],
    vector_index_config: dict,
) -> None:
    """Reject an update that targets one quantizer while a different one is enabled.

    Arguments:
        quantizer: The quantizer update requested by the caller, or `None`
            when the update does not touch the quantizer.
        vector_index_config: The existing vector index config whose "pq",
            "bq" and "sq" entries are inspected.

    Raises:
        WeaviateInvalidInputError: If `quantizer` is a PQ, BQ or SQ update and
            one of the other two quantizers is enabled in `vector_index_config`.
    """

    def is_enabled(name: str) -> bool:
        # A missing quantizer entry (or a missing "enabled" flag) counts as
        # disabled. All three quantizers are looked up the same tolerant way,
        # so a config without e.g. a "pq" entry no longer raises KeyError.
        return bool((vector_index_config.get(name) or {}).get("enabled", False))

    # For the requested quantizer, list the quantizers that must not be enabled.
    if isinstance(quantizer, _PQConfigUpdate):
        conflicting = ("bq", "sq")
    elif isinstance(quantizer, _BQConfigUpdate):
        conflicting = ("pq", "sq")
    elif isinstance(quantizer, _SQConfigUpdate):
        conflicting = ("pq", "bq")
    else:
        # No quantizer update requested (or an unknown kind): nothing to check.
        return

    if any(is_enabled(name) for name in conflicting):
        raise WeaviateInvalidInputError(
            f"Cannot update vector index config {vector_index_config} to change its quantizer. To do this, you must recreate the collection."
        )

def merge_with_existing(self, schema: Dict[str, Any]) -> Dict[str, Any]:
if self.description is not None:
schema["description"] = self.description
Expand All @@ -1011,11 +1045,15 @@ def merge_with_existing(self, schema: Dict[str, Any]) -> Dict[str, Any]:
schema["multiTenancyConfig"]
)
if self.vectorIndexConfig is not None:
self.__check_quantizers(self.vectorIndexConfig.quantizer, schema["vectorIndexConfig"])
schema["vectorIndexConfig"] = self.vectorIndexConfig.merge_with_existing(
schema["vectorIndexConfig"]
)
if self.vectorizerConfig is not None:
if isinstance(self.vectorizerConfig, _VectorIndexConfigUpdate):
self.__check_quantizers(
self.vectorizerConfig.quantizer, schema["vectorIndexConfig"]
)
schema["vectorIndexConfig"] = self.vectorizerConfig.merge_with_existing(
schema["vectorIndexConfig"]
)
Expand All @@ -1025,18 +1063,10 @@ def merge_with_existing(self, schema: Dict[str, Any]) -> Dict[str, Any]:
raise WeaviateInvalidInputError(
f"Vector config with name {vc.name} does not exist in the existing vector config"
)
if (
isinstance(vc.vectorIndexConfig.quantizer, _PQConfigUpdate)
and schema["vectorConfig"][vc.name]["vectorIndexConfig"]["bq"]["enabled"]
is True
) or (
isinstance(vc.vectorIndexConfig.quantizer, _BQConfigUpdate)
and schema["vectorConfig"][vc.name]["vectorIndexConfig"]["pq"]["enabled"]
is True
):
raise WeaviateInvalidInputError(
f"Cannot update vector index config with name {vc.name} to change its quantizer"
)
self.__check_quantizers(
vc.vectorIndexConfig.quantizer,
schema["vectorConfig"][vc.name]["vectorIndexConfig"],
)
schema["vectorConfig"][vc.name][
"vectorIndexConfig"
] = vc.vectorIndexConfig.merge_with_existing(
Expand Down Expand Up @@ -2035,15 +2065,15 @@ def pq(
)

@staticmethod
def bq(rescore_limit: Optional[int] = None) -> _BQConfigUpdate:
def bq(rescore_limit: Optional[int] = None, enabled: bool = True) -> _BQConfigUpdate:
"""Create a `_BQConfigUpdate` object to be used when updating the binary quantization (BQ) configuration of Weaviate.
Use this method when defining the `quantizer` argument in the `vector_index` configuration in `collection.update()`.
Arguments:
See [the docs](https://weaviate.io/developers/weaviate/concepts/vector-index#hnsw-with-compression) for a more detailed view!
""" # noqa: D417 (missing argument descriptions in the docstring)
return _BQConfigUpdate(rescoreLimit=rescore_limit)
return _BQConfigUpdate(rescoreLimit=rescore_limit, enabled=enabled)

@staticmethod
def sq(
Expand Down
8 changes: 8 additions & 0 deletions weaviate/collections/classes/config_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,15 @@ def merge_with_existing(self, schema: Dict[str, Any]) -> Dict[str, Any]:
elif isinstance(val, (int, float, bool, str, list)):
schema[cls_field] = val
elif isinstance(val, _QuantizerConfigUpdate):
quantizers = ["pq", "bq", "sq"]
schema[val.quantizer_name()] = val.merge_with_existing(schema[val.quantizer_name()])
for quantizer in quantizers:
if quantizer == val.quantizer_name() or quantizer not in schema:
continue
assert (
"enabled" in schema[quantizer]
), f"Quantizer {quantizer} does not have the enabled field: {schema}"
schema[quantizer]["enabled"] = False
elif isinstance(val, _ConfigUpdateModel):
schema[cls_field] = val.merge_with_existing(schema[cls_field])
else:
Expand Down

0 comments on commit dec088a

Please sign in to comment.