diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index 32a5f5b..db50c47 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -14,7 +14,6 @@ jobs:
     strategy:
       matrix:
         python-version:
-          - '3.8'
           - '3.9'
           - '3.10'
           - '3.11'
@@ -42,6 +41,8 @@ jobs:
           python-version: '${{ matrix.python-version }}'
           cache: pip
       - name: Install Python test dependencies
-        run: pip install -r src/test/python/requirements.txt
+        run: |
+          pip install --upgrade pip
+          pip install -r src/test/python/requirements.txt
       - name: Run Python tests
         run: pytest
diff --git a/.releaserc b/.releaserc
index 080657a..24223a5 100644
--- a/.releaserc
+++ b/.releaserc
@@ -68,73 +68,6 @@
         ],
         "message": "chore(release): ${nextRelease.version} [skip ci]\n\n${nextRelease.notes}"
       }
-    ],
-    [
-      "@semantic-release/release-notes-generator",
-      {
-        "preset": "conventionalcommits",
-        "parserOpts": {
-          "noteKeywords": [
-            "BREAKING CHANGE",
-            "BREAKING CHANGES",
-            "BREAKING"
-          ]
-        },
-        "writerOpts": {
-          "commitsSort": [
-            "subject",
-            "scope"
-          ]
-        },
-        "presetConfig": {
-          "types": [
-            {
-              "type": "feat",
-              "section": "🌞 Features"
-            },
-            {
-              "type": "fix",
-              "section": "🐛 Bug Fixes"
-            },
-            {
-              "type": "perf",
-              "section": "🚀 Performance Improvements"
-            },
-            {
-              "type": "revert",
-              "section": "⏩ Reverts"
-            },
-            {
-              "type": "docs",
-              "section": "📝 Documentation"
-            },
-            {
-              "type": "style",
-              "section": "🎨 Styles"
-            },
-            {
-              "type": "refactor",
-              "section": "🧑‍💻 Code Refactoring"
-            },
-            {
-              "type": "test",
-              "section": "✅ Tests"
-            },
-            {
-              "type": "build",
-              "section": "🤖 Build System"
-            },
-            {
-              "type": "ci",
-              "section": "🔍 Continuous Integration"
-            },
-            {
-              "type": "chore",
-              "section": "🧹 Chores"
-            }
-          ]
-        }
-      }
     ]
   ]
 }
\ No newline at end of file
diff --git a/README.md b/README.md
index 4976d03..c439b34 100644
--- a/README.md
+++ b/README.md
@@ -192,7 +192,8 @@
 You can use the connector as a library in Databricks to ingest data into Qdrant.
 - Select `Install New` to open the library installation modal.
 - Search for `io.qdrant:spark:VERSION` in the Maven packages and click `Install`.
 
-Screenshot 2024-01-05 at 17 20 01 (1)
+Screenshot 2024-04-28 at 11 34 17 AM
+
 ## Datatype support
diff --git a/pom.xml b/pom.xml
index 279cd0c..a613dad 100644
--- a/pom.xml
+++ b/pom.xml
@@ -39,12 +39,12 @@
     <dependency>
       <groupId>io.qdrant</groupId>
       <artifactId>client</artifactId>
-      <version>1.8.0</version>
+      <version>1.9.0</version>
     </dependency>
     <dependency>
       <groupId>com.google.guava</groupId>
       <artifactId>guava</artifactId>
-      <version>32.0.0-jre</version>
+      <version>33.1.0-jre</version>
     </dependency>
     <dependency>
       <groupId>io.grpc</groupId>
@@ -56,7 +56,7 @@
     <dependency>
       <groupId>org.slf4j</groupId>
       <artifactId>slf4j-api</artifactId>
-      <version>2.0.7</version>
+      <version>2.0.13</version>
     </dependency>
     <dependency>
       <groupId>org.apache.spark</groupId>
@@ -75,13 +75,13 @@
     <dependency>
       <groupId>org.testcontainers</groupId>
      <artifactId>qdrant</artifactId>
-      <version>1.19.6</version>
+      <version>1.19.7</version>
       <scope>test</scope>
     </dependency>
     <dependency>
       <groupId>org.testcontainers</groupId>
       <artifactId>junit-jupiter</artifactId>
-      <version>1.19.4</version>
+      <version>1.19.7</version>
       <scope>test</scope>
     </dependency>
   </dependencies>
diff --git a/src/test/python/conftest.py b/src/test/python/conftest.py
index ca7c8dc..11ae480 100644
--- a/src/test/python/conftest.py
+++ b/src/test/python/conftest.py
@@ -1,24 +1,26 @@
 import pytest
-from testcontainers.core.container import DockerContainer  # type: ignore
-from testcontainers.core.waiting_utils import wait_for_logs  # type: ignore
+from testcontainers.qdrant import QdrantContainer
 from qdrant_client import QdrantClient, models
 import uuid
 from pyspark.sql import SparkSession
 from typing import NamedTuple
+from uuid import uuid4
 
 QDRANT_GRPC_PORT = 6334
 QDRANT_EMBEDDING_DIM = 6
 QDRANT_DISTANCE = models.Distance.COSINE
+QDRANT_API_KEY = uuid4().hex
 
 
 class Qdrant(NamedTuple):
     url: str
+    api_key: str
     collection_name: str
     client: QdrantClient
 
 
-qdrant_container = DockerContainer("qdrant/qdrant").with_exposed_ports(QDRANT_GRPC_PORT)
+qdrant_container = QdrantContainer(image="qdrant/qdrant:latest", api_key=QDRANT_API_KEY)
 
 
 # Reference: https://gist.github.com/dizzythinks/f3bb37fd8ab1484bfec79d39ad8a92d3
 
@@ -36,9 +38,6 @@ def get_pom_version():
 @pytest.fixture(scope="module", autouse=True)
 def setup_container(request):
     qdrant_container.start()
-    wait_for_logs(
-        qdrant_container, ".*Actix runtime found; starting in Actix runtime.*", 60
-    )
 
     def remove_container():
         qdrant_container.stop()
@@ -70,6 +69,8 @@ def qdrant():
         host=host,
         grpc_port=grpc_port,
         prefer_grpc=True,
+        api_key=QDRANT_API_KEY,
+        https=False,
     )
 
     collection_name = str(uuid.uuid4())
@@ -99,6 +100,7 @@ def qdrant():
         url=f"http://{host}:{grpc_port}",
         client=client,
         collection_name=collection_name,
+        api_key=QDRANT_API_KEY,
     )
 
     return client.close()
diff --git a/src/test/python/requirements.txt b/src/test/python/requirements.txt
index 4758268..f1f579b 100644
--- a/src/test/python/requirements.txt
+++ b/src/test/python/requirements.txt
@@ -1,4 +1,4 @@
 pyspark==3.5.1
-pytest==8.0.2
-qdrant-client==1.7.3
-testcontainers==3.7.1
+pytest==8.2.0
+qdrant-client==1.9.0
+testcontainers==4.4.0
diff --git a/src/test/python/test_qdrant_ingest.py b/src/test/python/test_qdrant_ingest.py
index 4b389a9..3bec46b 100644
--- a/src/test/python/test_qdrant_ingest.py
+++ b/src/test/python/test_qdrant_ingest.py
@@ -5,7 +5,7 @@ from .conftest import Qdrant
 
 current_directory = os.path.dirname(__file__)
 
-input_file_path = os.path.join(current_directory, '..', 'resources', 'users.json')
+input_file_path = os.path.join(current_directory, "..", "resources", "users.json")
 
 
 def test_upsert_unnamed_vectors(qdrant: Qdrant, spark_session: SparkSession):
@@ -14,12 +14,15 @@ def test_upsert_unnamed_vectors(qdrant: Qdrant, spark_session: SparkSession):
         .option("multiline", "true")
         .json(str(input_file_path))
     )
-    df.write.format("io.qdrant.spark.Qdrant").option(
-        "qdrant_url",
-        qdrant.url,
-    ).option("collection_name", qdrant.collection_name).option(
-        "embedding_field", "dense_vector"
-    ).mode("append").option("schema", df.schema.json()).save()
+    opts = {
+        "qdrant_url": qdrant.url,
+        "collection_name": qdrant.collection_name,
+        "embedding_field": "dense_vector",
+        "api_key": qdrant.api_key,
+        "schema": df.schema.json(),
+    }
+
+    df.write.format("io.qdrant.spark.Qdrant").options(**opts).mode("append").save()
 
     assert (
         qdrant.client.count(qdrant.collection_name).count == df.count()
@@ -32,14 +35,15 @@ def test_upsert_named_vectors(qdrant: Qdrant, spark_session: SparkSession):
         .option("multiline", "true")
         .json(str(input_file_path))
     )
-    df.write.format("io.qdrant.spark.Qdrant").option(
-        "qdrant_url",
-        qdrant.url,
-    ).option("collection_name", qdrant.collection_name).option(
-        "embedding_field", "dense_vector"
-    ).option("vector_name", "dense").option("schema", df.schema.json()).mode(
-        "append"
-    ).save()
+    opts = {
+        "qdrant_url": qdrant.url,
+        "collection_name": qdrant.collection_name,
+        "vector_name": "dense",
+        "schema": df.schema.json(),
+        "api_key": qdrant.api_key,
+    }
+
+    df.write.format("io.qdrant.spark.Qdrant").options(**opts).mode("append").save()
 
     assert (
         qdrant.client.count(qdrant.collection_name).count == df.count()
@@ -54,14 +58,16 @@ def test_upsert_multiple_named_dense_vectors(
         .option("multiline", "true")
         .json(str(input_file_path))
     )
-    df.write.format("io.qdrant.spark.Qdrant").option(
-        "qdrant_url",
-        qdrant.url,
-    ).option("collection_name", qdrant.collection_name).option(
-        "vector_fields", "dense_vector,dense_vector"
-    ).option("vector_names", "dense,another_dense").option(
-        "schema", df.schema.json()
-    ).mode("append").save()
+    opts = {
+        "qdrant_url": qdrant.url,
+        "collection_name": qdrant.collection_name,
+        "vector_fields": "dense_vector,dense_vector",
+        "vector_names": "dense,another_dense",
+        "schema": df.schema.json(),
+        "api_key": qdrant.api_key,
+    }
+
+    df.write.format("io.qdrant.spark.Qdrant").options(**opts).mode("append").save()
 
     assert (
         qdrant.client.count(qdrant.collection_name).count == df.count()
@@ -74,14 +80,17 @@ def test_upsert_sparse_vectors(qdrant: Qdrant, spark_session: SparkSession):
         .option("multiline", "true")
         .json(str(input_file_path))
     )
-    df.write.format("io.qdrant.spark.Qdrant").option(
-        "qdrant_url",
-        qdrant.url,
-    ).option("collection_name", qdrant.collection_name).option(
-        "sparse_vector_value_fields", "sparse_values"
-    ).option("sparse_vector_index_fields", "sparse_indices").option(
-        "sparse_vector_names", "sparse"
-    ).option("schema", df.schema.json()).mode("append").save()
+    opts = {
+        "qdrant_url": qdrant.url,
+        "collection_name": qdrant.collection_name,
+        "sparse_vector_value_fields": "sparse_values",
+        "sparse_vector_index_fields": "sparse_indices",
+        "sparse_vector_names": "sparse",
+        "schema": df.schema.json(),
+        "api_key": qdrant.api_key,
+    }
+
+    df.write.format("io.qdrant.spark.Qdrant").options(**opts).mode("append").save()
 
     assert (
         qdrant.client.count(qdrant.collection_name).count == df.count()
@@ -94,14 +103,17 @@ def test_upsert_multiple_sparse_vectors(qdrant: Qdrant, spark_session: SparkSess
         .option("multiline", "true")
         .json(str(input_file_path))
     )
-    df.write.format("io.qdrant.spark.Qdrant").option(
-        "qdrant_url",
-        qdrant.url,
-    ).option("collection_name", qdrant.collection_name).option(
-        "sparse_vector_value_fields", "sparse_values,sparse_values"
-    ).option("sparse_vector_index_fields", "sparse_indices,sparse_indices").option(
-        "sparse_vector_names", "sparse,another_sparse"
-    ).option("schema", df.schema.json()).mode("append").save()
+    opts = {
+        "qdrant_url": qdrant.url,
+        "collection_name": qdrant.collection_name,
+        "sparse_vector_value_fields": "sparse_values,sparse_values",
+        "sparse_vector_index_fields": "sparse_indices,sparse_indices",
+        "sparse_vector_names": "sparse,another_sparse",
+        "schema": df.schema.json(),
+        "api_key": qdrant.api_key,
+    }
+
+    df.write.format("io.qdrant.spark.Qdrant").options(**opts).mode("append").save()
 
     assert (
         qdrant.client.count(qdrant.collection_name).count == df.count()
@@ -114,16 +126,19 @@ def test_upsert_sparse_named_dense_vectors(qdrant: Qdrant, spark_session: SparkS
         .option("multiline", "true")
         .json(str(input_file_path))
     )
-    df.write.format("io.qdrant.spark.Qdrant").option(
-        "qdrant_url",
-        qdrant.url,
-    ).option("collection_name", qdrant.collection_name).option(
-        "vector_fields", "dense_vector"
-    ).option("vector_names", "dense").option(
-        "sparse_vector_value_fields", "sparse_values"
-    ).option("sparse_vector_index_fields", "sparse_indices").option(
-        "sparse_vector_names", "sparse"
-    ).option("schema", df.schema.json()).mode("append").save()
+    opts = {
+        "qdrant_url": qdrant.url,
+        "collection_name": qdrant.collection_name,
+        "embedding_field": "dense_vector",
+        "vector_name": "dense",
+        "sparse_vector_value_fields": "sparse_values",
+        "sparse_vector_index_fields": "sparse_indices",
+        "sparse_vector_names": "sparse",
+        "schema": df.schema.json(),
+        "api_key": qdrant.api_key,
+    }
+
+    df.write.format("io.qdrant.spark.Qdrant").options(**opts).mode("append").save()
 
     assert (
         qdrant.client.count(qdrant.collection_name).count == df.count()
@@ -138,16 +153,18 @@ def test_upsert_sparse_unnamed_dense_vectors(
         .option("multiline", "true")
         .json(str(input_file_path))
     )
-    df.write.format("io.qdrant.spark.Qdrant").option(
-        "qdrant_url",
-        qdrant.url,
-    ).option("collection_name", qdrant.collection_name).option(
-        "embedding_field", "dense_vector"
-    ).option("sparse_vector_value_fields", "sparse_values").option(
-        "sparse_vector_index_fields", "sparse_indices"
-    ).option("sparse_vector_names", "sparse").option("schema", df.schema.json()).mode(
-        "append"
-    ).save()
+    opts = {
+        "qdrant_url": qdrant.url,
+        "collection_name": qdrant.collection_name,
+        "embedding_field": "dense_vector",
+        "sparse_vector_value_fields": "sparse_values",
+        "sparse_vector_index_fields": "sparse_indices",
+        "sparse_vector_names": "sparse",
+        "schema": df.schema.json(),
+        "api_key": qdrant.api_key,
+    }
+
+    df.write.format("io.qdrant.spark.Qdrant").options(**opts).mode("append").save()
 
     assert (
         qdrant.client.count(qdrant.collection_name).count == df.count()
@@ -162,16 +179,19 @@ def test_upsert_multiple_sparse_dense_vectors(
         .option("multiline", "true")
         .json(str(input_file_path))
     )
-    df.write.format("io.qdrant.spark.Qdrant").option(
-        "qdrant_url",
-        qdrant.url,
-    ).option("collection_name", qdrant.collection_name).option(
-        "vector_fields", "dense_vector,dense_vector"
-    ).option("vector_names", "dense,another_dense").option(
-        "sparse_vector_value_fields", "sparse_values,sparse_values"
-    ).option("sparse_vector_index_fields", "sparse_indices,sparse_indices").option(
-        "sparse_vector_names", "sparse,another_sparse"
-    ).option("schema", df.schema.json()).mode("append").save()
+    opts = {
+        "qdrant_url": qdrant.url,
+        "collection_name": qdrant.collection_name,
+        "embedding_field": "dense_vector",
+        "vector_name": "dense",
+        "sparse_vector_value_fields": "sparse_values,sparse_values",
+        "sparse_vector_index_fields": "sparse_indices,sparse_indices",
+        "sparse_vector_names": "sparse,another_sparse",
+        "schema": df.schema.json(),
+        "api_key": qdrant.api_key,
+    }
+
+    df.write.format("io.qdrant.spark.Qdrant").options(**opts).mode("append").save()
 
     assert (
         qdrant.client.count(qdrant.collection_name).count == df.count()
@@ -185,12 +205,13 @@ def test_upsert_without_vectors(qdrant: Qdrant, spark_session: SparkSession):
         .option("multiline", "true")
         .json(str(input_file_path))
     )
-    df.write.format("io.qdrant.spark.Qdrant").option(
-        "qdrant_url",
-        qdrant.url,
-    ).option("collection_name", qdrant.collection_name).option(
-        "schema", df.schema.json()
-    ).mode("append").save()
+    opts = {
+        "qdrant_url": qdrant.url,
+        "collection_name": qdrant.collection_name,
+        "schema": df.schema.json(),
+        "api_key": qdrant.api_key,
+    }
+    df.write.format("io.qdrant.spark.Qdrant").options(**opts).mode("append").save()
 
     assert (
         qdrant.client.count(qdrant.collection_name).count == df.count()
@@ -204,18 +225,15 @@ def test_custom_id_field(qdrant: Qdrant, spark_session: SparkSession):
         .json(str(input_file_path))
     )
 
-    df = (
-        spark_session.read.schema(schema)
-        .option("multiline", "true")
-        .json(str(input_file_path))
-    )
-    df.write.format("io.qdrant.spark.Qdrant").option(
-        "qdrant_url",
-        qdrant.url,
-    ).option("collection_name", qdrant.collection_name).option(
-        "embedding_field", "dense_vector"
-    ).option("schema", df.schema.json()).option("vector_name", "dense").option(
-        "id_field", "id"
-    ).mode("append").save()
+    opts = {
+        "qdrant_url": qdrant.url,
+        "collection_name": qdrant.collection_name,
+        "embedding_field": "dense_vector",
+        "vector_name": "dense",
+        "id_field": "id",
+        "schema": df.schema.json(),
+        "api_key": qdrant.api_key,
+    }
+    df.write.format("io.qdrant.spark.Qdrant").options(**opts).mode("append").save()
 
     assert len(qdrant.client.retrieve(qdrant.collection_name, [1, 2, 3, 15, 18])) == 5
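
The refactored tests above double as usage documentation for the connector's options map, including the `api_key` option this PR adds coverage for. As a minimal end-user sketch of the same write path against an API-key-protected Qdrant instance (the URL, collection name, key, and input path below are hypothetical placeholders, not values from this diff):

```python
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("qdrant-ingest").getOrCreate()

# Any DataFrame with a dense-vector column works; "users.json" is a
# hypothetical stand-in for the test fixture of the same name.
df = spark.read.option("multiline", "true").json("users.json")

opts = {
    "qdrant_url": "http://qdrant.example:6334",  # gRPC endpoint (placeholder)
    "collection_name": "my_collection",          # placeholder collection
    "embedding_field": "dense_vector",           # column holding the vector
    "api_key": "<QDRANT_API_KEY>",               # placeholder secret
    "schema": df.schema.json(),
}

df.write.format("io.qdrant.spark.Qdrant").options(**opts).mode("append").save()
```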
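Likewise, the conftest.py change swaps the generic `DockerContainer` plus an explicit log-wait for the dedicated `QdrantContainer` module shipped with testcontainers 4.x, which handles readiness on `start()` and wires the API key into the container. A self-contained sketch of that pattern, assuming only the calls that appear in the diff plus the standard testcontainers start/stop context-manager API:

```python
from uuid import uuid4

from qdrant_client import QdrantClient
from testcontainers.qdrant import QdrantContainer

QDRANT_GRPC_PORT = 6334
api_key = uuid4().hex  # throwaway key, as in conftest.py

# The context manager starts the container on entry and stops it on exit.
with QdrantContainer(image="qdrant/qdrant:latest", api_key=api_key) as container:
    client = QdrantClient(
        host=container.get_container_host_ip(),
        grpc_port=int(container.get_exposed_port(QDRANT_GRPC_PORT)),
        prefer_grpc=True,
        api_key=api_key,
        https=False,  # the local container speaks plain HTTP/gRPC
    )
    print(client.get_collections())
    client.close()
```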