diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index 32a5f5b..db50c47 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -14,7 +14,6 @@ jobs:
strategy:
matrix:
python-version:
- - '3.8'
- '3.9'
- '3.10'
- '3.11'
@@ -42,6 +41,8 @@ jobs:
python-version: '${{ matrix.python-version }}'
cache: pip
- name: Install Python test dependencies
- run: pip install -r src/test/python/requirements.txt
+ run: |
+ pip install --upgrade pip
+ pip install -r src/test/python/requirements.txt
- name: Run Python tests
run: pytest
diff --git a/.releaserc b/.releaserc
index 080657a..24223a5 100644
--- a/.releaserc
+++ b/.releaserc
@@ -68,73 +68,6 @@
],
"message": "chore(release): ${nextRelease.version} [skip ci]\n\n${nextRelease.notes}"
}
- ],
- [
- "@semantic-release/release-notes-generator",
- {
- "preset": "conventionalcommits",
- "parserOpts": {
- "noteKeywords": [
- "BREAKING CHANGE",
- "BREAKING CHANGES",
- "BREAKING"
- ]
- },
- "writerOpts": {
- "commitsSort": [
- "subject",
- "scope"
- ]
- },
- "presetConfig": {
- "types": [
- {
- "type": "feat",
- "section": "๐ Features"
- },
- {
- "type": "fix",
- "section": "๐ Bug Fixes"
- },
- {
- "type": "perf",
- "section": "๐ Performance Improvements"
- },
- {
- "type": "revert",
- "section": "โฉ Reverts"
- },
- {
- "type": "docs",
- "section": "๐ Documentation"
- },
- {
- "type": "style",
- "section": "๐จ Styles"
- },
- {
- "type": "refactor",
- "section": "๐งโ๐ป Code Refactoring"
- },
- {
- "type": "test",
- "section": "โ
Tests"
- },
- {
- "type": "build",
- "section": "๐ค Build System"
- },
- {
- "type": "ci",
- "section": "๐ Continuous Integration"
- },
- {
- "type": "chore",
- "section": "๐งน Chores"
- }
- ]
- }
- }
]
]
}
\ No newline at end of file
diff --git a/README.md b/README.md
index 4976d03..c439b34 100644
--- a/README.md
+++ b/README.md
@@ -192,7 +192,8 @@ You can use the connector as a library in Databricks to ingest data into Qdrant.
- Select `Install New` to open the library installation modal.
- Search for `io.qdrant:spark:VERSION` in the Maven packages and click `Install`.
-
+
+
## Datatype support
diff --git a/pom.xml b/pom.xml
index 279cd0c..a613dad 100644
--- a/pom.xml
+++ b/pom.xml
@@ -39,12 +39,12 @@
io.qdrant
client
- 1.8.0
+ 1.9.0
com.google.guava
guava
- 32.0.0-jre
+ 33.1.0-jre
io.grpc
@@ -56,7 +56,7 @@
org.slf4j
slf4j-api
- 2.0.7
+ 2.0.13
org.apache.spark
@@ -75,13 +75,13 @@
org.testcontainers
qdrant
- 1.19.6
+ 1.19.7
test
org.testcontainers
junit-jupiter
- 1.19.4
+ 1.19.7
test
diff --git a/src/test/python/conftest.py b/src/test/python/conftest.py
index ca7c8dc..11ae480 100644
--- a/src/test/python/conftest.py
+++ b/src/test/python/conftest.py
@@ -1,24 +1,26 @@
import pytest
-from testcontainers.core.container import DockerContainer # type: ignore
-from testcontainers.core.waiting_utils import wait_for_logs # type: ignore
+from testcontainers.qdrant import QdrantContainer
from qdrant_client import QdrantClient, models
import uuid
from pyspark.sql import SparkSession
from typing import NamedTuple
+from uuid import uuid4
QDRANT_GRPC_PORT = 6334
QDRANT_EMBEDDING_DIM = 6
QDRANT_DISTANCE = models.Distance.COSINE
+QDRANT_API_KEY = uuid4().hex
class Qdrant(NamedTuple):
url: str
+ api_key: str
collection_name: str
client: QdrantClient
-qdrant_container = DockerContainer("qdrant/qdrant").with_exposed_ports(QDRANT_GRPC_PORT)
+qdrant_container = QdrantContainer(image="qdrant/qdrant:latest", api_key=QDRANT_API_KEY)
# Reference: https://gist.github.com/dizzythinks/f3bb37fd8ab1484bfec79d39ad8a92d3
@@ -36,9 +38,6 @@ def get_pom_version():
@pytest.fixture(scope="module", autouse=True)
def setup_container(request):
qdrant_container.start()
- wait_for_logs(
- qdrant_container, ".*Actix runtime found; starting in Actix runtime.*", 60
- )
def remove_container():
qdrant_container.stop()
@@ -70,6 +69,8 @@ def qdrant():
host=host,
grpc_port=grpc_port,
prefer_grpc=True,
+ api_key=QDRANT_API_KEY,
+ https=False,
)
collection_name = str(uuid.uuid4())
@@ -99,6 +100,7 @@ def qdrant():
url=f"http://{host}:{grpc_port}",
client=client,
collection_name=collection_name,
+ api_key=QDRANT_API_KEY,
)
return client.close()
diff --git a/src/test/python/requirements.txt b/src/test/python/requirements.txt
index 4758268..f1f579b 100644
--- a/src/test/python/requirements.txt
+++ b/src/test/python/requirements.txt
@@ -1,4 +1,4 @@
pyspark==3.5.1
-pytest==8.0.2
-qdrant-client==1.7.3
-testcontainers==3.7.1
+pytest==8.2.0
+qdrant-client==1.9.0
+testcontainers==4.4.0
diff --git a/src/test/python/test_qdrant_ingest.py b/src/test/python/test_qdrant_ingest.py
index 4b389a9..3bec46b 100644
--- a/src/test/python/test_qdrant_ingest.py
+++ b/src/test/python/test_qdrant_ingest.py
@@ -5,7 +5,7 @@
from .conftest import Qdrant
current_directory = os.path.dirname(__file__)
-input_file_path = os.path.join(current_directory, '..', 'resources', 'users.json')
+input_file_path = os.path.join(current_directory, "..", "resources", "users.json")
def test_upsert_unnamed_vectors(qdrant: Qdrant, spark_session: SparkSession):
@@ -14,12 +14,15 @@ def test_upsert_unnamed_vectors(qdrant: Qdrant, spark_session: SparkSession):
.option("multiline", "true")
.json(str(input_file_path))
)
- df.write.format("io.qdrant.spark.Qdrant").option(
- "qdrant_url",
- qdrant.url,
- ).option("collection_name", qdrant.collection_name).option(
- "embedding_field", "dense_vector"
- ).mode("append").option("schema", df.schema.json()).save()
+ opts = {
+ "qdrant_url": qdrant.url,
+ "collection_name": qdrant.collection_name,
+ "embedding_field": "dense_vector",
+ "api_key": qdrant.api_key,
+ "schema": df.schema.json(),
+ }
+
+ df.write.format("io.qdrant.spark.Qdrant").options(**opts).mode("append").save()
assert (
qdrant.client.count(qdrant.collection_name).count == df.count()
@@ -32,14 +35,15 @@ def test_upsert_named_vectors(qdrant: Qdrant, spark_session: SparkSession):
.option("multiline", "true")
.json(str(input_file_path))
)
- df.write.format("io.qdrant.spark.Qdrant").option(
- "qdrant_url",
- qdrant.url,
- ).option("collection_name", qdrant.collection_name).option(
- "embedding_field", "dense_vector"
- ).option("vector_name", "dense").option("schema", df.schema.json()).mode(
- "append"
- ).save()
+ opts = {
+ "qdrant_url": qdrant.url,
+ "collection_name": qdrant.collection_name,
+ "vector_name": "dense",
+ "schema": df.schema.json(),
+ "api_key": qdrant.api_key,
+ }
+
+ df.write.format("io.qdrant.spark.Qdrant").options(**opts).mode("append").save()
assert (
qdrant.client.count(qdrant.collection_name).count == df.count()
@@ -54,14 +58,16 @@ def test_upsert_multiple_named_dense_vectors(
.option("multiline", "true")
.json(str(input_file_path))
)
- df.write.format("io.qdrant.spark.Qdrant").option(
- "qdrant_url",
- qdrant.url,
- ).option("collection_name", qdrant.collection_name).option(
- "vector_fields", "dense_vector,dense_vector"
- ).option("vector_names", "dense,another_dense").option(
- "schema", df.schema.json()
- ).mode("append").save()
+ opts = {
+ "qdrant_url": qdrant.url,
+ "collection_name": qdrant.collection_name,
+ "vector_fields": "dense_vector,dense_vector",
+ "vector_names": "dense,another_dense",
+ "schema": df.schema.json(),
+ "api_key": qdrant.api_key,
+ }
+
+ df.write.format("io.qdrant.spark.Qdrant").options(**opts).mode("append").save()
assert (
qdrant.client.count(qdrant.collection_name).count == df.count()
@@ -74,14 +80,17 @@ def test_upsert_sparse_vectors(qdrant: Qdrant, spark_session: SparkSession):
.option("multiline", "true")
.json(str(input_file_path))
)
- df.write.format("io.qdrant.spark.Qdrant").option(
- "qdrant_url",
- qdrant.url,
- ).option("collection_name", qdrant.collection_name).option(
- "sparse_vector_value_fields", "sparse_values"
- ).option("sparse_vector_index_fields", "sparse_indices").option(
- "sparse_vector_names", "sparse"
- ).option("schema", df.schema.json()).mode("append").save()
+ opts = {
+ "qdrant_url": qdrant.url,
+ "collection_name": qdrant.collection_name,
+ "sparse_vector_value_fields": "sparse_values",
+ "sparse_vector_index_fields": "sparse_indices",
+ "sparse_vector_names": "sparse",
+ "schema": df.schema.json(),
+ "api_key": qdrant.api_key,
+ }
+
+ df.write.format("io.qdrant.spark.Qdrant").options(**opts).mode("append").save()
assert (
qdrant.client.count(qdrant.collection_name).count == df.count()
@@ -94,14 +103,17 @@ def test_upsert_multiple_sparse_vectors(qdrant: Qdrant, spark_session: SparkSess
.option("multiline", "true")
.json(str(input_file_path))
)
- df.write.format("io.qdrant.spark.Qdrant").option(
- "qdrant_url",
- qdrant.url,
- ).option("collection_name", qdrant.collection_name).option(
- "sparse_vector_value_fields", "sparse_values,sparse_values"
- ).option("sparse_vector_index_fields", "sparse_indices,sparse_indices").option(
- "sparse_vector_names", "sparse,another_sparse"
- ).option("schema", df.schema.json()).mode("append").save()
+ opts = {
+ "qdrant_url": qdrant.url,
+ "collection_name": qdrant.collection_name,
+ "sparse_vector_value_fields": "sparse_values,sparse_values",
+ "sparse_vector_index_fields": "sparse_indices,sparse_indices",
+ "sparse_vector_names": "sparse,another_sparse",
+ "schema": df.schema.json(),
+ "api_key": qdrant.api_key,
+ }
+
+ df.write.format("io.qdrant.spark.Qdrant").options(**opts).mode("append").save()
assert (
qdrant.client.count(qdrant.collection_name).count == df.count()
@@ -114,16 +126,19 @@ def test_upsert_sparse_named_dense_vectors(qdrant: Qdrant, spark_session: SparkS
.option("multiline", "true")
.json(str(input_file_path))
)
- df.write.format("io.qdrant.spark.Qdrant").option(
- "qdrant_url",
- qdrant.url,
- ).option("collection_name", qdrant.collection_name).option(
- "vector_fields", "dense_vector"
- ).option("vector_names", "dense").option(
- "sparse_vector_value_fields", "sparse_values"
- ).option("sparse_vector_index_fields", "sparse_indices").option(
- "sparse_vector_names", "sparse"
- ).option("schema", df.schema.json()).mode("append").save()
+ opts = {
+ "qdrant_url": qdrant.url,
+ "collection_name": qdrant.collection_name,
+ "embedding_field": "dense_vector",
+ "vector_name": "dense",
+ "sparse_vector_value_fields": "sparse_values",
+ "sparse_vector_index_fields": "sparse_indices",
+ "sparse_vector_names": "sparse",
+ "schema": df.schema.json(),
+ "api_key": qdrant.api_key,
+ }
+
+ df.write.format("io.qdrant.spark.Qdrant").options(**opts).mode("append").save()
assert (
qdrant.client.count(qdrant.collection_name).count == df.count()
@@ -138,16 +153,18 @@ def test_upsert_sparse_unnamed_dense_vectors(
.option("multiline", "true")
.json(str(input_file_path))
)
- df.write.format("io.qdrant.spark.Qdrant").option(
- "qdrant_url",
- qdrant.url,
- ).option("collection_name", qdrant.collection_name).option(
- "embedding_field", "dense_vector"
- ).option("sparse_vector_value_fields", "sparse_values").option(
- "sparse_vector_index_fields", "sparse_indices"
- ).option("sparse_vector_names", "sparse").option("schema", df.schema.json()).mode(
- "append"
- ).save()
+ opts = {
+ "qdrant_url": qdrant.url,
+ "collection_name": qdrant.collection_name,
+ "embedding_field": "dense_vector",
+ "sparse_vector_value_fields": "sparse_values",
+ "sparse_vector_index_fields": "sparse_indices",
+ "sparse_vector_names": "sparse",
+ "schema": df.schema.json(),
+ "api_key": qdrant.api_key,
+ }
+
+ df.write.format("io.qdrant.spark.Qdrant").options(**opts).mode("append").save()
assert (
qdrant.client.count(qdrant.collection_name).count == df.count()
@@ -162,16 +179,19 @@ def test_upsert_multiple_sparse_dense_vectors(
.option("multiline", "true")
.json(str(input_file_path))
)
- df.write.format("io.qdrant.spark.Qdrant").option(
- "qdrant_url",
- qdrant.url,
- ).option("collection_name", qdrant.collection_name).option(
- "vector_fields", "dense_vector,dense_vector"
- ).option("vector_names", "dense,another_dense").option(
- "sparse_vector_value_fields", "sparse_values,sparse_values"
- ).option("sparse_vector_index_fields", "sparse_indices,sparse_indices").option(
- "sparse_vector_names", "sparse,another_sparse"
- ).option("schema", df.schema.json()).mode("append").save()
+ opts = {
+ "qdrant_url": qdrant.url,
+ "collection_name": qdrant.collection_name,
+        "vector_fields": "dense_vector,dense_vector",
+        "vector_names": "dense,another_dense",
+ "sparse_vector_value_fields": "sparse_values,sparse_values",
+ "sparse_vector_index_fields": "sparse_indices,sparse_indices",
+ "sparse_vector_names": "sparse,another_sparse",
+ "schema": df.schema.json(),
+ "api_key": qdrant.api_key,
+ }
+
+ df.write.format("io.qdrant.spark.Qdrant").options(**opts).mode("append").save()
assert (
qdrant.client.count(qdrant.collection_name).count == df.count()
@@ -185,12 +205,13 @@ def test_upsert_without_vectors(qdrant: Qdrant, spark_session: SparkSession):
.option("multiline", "true")
.json(str(input_file_path))
)
- df.write.format("io.qdrant.spark.Qdrant").option(
- "qdrant_url",
- qdrant.url,
- ).option("collection_name", qdrant.collection_name).option(
- "schema", df.schema.json()
- ).mode("append").save()
+ opts = {
+ "qdrant_url": qdrant.url,
+ "collection_name": qdrant.collection_name,
+ "schema": df.schema.json(),
+ "api_key": qdrant.api_key,
+ }
+ df.write.format("io.qdrant.spark.Qdrant").options(**opts).mode("append").save()
assert (
qdrant.client.count(qdrant.collection_name).count == df.count()
@@ -204,18 +225,15 @@ def test_custom_id_field(qdrant: Qdrant, spark_session: SparkSession):
.json(str(input_file_path))
)
- df = (
- spark_session.read.schema(schema)
- .option("multiline", "true")
- .json(str(input_file_path))
- )
- df.write.format("io.qdrant.spark.Qdrant").option(
- "qdrant_url",
- qdrant.url,
- ).option("collection_name", qdrant.collection_name).option(
- "embedding_field", "dense_vector"
- ).option("schema", df.schema.json()).option("vector_name", "dense").option(
- "id_field", "id"
- ).mode("append").save()
+ opts = {
+ "qdrant_url": qdrant.url,
+ "collection_name": qdrant.collection_name,
+ "embedding_field": "dense_vector",
+ "vector_name": "dense",
+ "id_field": "id",
+ "schema": df.schema.json(),
+ "api_key": qdrant.api_key,
+ }
+ df.write.format("io.qdrant.spark.Qdrant").options(**opts).mode("append").save()
assert len(qdrant.client.retrieve(qdrant.collection_name, [1, 2, 3, 15, 18])) == 5