diff --git a/.asf.yaml b/.asf.yaml
index 22042b355b2fa..3935a525ff3c4 100644
--- a/.asf.yaml
+++ b/.asf.yaml
@@ -31,6 +31,8 @@ github:
merge: false
squash: true
rebase: true
+ ghp_branch: master
+ ghp_path: /docs
notifications:
pullrequests: reviews@spark.apache.org
diff --git a/.github/labeler.yml b/.github/labeler.yml
index a74b4ab5d5a33..6617acbf9187e 100644
--- a/.github/labeler.yml
+++ b/.github/labeler.yml
@@ -26,16 +26,14 @@ INFRA:
'.asf.yaml',
'.gitattributes',
'.gitignore',
- 'dev/merge_spark_pr.py',
- 'dev/run-tests-jenkins*'
+ 'dev/merge_spark_pr.py'
]
BUILD:
- changed-files:
- all-globs-to-any-file: [
'dev/**/*',
- '!dev/merge_spark_pr.py',
- '!dev/run-tests-jenkins*'
+ '!dev/merge_spark_pr.py'
]
- any-glob-to-any-file: [
'build/**/*',
@@ -199,6 +197,7 @@ YARN:
KUBERNETES:
- changed-files:
- any-glob-to-any-file: [
+ 'bin/docker-image-tool.sh',
'resource-managers/kubernetes/**/*'
]
@@ -225,7 +224,7 @@ DEPLOY:
CONNECT:
- changed-files:
- any-glob-to-any-file: [
- 'connect/**/*',
+ 'sql/connect/**/*',
'connector/connect/**/*',
'python/pyspark/sql/**/connect/**/*',
'python/pyspark/ml/**/connect/**/*'
diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml
index 576f64f3a0869..3117872e21680 100644
--- a/.github/workflows/build_and_test.yml
+++ b/.github/workflows/build_and_test.yml
@@ -58,6 +58,12 @@ jobs:
outputs:
required: ${{ steps.set-outputs.outputs.required }}
image_url: ${{ steps.infra-image-outputs.outputs.image_url }}
+ image_docs_url: ${{ steps.infra-image-docs-outputs.outputs.image_docs_url }}
+ image_docs_url_link: ${{ steps.infra-image-link.outputs.image_docs_url_link }}
+ image_lint_url: ${{ steps.infra-image-lint-outputs.outputs.image_lint_url }}
+ image_lint_url_link: ${{ steps.infra-image-link.outputs.image_lint_url_link }}
+ image_sparkr_url: ${{ steps.infra-image-sparkr-outputs.outputs.image_sparkr_url }}
+ image_sparkr_url_link: ${{ steps.infra-image-link.outputs.image_sparkr_url_link }}
steps:
- name: Checkout Spark repository
uses: actions/checkout@v4
@@ -134,6 +140,44 @@ jobs:
IMG_NAME="apache-spark-ci-image:${{ inputs.branch }}-${{ github.run_id }}"
IMG_URL="ghcr.io/$REPO_OWNER/$IMG_NAME"
echo "image_url=$IMG_URL" >> $GITHUB_OUTPUT
+ - name: Generate infra image URL (Documentation)
+ id: infra-image-docs-outputs
+ run: |
+ # Convert to lowercase to meet Docker repo name requirement
+ REPO_OWNER=$(echo "${{ github.repository_owner }}" | tr '[:upper:]' '[:lower:]')
+ IMG_NAME="apache-spark-ci-image-docs:${{ inputs.branch }}-${{ github.run_id }}"
+ IMG_URL="ghcr.io/$REPO_OWNER/$IMG_NAME"
+ echo "image_docs_url=$IMG_URL" >> $GITHUB_OUTPUT
+ - name: Generate infra image URL (Linter)
+ id: infra-image-lint-outputs
+ run: |
+ # Convert to lowercase to meet Docker repo name requirement
+ REPO_OWNER=$(echo "${{ github.repository_owner }}" | tr '[:upper:]' '[:lower:]')
+ IMG_NAME="apache-spark-ci-image-lint:${{ inputs.branch }}-${{ github.run_id }}"
+ IMG_URL="ghcr.io/$REPO_OWNER/$IMG_NAME"
+ echo "image_lint_url=$IMG_URL" >> $GITHUB_OUTPUT
+ - name: Generate infra image URL (SparkR)
+ id: infra-image-sparkr-outputs
+ run: |
+ # Convert to lowercase to meet Docker repo name requirement
+ REPO_OWNER=$(echo "${{ github.repository_owner }}" | tr '[:upper:]' '[:lower:]')
+ IMG_NAME="apache-spark-ci-image-sparkr:${{ inputs.branch }}-${{ github.run_id }}"
+ IMG_URL="ghcr.io/$REPO_OWNER/$IMG_NAME"
+ echo "image_sparkr_url=$IMG_URL" >> $GITHUB_OUTPUT
+ - name: Link the docker images
+ id: infra-image-link
+ run: |
+ # Set the image URL for job "docs"
+ # Should delete the link and directly use image_docs_url after SPARK 3.x EOL
+ if [[ "${{ inputs.branch }}" == 'branch-3.5' ]]; then
+ echo "image_docs_url_link=${{ steps.infra-image-outputs.outputs.image_url }}" >> $GITHUB_OUTPUT
+ echo "image_lint_url_link=${{ steps.infra-image-outputs.outputs.image_url }}" >> $GITHUB_OUTPUT
+ echo "image_sparkr_url_link=${{ steps.infra-image-outputs.outputs.image_url }}" >> $GITHUB_OUTPUT
+ else
+ echo "image_docs_url_link=${{ steps.infra-image-docs-outputs.outputs.image_docs_url }}" >> $GITHUB_OUTPUT
+ echo "image_lint_url_link=${{ steps.infra-image-lint-outputs.outputs.image_lint_url }}" >> $GITHUB_OUTPUT
+ echo "image_sparkr_url_link=${{ steps.infra-image-sparkr-outputs.outputs.image_sparkr_url }}" >> $GITHUB_OUTPUT
+ fi
# Build: build Spark and run the tests for specified modules.
build:
@@ -264,20 +308,20 @@ jobs:
with:
distribution: zulu
java-version: ${{ matrix.java }}
- - name: Install Python 3.9
+ - name: Install Python 3.11
uses: actions/setup-python@v5
# We should install one Python that is higher than 3+ for SQL and Yarn because:
# - SQL component also has Python related tests, for example, IntegratedUDFTestUtils.
# - Yarn has a Python specific test too, for example, YarnClusterSuite.
if: contains(matrix.modules, 'yarn') || (contains(matrix.modules, 'sql') && !contains(matrix.modules, 'sql-')) || contains(matrix.modules, 'connect')
with:
- python-version: '3.9'
+ python-version: '3.11'
architecture: x64
- - name: Install Python packages (Python 3.9)
+ - name: Install Python packages (Python 3.11)
if: (contains(matrix.modules, 'sql') && !contains(matrix.modules, 'sql-')) || contains(matrix.modules, 'connect')
run: |
- python3.9 -m pip install 'numpy>=1.20.0' pyarrow pandas scipy unittest-xml-reporting 'lxml==4.9.4' 'grpcio==1.62.0' 'grpcio-status==1.62.0' 'protobuf==4.25.1'
- python3.9 -m pip list
+ python3.11 -m pip install 'numpy>=1.20.0' pyarrow pandas scipy unittest-xml-reporting 'lxml==4.9.4' 'grpcio==1.67.0' 'grpcio-status==1.67.0' 'protobuf==5.28.3'
+ python3.11 -m pip list
# Run the tests.
- name: Run tests
env: ${{ fromJSON(inputs.envs) }}
@@ -304,7 +348,7 @@ jobs:
uses: actions/upload-artifact@v4
with:
name: unit-tests-log-${{ matrix.modules }}-${{ matrix.comment }}-${{ matrix.java }}-${{ matrix.hadoop }}-${{ matrix.hive }}
- path: "**/target/unit-tests.log"
+ path: "**/target/*.log"
infra-image:
name: "Base image build"
@@ -343,7 +387,7 @@ jobs:
uses: docker/setup-buildx-action@v3
- name: Build and push
id: docker_build
- uses: docker/build-push-action@v5
+ uses: docker/build-push-action@v6
with:
context: ./dev/infra/
push: true
@@ -351,6 +395,40 @@ jobs:
${{ needs.precondition.outputs.image_url }}
# Use the infra image cache to speed up
cache-from: type=registry,ref=ghcr.io/apache/spark/apache-spark-github-action-image-cache:${{ inputs.branch }}
+ - name: Build and push (Documentation)
+ if: hashFiles('dev/spark-test-image/docs/Dockerfile') != ''
+ id: docker_build_docs
+ uses: docker/build-push-action@v6
+ with:
+ context: ./dev/spark-test-image/docs/
+ push: true
+ tags: |
+ ${{ needs.precondition.outputs.image_docs_url }}
+ # Use the infra image cache to speed up
+ cache-from: type=registry,ref=ghcr.io/apache/spark/apache-spark-github-action-image-docs-cache:${{ inputs.branch }}
+ - name: Build and push (Linter)
+ if: hashFiles('dev/spark-test-image/lint/Dockerfile') != ''
+ id: docker_build_lint
+ uses: docker/build-push-action@v6
+ with:
+ context: ./dev/spark-test-image/lint/
+ push: true
+ tags: |
+ ${{ needs.precondition.outputs.image_lint_url }}
+ # Use the infra image cache to speed up
+ cache-from: type=registry,ref=ghcr.io/apache/spark/apache-spark-github-action-image-lint-cache:${{ inputs.branch }}
+ - name: Build and push (SparkR)
+ if: hashFiles('dev/spark-test-image/sparkr/Dockerfile') != ''
+ id: docker_build_sparkr
+ uses: docker/build-push-action@v6
+ with:
+ context: ./dev/spark-test-image/sparkr/
+ push: true
+ tags: |
+ ${{ needs.precondition.outputs.image_sparkr_url }}
+ # Use the infra image cache to speed up
+ cache-from: type=registry,ref=ghcr.io/apache/spark/apache-spark-github-action-image-sparkr-cache:${{ inputs.branch }}
+
pyspark:
needs: [precondition, infra-image]
@@ -509,7 +587,7 @@ jobs:
runs-on: ubuntu-latest
timeout-minutes: 180
container:
- image: ${{ needs.precondition.outputs.image_url }}
+ image: ${{ needs.precondition.outputs.image_sparkr_url_link }}
env:
HADOOP_PROFILE: ${{ inputs.hadoop }}
HIVE_PROFILE: hive2.3
@@ -606,18 +684,22 @@ jobs:
- name: Breaking change detection against branch-3.5
uses: bufbuild/buf-breaking-action@v1
with:
- input: connect/common/src/main
+ input: sql/connect/common/src/main
against: 'https://github.com/apache/spark.git#branch=branch-3.5,subdir=connector/connect/common/src/main'
- - name: Install Python 3.9
+ - name: Install Python 3.11
uses: actions/setup-python@v5
with:
- python-version: '3.9'
+ python-version: '3.11'
- name: Install dependencies for Python CodeGen check
run: |
- python3.9 -m pip install 'black==23.9.1' 'protobuf==4.25.1' 'mypy==1.8.0' 'mypy-protobuf==3.3.0'
- python3.9 -m pip list
- - name: Python CodeGen check
+ python3.11 -m pip install 'black==23.9.1' 'protobuf==5.28.3' 'mypy==1.8.0' 'mypy-protobuf==3.3.0'
+ python3.11 -m pip list
+ - name: Python CodeGen check for branch-3.5
+ if: inputs.branch == 'branch-3.5'
run: ./dev/connect-check-protos.py
+ - name: Python CodeGen check
+ if: inputs.branch != 'branch-3.5'
+ run: ./dev/check-protos.py
# Static analysis
lint:
@@ -635,7 +717,7 @@ jobs:
PYSPARK_PYTHON: python3.9
GITHUB_PREV_SHA: ${{ github.event.before }}
container:
- image: ${{ needs.precondition.outputs.image_url }}
+ image: ${{ needs.precondition.outputs.image_lint_url_link }}
steps:
- name: Checkout Spark repository
uses: actions/checkout@v4
@@ -702,13 +784,6 @@ jobs:
run: ./dev/lint-java
- name: Spark connect jvm client mima check
run: ./dev/connect-jvm-client-mima-check
- - name: Install Python linter dependencies for branch-3.4
- if: inputs.branch == 'branch-3.4'
- run: |
- # SPARK-44554: Copy from https://github.com/apache/spark/blob/a05c27e85829fe742c1828507a1fd180cdc84b54/.github/workflows/build_and_test.yml#L571-L578
- # Should delete this section after SPARK 3.4 EOL.
- python3.9 -m pip install 'flake8==3.9.0' pydata_sphinx_theme 'mypy==0.920' 'pytest==7.1.3' 'pytest-mypy-plugins==1.9.3' numpydoc 'jinja2<3.0.0' 'black==22.6.0'
- python3.9 -m pip install 'pandas-stubs==1.2.0.53' ipython 'grpcio==1.48.1' 'grpc-stubs==1.24.11' 'googleapis-common-protos-stubs==2.2.0'
- name: Install Python linter dependencies for branch-3.5
if: inputs.branch == 'branch-3.5'
run: |
@@ -716,18 +791,8 @@ jobs:
# Should delete this section after SPARK 3.5 EOL.
python3.9 -m pip install 'flake8==3.9.0' pydata_sphinx_theme 'mypy==0.982' 'pytest==7.1.3' 'pytest-mypy-plugins==1.9.3' numpydoc 'jinja2<3.0.0' 'black==22.6.0'
python3.9 -m pip install 'pandas-stubs==1.2.0.53' ipython 'grpcio==1.56.0' 'grpc-stubs==1.24.11' 'googleapis-common-protos-stubs==2.2.0'
- - name: Install Python dependencies for python linter and documentation generation
- if: inputs.branch != 'branch-3.4' && inputs.branch != 'branch-3.5'
- run: |
- # Should unpin 'sphinxcontrib-*' after upgrading sphinx>5
- # See 'ipython_genutils' in SPARK-38517
- # See 'docutils<0.18.0' in SPARK-39421
- python3.9 -m pip install 'sphinx==4.5.0' mkdocs 'pydata_sphinx_theme>=0.13' sphinx-copybutton nbsphinx numpydoc jinja2 markupsafe 'pyzmq<24.0.0' \
- ipython ipython_genutils sphinx_plotly_directive 'numpy>=1.20.0' pyarrow pandas 'plotly>=4.8' 'docutils<0.18.0' \
- 'flake8==3.9.0' 'mypy==1.8.0' 'pytest==7.1.3' 'pytest-mypy-plugins==1.9.3' 'black==23.9.1' \
- 'pandas-stubs==1.2.0.53' 'grpcio==1.62.0' 'grpc-stubs==1.24.11' 'googleapis-common-protos-stubs==2.2.0' \
- 'sphinxcontrib-applehelp==1.0.4' 'sphinxcontrib-devhelp==1.0.2' 'sphinxcontrib-htmlhelp==2.0.1' 'sphinxcontrib-qthelp==1.0.3' 'sphinxcontrib-serializinghtml==1.1.5'
- python3.9 -m pip list
+ - name: List Python packages
+ run: python3.9 -m pip list
- name: Python linter
run: PYTHON_EXECUTABLE=python3.9 ./dev/lint-python
# Should delete this section after SPARK 3.5 EOL.
@@ -745,16 +810,16 @@ jobs:
if: inputs.branch == 'branch-3.5'
run: if test -f ./dev/connect-check-protos.py; then PATH=$PATH:$HOME/buf/bin PYTHON_EXECUTABLE=python3.9 ./dev/connect-check-protos.py; fi
# Should delete this section after SPARK 3.5 EOL.
- - name: Install JavaScript linter dependencies for branch-3.4, branch-3.5
- if: inputs.branch == 'branch-3.4' || inputs.branch == 'branch-3.5'
+ - name: Install JavaScript linter dependencies for branch-3.5
+ if: inputs.branch == 'branch-3.5'
run: |
apt update
apt-get install -y nodejs npm
- name: JS linter
run: ./dev/lint-js
# Should delete this section after SPARK 3.5 EOL.
- - name: Install R linter dependencies for branch-3.4, branch-3.5
- if: inputs.branch == 'branch-3.4' || inputs.branch == 'branch-3.5'
+ - name: Install R linter dependencies for branch-3.5
+ if: inputs.branch == 'branch-3.5'
run: |
apt update
apt-get install -y libcurl4-openssl-dev libgit2-dev libssl-dev libxml2-dev \
@@ -783,7 +848,7 @@ jobs:
PYSPARK_PYTHON: python3.9
GITHUB_PREV_SHA: ${{ github.event.before }}
container:
- image: ${{ needs.precondition.outputs.image_url }}
+ image: ${{ needs.precondition.outputs.image_docs_url_link }}
steps:
- name: Checkout Spark repository
uses: actions/checkout@v4
@@ -833,20 +898,8 @@ jobs:
with:
distribution: zulu
java-version: ${{ inputs.java }}
- - name: Install Python dependencies for python linter and documentation generation
- if: inputs.branch != 'branch-3.4' && inputs.branch != 'branch-3.5'
- run: |
- # Should unpin 'sphinxcontrib-*' after upgrading sphinx>5
- # See 'ipython_genutils' in SPARK-38517
- # See 'docutils<0.18.0' in SPARK-39421
- python3.9 -m pip install 'sphinx==4.5.0' mkdocs 'pydata_sphinx_theme>=0.13' sphinx-copybutton nbsphinx numpydoc jinja2 markupsafe 'pyzmq<24.0.0' \
- ipython ipython_genutils sphinx_plotly_directive 'numpy>=1.20.0' pyarrow pandas 'plotly>=4.8' 'docutils<0.18.0' \
- 'flake8==3.9.0' 'mypy==1.8.0' 'pytest==7.1.3' 'pytest-mypy-plugins==1.9.3' 'black==23.9.1' \
- 'pandas-stubs==1.2.0.53' 'grpcio==1.62.0' 'grpc-stubs==1.24.11' 'googleapis-common-protos-stubs==2.2.0' \
- 'sphinxcontrib-applehelp==1.0.4' 'sphinxcontrib-devhelp==1.0.2' 'sphinxcontrib-htmlhelp==2.0.1' 'sphinxcontrib-qthelp==1.0.3' 'sphinxcontrib-serializinghtml==1.1.5'
- python3.9 -m pip list
- - name: Install dependencies for documentation generation for branch-3.4, branch-3.5
- if: inputs.branch == 'branch-3.4' || inputs.branch == 'branch-3.5'
+ - name: Install dependencies for documentation generation for branch-3.5
+ if: inputs.branch == 'branch-3.5'
run: |
# pandoc is required to generate PySpark APIs as well in nbsphinx.
apt-get update -y
@@ -860,6 +913,8 @@ jobs:
python3.9 -m pip install ipython_genutils # See SPARK-38517
python3.9 -m pip install sphinx_plotly_directive 'numpy>=1.20.0' pyarrow pandas 'plotly>=4.8'
python3.9 -m pip install 'docutils<0.18.0' # See SPARK-39421
+ - name: List Python packages
+ run: python3.9 -m pip list
- name: Install dependencies for documentation generation
run: |
# Keep the version of Bundler here in sync with the following locations:
@@ -867,20 +922,26 @@ jobs:
# - docs/README.md
gem install bundler -v 2.4.22
cd docs
- bundle install
+ bundle install --retry=100
- name: Run documentation build
run: |
- # We need this link because the jekyll build calls `python`.
- ln -s "$(which python3.9)" "/usr/local/bin/python"
+ # We need this link to make sure `python3` points to `python3.9` which contains the prerequisite packages.
+ ln -s "$(which python3.9)" "/usr/local/bin/python3"
# Build docs first with SKIP_API to ensure they are buildable without requiring any
# language docs to be built beforehand.
- cd docs; SKIP_API=1 bundle exec jekyll build; cd ..
+ cd docs; SKIP_ERRORDOC=1 SKIP_API=1 bundle exec jekyll build; cd ..
if [ -f "./dev/is-changed.py" ]; then
# Skip PySpark and SparkR docs while keeping Scala/Java/SQL docs
pyspark_modules=`cd dev && python3.9 -c "import sparktestsupport.modules as m; print(','.join(m.name for m in m.all_modules if m.name.startswith('pyspark')))"`
if [ `./dev/is-changed.py -m $pyspark_modules` = false ]; then export SKIP_PYTHONDOC=1; fi
if [ `./dev/is-changed.py -m sparkr` = false ]; then export SKIP_RDOC=1; fi
fi
+ # Print the values of environment variables `SKIP_ERRORDOC`, `SKIP_SCALADOC`, `SKIP_PYTHONDOC`, `SKIP_RDOC` and `SKIP_SQLDOC`
+ echo "SKIP_ERRORDOC: $SKIP_ERRORDOC"
+ echo "SKIP_SCALADOC: $SKIP_SCALADOC"
+ echo "SKIP_PYTHONDOC: $SKIP_PYTHONDOC"
+ echo "SKIP_RDOC: $SKIP_RDOC"
+ echo "SKIP_SQLDOC: $SKIP_SQLDOC"
cd docs
bundle exec jekyll build
- name: Tar documentation
@@ -1106,14 +1167,16 @@ jobs:
with:
distribution: zulu
java-version: ${{ inputs.java }}
- - name: start minikube
+ - name: Install R
run: |
- # See more in "Installation" https://minikube.sigs.k8s.io/docs/start/
- curl -LO https://storage.googleapis.com/minikube/releases/latest/minikube-linux-amd64
- sudo install minikube-linux-amd64 /usr/local/bin/minikube
- rm minikube-linux-amd64
+ sudo apt update
+ sudo apt-get install -y r-base
+ - name: Start Minikube
+ uses: medyagh/setup-minikube@v0.0.18
+ with:
# Github Action limit cpu:2, memory: 6947MB, limit to 2U6G for better resource statistic
- minikube start --cpus 2 --memory 6144
+ cpus: 2
+ memory: 6144m
- name: Print K8S pods and nodes info
run: |
kubectl get pods -A
@@ -1126,11 +1189,10 @@ jobs:
export PVC_TESTS_VM_PATH=$PVC_TMP_DIR
minikube mount ${PVC_TESTS_HOST_PATH}:${PVC_TESTS_VM_PATH} --gid=0 --uid=185 &
kubectl create clusterrolebinding serviceaccounts-cluster-admin --clusterrole=cluster-admin --group=system:serviceaccounts || true
- kubectl apply -f https://raw.githubusercontent.com/volcano-sh/volcano/v1.8.2/installer/volcano-development.yaml || true
- if [[ "${{ inputs.branch }}" == 'branch-3.5' || "${{ inputs.branch }}" == 'branch-3.4' ]]; then
+ if [[ "${{ inputs.branch }}" == 'branch-3.5' ]]; then
kubectl apply -f https://raw.githubusercontent.com/volcano-sh/volcano/v1.7.0/installer/volcano-development.yaml || true
else
- kubectl apply -f https://raw.githubusercontent.com/volcano-sh/volcano/v1.8.2/installer/volcano-development.yaml || true
+ kubectl apply -f https://raw.githubusercontent.com/volcano-sh/volcano/v1.9.0/installer/volcano-development.yaml || true
fi
eval $(minikube docker-env)
build/sbt -Phadoop-3 -Psparkr -Pkubernetes -Pvolcano -Pkubernetes-integration-tests -Dspark.kubernetes.test.volcanoMaxConcurrencyJobNum=1 -Dtest.exclude.tags=local "kubernetes-integration-tests/test"
diff --git a/.github/workflows/build_infra_images_cache.yml b/.github/workflows/build_infra_images_cache.yml
index 49b2e2e80d9ac..a6beacedeebd4 100644
--- a/.github/workflows/build_infra_images_cache.yml
+++ b/.github/workflows/build_infra_images_cache.yml
@@ -27,6 +27,9 @@ on:
- 'branch-*'
paths:
- 'dev/infra/Dockerfile'
+ - 'dev/spark-test-image/docs/Dockerfile'
+ - 'dev/spark-test-image/lint/Dockerfile'
+ - 'dev/spark-test-image/sparkr/Dockerfile'
- '.github/workflows/build_infra_images_cache.yml'
# Create infra image when cutting down branches/tags
create:
@@ -51,7 +54,7 @@ jobs:
password: ${{ secrets.GITHUB_TOKEN }}
- name: Build and push
id: docker_build
- uses: docker/build-push-action@v5
+ uses: docker/build-push-action@v6
with:
context: ./dev/infra/
push: true
@@ -60,3 +63,42 @@ jobs:
cache-to: type=registry,ref=ghcr.io/apache/spark/apache-spark-github-action-image-cache:${{ github.ref_name }},mode=max
- name: Image digest
run: echo ${{ steps.docker_build.outputs.digest }}
+ - name: Build and push (Documentation)
+ if: hashFiles('dev/spark-test-image/docs/Dockerfile') != ''
+ id: docker_build_docs
+ uses: docker/build-push-action@v6
+ with:
+ context: ./dev/spark-test-image/docs/
+ push: true
+ tags: ghcr.io/apache/spark/apache-spark-github-action-image-docs-cache:${{ github.ref_name }}-static
+ cache-from: type=registry,ref=ghcr.io/apache/spark/apache-spark-github-action-image-docs-cache:${{ github.ref_name }}
+ cache-to: type=registry,ref=ghcr.io/apache/spark/apache-spark-github-action-image-docs-cache:${{ github.ref_name }},mode=max
+ - name: Image digest (Documentation)
+ if: hashFiles('dev/spark-test-image/docs/Dockerfile') != ''
+ run: echo ${{ steps.docker_build_docs.outputs.digest }}
+ - name: Build and push (Linter)
+ if: hashFiles('dev/spark-test-image/lint/Dockerfile') != ''
+ id: docker_build_lint
+ uses: docker/build-push-action@v6
+ with:
+ context: ./dev/spark-test-image/lint/
+ push: true
+ tags: ghcr.io/apache/spark/apache-spark-github-action-image-lint-cache:${{ github.ref_name }}-static
+ cache-from: type=registry,ref=ghcr.io/apache/spark/apache-spark-github-action-image-lint-cache:${{ github.ref_name }}
+ cache-to: type=registry,ref=ghcr.io/apache/spark/apache-spark-github-action-image-lint-cache:${{ github.ref_name }},mode=max
+ - name: Image digest (Linter)
+ if: hashFiles('dev/spark-test-image/lint/Dockerfile') != ''
+ run: echo ${{ steps.docker_build_lint.outputs.digest }}
+ - name: Build and push (SparkR)
+ if: hashFiles('dev/spark-test-image/sparkr/Dockerfile') != ''
+ id: docker_build_sparkr
+ uses: docker/build-push-action@v6
+ with:
+ context: ./dev/spark-test-image/sparkr/
+ push: true
+ tags: ghcr.io/apache/spark/apache-spark-github-action-image-sparkr-cache:${{ github.ref_name }}-static
+ cache-from: type=registry,ref=ghcr.io/apache/spark/apache-spark-github-action-image-sparkr-cache:${{ github.ref_name }}
+ cache-to: type=registry,ref=ghcr.io/apache/spark/apache-spark-github-action-image-sparkr-cache:${{ github.ref_name }},mode=max
+ - name: Image digest (SparkR)
+ if: hashFiles('dev/spark-test-image/sparkr/Dockerfile') != ''
+ run: echo ${{ steps.docker_build_sparkr.outputs.digest }}
diff --git a/.github/workflows/build_maven.yml b/.github/workflows/build_maven.yml
index c3a23c02f6a61..b5546c61eb11b 100644
--- a/.github/workflows/build_maven.yml
+++ b/.github/workflows/build_maven.yml
@@ -30,8 +30,3 @@ jobs:
name: Run
uses: ./.github/workflows/maven_test.yml
if: github.repository == 'apache/spark'
- with:
- envs: >-
- {
- "SKIP_SPARK_RELEASE_VERSIONS": "3.4.2"
- }
diff --git a/.github/workflows/build_maven_java21_macos14.yml b/.github/workflows/build_maven_java21_macos15.yml
similarity index 92%
rename from .github/workflows/build_maven_java21_macos14.yml
rename to .github/workflows/build_maven_java21_macos15.yml
index fb5e609f4eae0..cc6d0ea4e90da 100644
--- a/.github/workflows/build_maven_java21_macos14.yml
+++ b/.github/workflows/build_maven_java21_macos15.yml
@@ -17,7 +17,7 @@
# under the License.
#
-name: "Build / Maven (master, Scala 2.13, Hadoop 3, JDK 21, macos-14)"
+name: "Build / Maven (master, Scala 2.13, Hadoop 3, JDK 21, MacOS-15)"
on:
schedule:
@@ -32,7 +32,7 @@ jobs:
if: github.repository == 'apache/spark'
with:
java: 21
- os: macos-14
+ os: macos-15
envs: >-
{
"OBJC_DISABLE_INITIALIZE_FORK_SAFETY": "YES"
diff --git a/.github/workflows/build_python_3.11_macos.yml b/.github/workflows/build_python_3.11_macos.yml
new file mode 100644
index 0000000000000..4caae55b5fea8
--- /dev/null
+++ b/.github/workflows/build_python_3.11_macos.yml
@@ -0,0 +1,32 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+
+name: "Build / Python-only (master, Python 3.11, MacOS)"
+
+on:
+ schedule:
+ - cron: '0 21 * * *'
+
+jobs:
+ run-build:
+ permissions:
+ packages: write
+ name: Run
+ uses: ./.github/workflows/python_macos_test.yml
+ if: github.repository == 'apache/spark'
diff --git a/.github/workflows/build_branch34.yml b/.github/workflows/build_python_3.13.yml
similarity index 70%
rename from .github/workflows/build_branch34.yml
rename to .github/workflows/build_python_3.13.yml
index deb6c42407970..6f67cf383584f 100644
--- a/.github/workflows/build_branch34.yml
+++ b/.github/workflows/build_python_3.13.yml
@@ -17,11 +17,11 @@
# under the License.
#
-name: "Build (branch-3.4, Scala 2.13, Hadoop 3, JDK 8)"
+name: "Build / Python-only (master, Python 3.13)"
on:
schedule:
- - cron: '0 9 * * *'
+ - cron: '0 20 * * *'
jobs:
run-build:
@@ -31,21 +31,15 @@ jobs:
uses: ./.github/workflows/build_and_test.yml
if: github.repository == 'apache/spark'
with:
- java: 8
- branch: branch-3.4
+ java: 17
+ branch: master
hadoop: hadoop3
envs: >-
{
- "SCALA_PROFILE": "scala2.13",
- "PYTHON_TO_TEST": "",
- "ORACLE_DOCKER_IMAGE_NAME": "gvenzl/oracle-xe:21.3.0"
+ "PYTHON_TO_TEST": "python3.13"
}
jobs: >-
{
- "build": "true",
- "sparkr": "true",
- "tpcds-1g": "true",
- "docker-integration-tests": "true",
- "k8s-integration-tests": "true",
- "lint" : "true"
+ "pyspark": "true",
+ "pyspark-pandas": "true"
}
diff --git a/.github/workflows/build_branch34_python.yml b/.github/workflows/build_python_3.9.yml
similarity index 88%
rename from .github/workflows/build_branch34_python.yml
rename to .github/workflows/build_python_3.9.yml
index c109ba2dc7922..b2401fcf2aa14 100644
--- a/.github/workflows/build_branch34_python.yml
+++ b/.github/workflows/build_python_3.9.yml
@@ -17,11 +17,11 @@
# under the License.
#
-name: "Build / Python-only (branch-3.4)"
+name: "Build / Python-only (master, Python 3.9)"
on:
schedule:
- - cron: '0 9 * * *'
+ - cron: '0 21 * * *'
jobs:
run-build:
@@ -31,12 +31,12 @@ jobs:
uses: ./.github/workflows/build_and_test.yml
if: github.repository == 'apache/spark'
with:
- java: 8
- branch: branch-3.4
+ java: 17
+ branch: master
hadoop: hadoop3
envs: >-
{
- "PYTHON_TO_TEST": ""
+ "PYTHON_TO_TEST": "python3.9"
}
jobs: >-
{
diff --git a/.github/workflows/build_python_connect.yml b/.github/workflows/build_python_connect.yml
index 8458cdf771b19..471ad31279da4 100644
--- a/.github/workflows/build_python_connect.yml
+++ b/.github/workflows/build_python_connect.yml
@@ -71,7 +71,7 @@ jobs:
python packaging/connect/setup.py sdist
cd dist
pip install pyspark*connect-*.tar.gz
- pip install 'six==1.16.0' 'pandas<=2.2.2' scipy 'plotly>=4.8' 'mlflow>=2.8.1' coverage matplotlib openpyxl 'memory-profiler>=0.61.0' 'scikit-learn>=1.3.2' 'graphviz==0.20.3' torch torchvision torcheval deepspeed unittest-xml-reporting
+ pip install 'six==1.16.0' 'pandas<=2.2.2' scipy 'plotly>=4.8' 'mlflow>=2.8.1' coverage matplotlib openpyxl 'memory-profiler>=0.61.0' 'scikit-learn>=1.3.2' 'graphviz==0.20.3' torch torchvision torcheval deepspeed unittest-xml-reporting 'plotly>=4.8'
- name: Run tests
env:
SPARK_TESTING: 1
@@ -84,7 +84,7 @@ jobs:
# Start a Spark Connect server for local
PYTHONPATH="python/lib/pyspark.zip:python/lib/py4j-0.10.9.7-src.zip:$PYTHONPATH" ./sbin/start-connect-server.sh \
--driver-java-options "-Dlog4j.configurationFile=file:$GITHUB_WORKSPACE/conf/log4j2.properties" \
- --jars "`find connect/server/target -name spark-connect-*SNAPSHOT.jar`,`find connector/protobuf/target -name spark-protobuf-*SNAPSHOT.jar`,`find connector/avro/target -name spark-avro*SNAPSHOT.jar`"
+ --jars "`find connector/protobuf/target -name spark-protobuf-*SNAPSHOT.jar`,`find connector/avro/target -name spark-avro*SNAPSHOT.jar`"
# Remove Py4J and PySpark zipped library to make sure there is no JVM connection
mv python/lib lib.back
@@ -93,7 +93,7 @@ jobs:
# Several tests related to catalog requires to run them sequencially, e.g., writing a table in a listener.
./python/run-tests --parallelism=1 --python-executables=python3 --modules pyspark-connect,pyspark-ml-connect
# None of tests are dependent on each other in Pandas API on Spark so run them in parallel
- ./python/run-tests --parallelism=4 --python-executables=python3 --modules pyspark-pandas-connect-part0,pyspark-pandas-connect-part1,pyspark-pandas-connect-part2,pyspark-pandas-connect-part3
+ ./python/run-tests --parallelism=1 --python-executables=python3 --modules pyspark-pandas-connect-part0,pyspark-pandas-connect-part1,pyspark-pandas-connect-part2,pyspark-pandas-connect-part3
# Stop Spark Connect server.
./sbin/stop-connect-server.sh
@@ -104,7 +104,7 @@ jobs:
PYTHONPATH="python/lib/pyspark.zip:python/lib/py4j-0.10.9.7-src.zip:$PYTHONPATH" ./sbin/start-connect-server.sh \
--master "local-cluster[2, 4, 1024]" \
--driver-java-options "-Dlog4j.configurationFile=file:$GITHUB_WORKSPACE/conf/log4j2.properties" \
- --jars "`find connect/server/target -name spark-connect-*SNAPSHOT.jar`,`find connector/protobuf/target -name spark-protobuf-*SNAPSHOT.jar`,`find connector/avro/target -name spark-avro*SNAPSHOT.jar`"
+ --jars "`find connector/protobuf/target -name spark-protobuf-*SNAPSHOT.jar`,`find connector/avro/target -name spark-avro*SNAPSHOT.jar`"
# Remove Py4J and PySpark zipped library to make sure there is no JVM connection
mv python/lib lib.back
diff --git a/.github/workflows/build_python_connect35.yml b/.github/workflows/build_python_connect35.yml
index b00fdddb4b0e0..ad250d95fb844 100644
--- a/.github/workflows/build_python_connect35.yml
+++ b/.github/workflows/build_python_connect35.yml
@@ -70,7 +70,7 @@ jobs:
pip install 'numpy==1.25.1' 'pyarrow==12.0.1' 'pandas<=2.0.3' scipy unittest-xml-reporting plotly>=4.8 'mlflow>=2.3.1' coverage 'matplotlib==3.7.2' openpyxl 'memory-profiler==0.60.0' 'scikit-learn==1.1.*'
# Add Python deps for Spark Connect.
- pip install 'grpcio>=1.48,<1.57' 'grpcio-status>=1.48,<1.57' 'protobuf==3.20.3' 'googleapis-common-protos==1.56.4' 'graphviz==0.20.3'
+ pip install 'grpcio==1.67.0' 'grpcio-status==1.67.0' 'protobuf==5.28.3' 'googleapis-common-protos==1.65.0' 'graphviz==0.20.3'
# Add torch as a testing dependency for TorchDistributor
pip install 'torch==2.0.1' 'torchvision==0.15.2' torcheval
@@ -87,7 +87,7 @@ jobs:
# Start a Spark Connect server for local
PYTHONPATH="python/lib/pyspark.zip:python/lib/py4j-0.10.9.7-src.zip:$PYTHONPATH" ./sbin/start-connect-server.sh \
--driver-java-options "-Dlog4j.configurationFile=file:$GITHUB_WORKSPACE/conf/log4j2.properties" \
- --jars "`find connect/server/target -name spark-connect-*SNAPSHOT.jar`,`find connector/protobuf/target -name spark-protobuf-*SNAPSHOT.jar`,`find connector/avro/target -name spark-avro*SNAPSHOT.jar`"
+ --jars "`find connector/protobuf/target -name spark-protobuf-*SNAPSHOT.jar`,`find connector/avro/target -name spark-avro*SNAPSHOT.jar`"
# Checkout to branch-3.5 to use the tests in branch-3.5.
cd ..
@@ -98,7 +98,7 @@ jobs:
# Run branch-3.5 tests
./python/run-tests --parallelism=1 --python-executables=python3 --modules pyspark-connect
# None of tests are dependent on each other in Pandas API on Spark so run them in parallel
- ./python/run-tests --parallelism=4 --python-executables=python3 --modules pyspark-pandas-connect,pyspark-pandas-slow-connect
+ ./python/run-tests --parallelism=1 --python-executables=python3 --modules pyspark-pandas-connect,pyspark-pandas-slow-connect
- name: Upload test results to report
if: always()
uses: actions/upload-artifact@v4
diff --git a/.github/workflows/build_python_pypy3.9.yml b/.github/workflows/build_python_pypy3.10.yml
similarity index 96%
rename from .github/workflows/build_python_pypy3.9.yml
rename to .github/workflows/build_python_pypy3.10.yml
index e05071ef034a0..163af2f4aec8b 100644
--- a/.github/workflows/build_python_pypy3.9.yml
+++ b/.github/workflows/build_python_pypy3.10.yml
@@ -17,7 +17,7 @@
# under the License.
#
-name: "Build / Python-only (master, PyPy 3.9)"
+name: "Build / Python-only (master, PyPy 3.10)"
on:
schedule:
diff --git a/.github/workflows/build_sparkr_window.yml b/.github/workflows/build_sparkr_window.yml
index cf879d9ebd306..b97251a461715 100644
--- a/.github/workflows/build_sparkr_window.yml
+++ b/.github/workflows/build_sparkr_window.yml
@@ -16,7 +16,7 @@
# specific language governing permissions and limitations
# under the License.
#
-name: "Build / SparkR-only (master, 4.4.1, windows-2022)"
+name: "Build / SparkR-only (master, 4.4.2, windows-2022)"
on:
schedule:
@@ -50,10 +50,10 @@ jobs:
with:
distribution: zulu
java-version: 17
- - name: Install R 4.4.1
+ - name: Install R 4.4.2
uses: r-lib/actions/setup-r@v2
with:
- r-version: 4.4.1
+ r-version: 4.4.2
- name: Install R dependencies
run: |
Rscript -e "install.packages(c('knitr', 'rmarkdown', 'testthat', 'e1071', 'survival', 'arrow', 'xml2'), repos='https://cloud.r-project.org/')"
@@ -85,6 +85,7 @@ jobs:
shell: cmd
env:
NOT_CRAN: true
+ SPARKR_SUPPRESS_DEPRECATION_WARNING: 1
# See SPARK-27848. Currently installing some dependent packages causes
# "(converted from warning) unable to identify current timezone 'C':" for an unknown reason.
# This environment variable works around to test SparkR against a higher version.
diff --git a/.github/workflows/maven_test.yml b/.github/workflows/maven_test.yml
index fa30bd3abc8a5..6965fb4968af3 100644
--- a/.github/workflows/maven_test.yml
+++ b/.github/workflows/maven_test.yml
@@ -40,7 +40,7 @@ on:
description: OS to run this build.
required: false
type: string
- default: ubuntu-22.04
+ default: ubuntu-latest
envs:
description: Additional environment variables to set when running the tests. Should be in JSON format.
required: false
@@ -178,7 +178,7 @@ jobs:
- name: Install Python packages (Python 3.11)
if: (contains(matrix.modules, 'sql#core')) || contains(matrix.modules, 'connect')
run: |
- python3.11 -m pip install 'numpy>=1.20.0' pyarrow pandas scipy unittest-xml-reporting 'grpcio==1.62.0' 'grpcio-status==1.62.0' 'protobuf==4.25.1'
+ python3.11 -m pip install 'numpy>=1.20.0' pyarrow pandas scipy unittest-xml-reporting 'grpcio==1.67.0' 'grpcio-status==1.67.0' 'protobuf==5.28.3'
python3.11 -m pip list
# Run the tests.
- name: Run tests
@@ -194,7 +194,7 @@ jobs:
if [[ "$INCLUDED_TAGS" != "" ]]; then
./build/mvn $MAVEN_CLI_OPTS -pl "$TEST_MODULES" -Pyarn -Pkubernetes -Pvolcano -Phive -Phive-thriftserver -Phadoop-cloud -Pjvm-profiler -Pspark-ganglia-lgpl -Pkinesis-asl -Djava.version=${JAVA_VERSION/-ea} -Dtest.include.tags="$INCLUDED_TAGS" test -fae
elif [[ "$MODULES_TO_TEST" == "connect" ]]; then
- ./build/mvn $MAVEN_CLI_OPTS -Dtest.exclude.tags="$EXCLUDED_TAGS" -Djava.version=${JAVA_VERSION/-ea} -pl connector/connect/client/jvm,connect/common,connect/server test -fae
+ ./build/mvn $MAVEN_CLI_OPTS -Dtest.exclude.tags="$EXCLUDED_TAGS" -Djava.version=${JAVA_VERSION/-ea} -pl connector/connect/client/jvm,sql/connect/common,sql/connect/server test -fae
elif [[ "$EXCLUDED_TAGS" != "" ]]; then
./build/mvn $MAVEN_CLI_OPTS -pl "$TEST_MODULES" -Pyarn -Pkubernetes -Pvolcano -Phive -Phive-thriftserver -Phadoop-cloud -Pjvm-profiler -Pspark-ganglia-lgpl -Pkinesis-asl -Djava.version=${JAVA_VERSION/-ea} -Dtest.exclude.tags="$EXCLUDED_TAGS" test -fae
elif [[ "$MODULES_TO_TEST" == *"sql#hive-thriftserver"* ]]; then
diff --git a/.github/workflows/pages.yml b/.github/workflows/pages.yml
new file mode 100644
index 0000000000000..8729012c2b8d2
--- /dev/null
+++ b/.github/workflows/pages.yml
@@ -0,0 +1,98 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+
+name: GitHub Pages deployment
+
+on:
+ push:
+ branches:
+ - master
+
+concurrency:
+ group: 'docs preview'
+ cancel-in-progress: false
+
+jobs:
+ docs:
+ name: Build and deploy documentation
+ runs-on: ubuntu-latest
+ permissions:
+ id-token: write
+ pages: write
+ environment:
+ name: github-pages # https://github.com/actions/deploy-pages/issues/271
+ env:
+ SPARK_TESTING: 1 # Reduce some noise in the logs
+ RELEASE_VERSION: 'In-Progress'
+ if: github.repository == 'apache/spark'
+ steps:
+ - name: Checkout Spark repository
+ uses: actions/checkout@v4
+ with:
+ repository: apache/spark
+ ref: 'master'
+ - name: Install Java 17
+ uses: actions/setup-java@v4
+ with:
+ distribution: zulu
+ java-version: 17
+ - name: Install Python 3.9
+ uses: actions/setup-python@v5
+ with:
+ python-version: '3.9'
+ architecture: x64
+ cache: 'pip'
+ - name: Install Python dependencies
+ run: |
+ pip install 'sphinx==4.5.0' mkdocs 'pydata_sphinx_theme>=0.13' sphinx-copybutton nbsphinx numpydoc jinja2 markupsafe 'pyzmq<24.0.0' \
+ ipython ipython_genutils sphinx_plotly_directive 'numpy>=1.20.0' pyarrow 'pandas==2.2.3' 'plotly>=4.8' 'docutils<0.18.0' \
+ 'flake8==3.9.0' 'mypy==1.8.0' 'pytest==7.1.3' 'pytest-mypy-plugins==1.9.3' 'black==23.9.1' \
+ 'pandas-stubs==1.2.0.53' 'grpcio==1.62.0' 'grpcio-status==1.62.0' 'grpc-stubs==1.24.11' 'googleapis-common-protos-stubs==2.2.0' \
+ 'sphinxcontrib-applehelp==1.0.4' 'sphinxcontrib-devhelp==1.0.2' 'sphinxcontrib-htmlhelp==2.0.1' 'sphinxcontrib-qthelp==1.0.3' 'sphinxcontrib-serializinghtml==1.1.5'
+ - name: Install Ruby for documentation generation
+ uses: ruby/setup-ruby@v1
+ with:
+ ruby-version: '3.3'
+ bundler-cache: true
+ - name: Install Pandoc
+ run: |
+ sudo apt-get update -y
+ sudo apt-get install pandoc
+ - name: Install dependencies for documentation generation
+ run: |
+ cd docs
+ gem install bundler -v 2.4.22 -n /usr/local/bin
+ bundle install --retry=100
+ - name: Run documentation build
+ run: |
+ sed -i".tmp1" 's/SPARK_VERSION:.*$/SPARK_VERSION: '"$RELEASE_VERSION"'/g' docs/_config.yml
+ sed -i".tmp2" 's/SPARK_VERSION_SHORT:.*$/SPARK_VERSION_SHORT: '"$RELEASE_VERSION"'/g' docs/_config.yml
+ sed -i".tmp3" "s/'facetFilters':.*$/'facetFilters': [\"version:$RELEASE_VERSION\"]/g" docs/_config.yml
+ sed -i".tmp4" 's/__version__: str = .*$/__version__: str = "'"$RELEASE_VERSION"'"/' python/pyspark/version.py
+ cd docs
+ SKIP_RDOC=1 bundle exec jekyll build
+ - name: Setup Pages
+ uses: actions/configure-pages@v5
+ - name: Upload artifact
+ uses: actions/upload-pages-artifact@v3
+ with:
+ path: 'docs/_site'
+ - name: Deploy to GitHub Pages
+ id: deployment
+ uses: actions/deploy-pages@v4
diff --git a/.github/workflows/publish_snapshot.yml b/.github/workflows/publish_snapshot.yml
index 1b5bd0ba61288..a5854d96a4d1a 100644
--- a/.github/workflows/publish_snapshot.yml
+++ b/.github/workflows/publish_snapshot.yml
@@ -28,7 +28,7 @@ on:
description: 'list of branches to publish (JSON)'
required: true
# keep in sync with default value of strategy matrix 'branch'
- default: '["master", "branch-3.5", "branch-3.4"]'
+ default: '["master", "branch-3.5"]'
jobs:
publish-snapshot:
@@ -38,7 +38,7 @@ jobs:
fail-fast: false
matrix:
# keep in sync with default value of workflow_dispatch input 'branch'
- branch: ${{ fromJSON( inputs.branch || '["master", "branch-3.5", "branch-3.4"]' ) }}
+ branch: ${{ fromJSON( inputs.branch || '["master", "branch-3.5"]' ) }}
steps:
- name: Checkout Spark repository
uses: actions/checkout@v4
@@ -52,13 +52,13 @@ jobs:
restore-keys: |
snapshot-maven-
- name: Install Java 8 for branch-3.x
- if: matrix.branch == 'branch-3.5' || matrix.branch == 'branch-3.4'
+ if: matrix.branch == 'branch-3.5'
uses: actions/setup-java@v4
with:
distribution: temurin
java-version: 8
- name: Install Java 17
- if: matrix.branch != 'branch-3.5' && matrix.branch != 'branch-3.4'
+ if: matrix.branch != 'branch-3.5'
uses: actions/setup-java@v4
with:
distribution: temurin
diff --git a/.github/workflows/python_macos_test.yml b/.github/workflows/python_macos_test.yml
new file mode 100644
index 0000000000000..cca133dab541a
--- /dev/null
+++ b/.github/workflows/python_macos_test.yml
@@ -0,0 +1,162 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+
+name: Build and test PySpark on macOS
+
+on:
+ workflow_call:
+ inputs:
+ java:
+ required: false
+ type: string
+ default: 17
+ python:
+ required: false
+ type: string
+ default: 3.11
+ branch:
+ description: Branch to run the build against
+ required: false
+ type: string
+ default: master
+ hadoop:
+ description: Hadoop version to run with. HADOOP_PROFILE environment variable should accept it.
+ required: false
+ type: string
+ default: hadoop3
+ envs:
+ description: Additional environment variables to set when running the tests. Should be in JSON format.
+ required: false
+ type: string
+ default: '{}'
+jobs:
+ build:
+ name: "PySpark test on macos: ${{ matrix.modules }}"
+ runs-on: macos-15
+ strategy:
+ fail-fast: false
+ matrix:
+ java:
+ - ${{ inputs.java }}
+ python:
+ - ${{inputs.python}}
+ modules:
+ - >-
+ pyspark-sql, pyspark-resource, pyspark-testing
+ - >-
+ pyspark-core, pyspark-errors, pyspark-streaming
+ - >-
+ pyspark-mllib, pyspark-ml, pyspark-ml-connect
+ - >-
+ pyspark-connect
+ - >-
+ pyspark-pandas
+ - >-
+ pyspark-pandas-slow
+ - >-
+ pyspark-pandas-connect-part0
+ - >-
+ pyspark-pandas-connect-part1
+ - >-
+ pyspark-pandas-connect-part2
+ - >-
+ pyspark-pandas-connect-part3
+ env:
+ MODULES_TO_TEST: ${{ matrix.modules }}
+ PYTHON_TO_TEST: python${{inputs.python}}
+ HADOOP_PROFILE: ${{ inputs.hadoop }}
+ HIVE_PROFILE: hive2.3
+ # GitHub Actions' default miniconda to use in pip packaging test.
+ CONDA_PREFIX: /usr/share/miniconda
+ GITHUB_PREV_SHA: ${{ github.event.before }}
+ SPARK_LOCAL_IP: localhost
+ SKIP_UNIDOC: true
+ SKIP_MIMA: true
+ SKIP_PACKAGING: true
+ METASPACE_SIZE: 1g
+ BRANCH: ${{ inputs.branch }}
+ steps:
+ - name: Checkout Spark repository
+ uses: actions/checkout@v4
+ # In order to fetch changed files
+ with:
+ fetch-depth: 0
+ repository: apache/spark
+ ref: ${{ inputs.branch }}
+ - name: Sync the current branch with the latest in Apache Spark
+ if: github.repository != 'apache/spark'
+ run: |
+ echo "APACHE_SPARK_REF=$(git rev-parse HEAD)" >> $GITHUB_ENV
+ git fetch https://github.com/$GITHUB_REPOSITORY.git ${GITHUB_REF#refs/heads/}
+ git -c user.name='Apache Spark Test Account' -c user.email='sparktestacc@gmail.com' merge --no-commit --progress --squash FETCH_HEAD
+ git -c user.name='Apache Spark Test Account' -c user.email='sparktestacc@gmail.com' commit -m "Merged commit" --allow-empty
+ # Cache local repositories. Note that GitHub Actions cache has a 10G limit.
+ - name: Cache SBT and Maven
+ uses: actions/cache@v4
+ with:
+ path: |
+ build/apache-maven-*
+ build/*.jar
+ ~/.sbt
+ key: build-${{ hashFiles('**/pom.xml', 'project/build.properties', 'build/mvn', 'build/sbt', 'build/sbt-launch-lib.bash', 'build/spark-build-info') }}
+ restore-keys: |
+ build-
+ - name: Cache Coursier local repository
+ uses: actions/cache@v4
+ with:
+ path: ~/.cache/coursier
+ key: pyspark-coursier-${{ hashFiles('**/pom.xml', '**/plugins.sbt') }}
+ restore-keys: |
+ pyspark-coursier-
+ - name: Install Java ${{ matrix.java }}
+ uses: actions/setup-java@v4
+ with:
+ distribution: zulu
+ java-version: ${{ matrix.java }}
+ - name: Install Python packages (Python ${{matrix.python}})
+ run: |
+ python${{matrix.python}} -m pip install --ignore-installed 'blinker>=1.6.2'
+ python${{matrix.python}} -m pip install --ignore-installed 'six==1.16.0'
+ python${{matrix.python}} -m pip install numpy 'pyarrow>=15.0.0' 'six==1.16.0' 'pandas==2.2.3' scipy 'plotly>=4.8' 'mlflow>=2.8.1' coverage matplotlib openpyxl 'memory-profiler>=0.61.0' 'scikit-learn>=1.3.2' unittest-xml-reporting && \
+ python${{matrix.python}} -m pip install 'grpcio==1.67.0' 'grpcio-status==1.67.0' 'protobuf==5.28.3' 'googleapis-common-protos==1.65.0' 'graphviz==0.20.3' && \
+ python${{matrix.python}} -m pip cache purge && \
+ python${{matrix.python}} -m pip list
+ # Run the tests.
+ - name: Run tests
+ env: ${{ fromJSON(inputs.envs) }}
+ run: |
+ if [[ "$MODULES_TO_TEST" == *"pyspark-errors"* ]]; then
+ export SKIP_PACKAGING=false
+ echo "Python Packaging Tests Enabled!"
+ fi
+ ./dev/run-tests --parallelism 1 --modules "$MODULES_TO_TEST" --python-executables "$PYTHON_TO_TEST"
+ - name: Upload test results to report
+ env: ${{ fromJSON(inputs.envs) }}
+ if: always()
+ uses: actions/upload-artifact@v4
+ with:
+ name: test-results-${{ matrix.modules }}--${{ matrix.java }}-${{ inputs.hadoop }}-hive2.3-${{ env.PYTHON_TO_TEST }}
+ path: "**/target/test-reports/*.xml"
+ - name: Upload unit tests log files
+ env: ${{ fromJSON(inputs.envs) }}
+ if: ${{ !success() }}
+ uses: actions/upload-artifact@v4
+ with:
+ name: unit-tests-log-${{ matrix.modules }}--${{ matrix.java }}-${{ inputs.hadoop }}-hive2.3-${{ env.PYTHON_TO_TEST }}
+ path: "**/target/unit-tests.log"
diff --git a/.github/workflows/stale.yml b/.github/workflows/stale.yml
index f270673844551..e2db95083efea 100644
--- a/.github/workflows/stale.yml
+++ b/.github/workflows/stale.yml
@@ -25,6 +25,7 @@ on:
jobs:
stale:
+ if: github.repository == 'apache/spark'
runs-on: ubuntu-latest
steps:
- uses: actions/stale@c201d45ef4b0ccbd3bb0616f93bae13e73d0a080 # pin@v1.1.0
diff --git a/.github/workflows/test_report.yml b/.github/workflows/test_report.yml
index c6225e6a1abe5..9ab69af42c818 100644
--- a/.github/workflows/test_report.yml
+++ b/.github/workflows/test_report.yml
@@ -30,14 +30,14 @@ jobs:
runs-on: ubuntu-latest
steps:
- name: Download test results to report
- uses: dawidd6/action-download-artifact@09385b76de790122f4da9c82b17bccf858b9557c # pin@v2
+ uses: dawidd6/action-download-artifact@bf251b5aa9c2f7eeb574a96ee720e24f801b7c11 # pin @v6
with:
github_token: ${{ secrets.GITHUB_TOKEN }}
workflow: ${{ github.event.workflow_run.workflow_id }}
commit: ${{ github.event.workflow_run.head_commit.id }}
workflow_conclusion: completed
- name: Publish test report
- uses: scacap/action-surefire-report@482f012643ed0560e23ef605a79e8e87ca081648 # pin@v1
+ uses: scacap/action-surefire-report@a2911bd1a4412ec18dde2d93b1758b3e56d2a880 # pin @v1.8.0
with:
check_name: Report test results
github_token: ${{ secrets.GITHUB_TOKEN }}
diff --git a/.gitignore b/.gitignore
index 787eb6180c35c..0a4138ec26948 100644
--- a/.gitignore
+++ b/.gitignore
@@ -8,6 +8,7 @@
*.swp
*~
.java-version
+.python-version
.DS_Store
.ammonite
.bloop
@@ -26,6 +27,7 @@
.scala_dependencies
.settings
.vscode
+artifacts/
/lib/
R-unit-tests.log
R/unit-tests.out
diff --git a/.nojekyll b/.nojekyll
new file mode 100644
index 0000000000000..e69de29bb2d1d
diff --git a/LICENSE-binary b/LICENSE-binary
index b6971798e5577..40d28fbe71e6b 100644
--- a/LICENSE-binary
+++ b/LICENSE-binary
@@ -267,6 +267,7 @@ io.fabric8:kubernetes-model-scheduling
io.fabric8:kubernetes-model-storageclass
io.fabric8:zjsonpatch
io.github.java-diff-utils:java-diff-utils
+io.jsonwebtoken:jjwt-api
io.netty:netty-all
io.netty:netty-buffer
io.netty:netty-codec
@@ -401,7 +402,6 @@ org.xerial.snappy:snappy-java
org.yaml:snakeyaml
oro:oro
stax:stax-api
-xerces:xercesImpl
core/src/main/java/org/apache/spark/util/collection/TimSort.java
core/src/main/resources/org/apache/spark/ui/static/bootstrap*
@@ -421,6 +421,11 @@ Python Software Foundation License
python/pyspark/loose_version.py
+BSD 0-Clause
+------------
+org.tukaani:xz
+
+
BSD 2-Clause
------------
com.github.luben:zstd-jni
@@ -507,7 +512,6 @@ Eclipse Distribution License (EDL) 1.0
com.sun.istack:istack-commons-runtime
jakarta.xml.bind:jakarta.xml.bind-api
org.glassfish.jaxb:jaxb-runtime
-org.glassfish.jaxb:txw2
Eclipse Public License (EPL) 2.0
--------------------------------
@@ -520,12 +524,6 @@ org.glassfish.hk2:hk2-locator
org.glassfish.hk2:hk2-utils
org.glassfish.hk2:osgi-resource-locator
-
-Public Domain
--------------
-org.tukaani:xz
-
-
Creative Commons CC0 1.0 Universal Public Domain Dedication
-----------------------------------------------------------
(see LICENSE-CC0.txt)
diff --git a/NOTICE-binary b/NOTICE-binary
index c4cfe0e9f8b31..3f36596b9d6d6 100644
--- a/NOTICE-binary
+++ b/NOTICE-binary
@@ -448,27 +448,6 @@ which has the following notices:
* Alec Wysoker
* Performance and memory usage improvement
-The binary distribution of this product bundles binaries of
-Xerces2 Java Parser 2.9.1,
-which has the following notices:
- * =========================================================================
- == NOTICE file corresponding to section 4(d) of the Apache License, ==
- == Version 2.0, in this case for the Apache Xerces Java distribution. ==
- =========================================================================
-
- Apache Xerces Java
- Copyright 1999-2007 The Apache Software Foundation
-
- This product includes software developed at
- The Apache Software Foundation (http://www.apache.org/).
-
- Portions of this software were originally based on the following:
- - software copyright (c) 1999, IBM Corporation., http://www.ibm.com.
- - software copyright (c) 1999, Sun Microsystems., http://www.sun.com.
- - voluntary contributions made by Paul Eng on behalf of the
- Apache Software Foundation that were originally developed at iClick, Inc.,
- software copyright (c) 1999.
-
Apache Commons Collections
Copyright 2001-2015 The Apache Software Foundation
diff --git a/R/pkg/DESCRIPTION b/R/pkg/DESCRIPTION
index f7dd261c10fd2..49000c62d1063 100644
--- a/R/pkg/DESCRIPTION
+++ b/R/pkg/DESCRIPTION
@@ -57,6 +57,7 @@ Collate:
'types.R'
'utils.R'
'window.R'
+ 'zzz.R'
RoxygenNote: 7.1.2
VignetteBuilder: knitr
NeedsCompilation: no
diff --git a/R/pkg/R/functions.R b/R/pkg/R/functions.R
index b91124f96a6fa..9c825a99be180 100644
--- a/R/pkg/R/functions.R
+++ b/R/pkg/R/functions.R
@@ -3965,19 +3965,11 @@ setMethod("row_number",
#' yields unresolved \code{a.b.c}
#' @return Column object wrapping JVM UnresolvedNamedLambdaVariable
#' @keywords internal
-unresolved_named_lambda_var <- function(...) {
- jc <- newJObject(
- "org.apache.spark.sql.Column",
- newJObject(
- "org.apache.spark.sql.catalyst.expressions.UnresolvedNamedLambdaVariable",
- lapply(list(...), function(x) {
- handledCallJStatic(
- "org.apache.spark.sql.catalyst.expressions.UnresolvedNamedLambdaVariable",
- "freshVarName",
- x)
- })
- )
- )
+unresolved_named_lambda_var <- function(name) {
+ jc <- handledCallJStatic(
+ "org.apache.spark.sql.api.python.PythonSQLUtils",
+ "unresolvedNamedLambdaVariable",
+ name)
column(jc)
}
@@ -3990,7 +3982,6 @@ unresolved_named_lambda_var <- function(...) {
#' @return JVM \code{LambdaFunction} object
#' @keywords internal
create_lambda <- function(fun) {
- as_jexpr <- function(x) callJMethod(x@jc, "expr")
# Process function arguments
parameters <- formals(fun)
@@ -4011,22 +4002,18 @@ create_lambda <- function(fun) {
stopifnot(class(result) == "Column")
# Convert both Columns to Scala expressions
- jexpr <- as_jexpr(result)
-
jargs <- handledCallJStatic(
"org.apache.spark.api.python.PythonUtils",
"toSeq",
- handledCallJStatic(
- "java.util.Arrays", "asList", lapply(args, as_jexpr)
- )
+ handledCallJStatic("java.util.Arrays", "asList", lapply(args, function(x) { x@jc }))
)
# Create Scala LambdaFunction
- newJObject(
- "org.apache.spark.sql.catalyst.expressions.LambdaFunction",
- jexpr,
- jargs,
- FALSE
+ handledCallJStatic(
+ "org.apache.spark.sql.api.python.PythonSQLUtils",
+ "lambdaFunction",
+ result@jc,
+ jargs
)
}
@@ -4039,20 +4026,18 @@ create_lambda <- function(fun) {
#' @return a \code{Column} representing name applied to cols with funs
#' @keywords internal
invoke_higher_order_function <- function(name, cols, funs) {
- as_jexpr <- function(x) {
+ as_col <- function(x) {
if (class(x) == "character") {
x <- column(x)
}
- callJMethod(x@jc, "expr")
+ x@jc
}
-
- jexpr <- do.call(newJObject, c(
- paste("org.apache.spark.sql.catalyst.expressions", name, sep = "."),
- lapply(cols, as_jexpr),
- lapply(funs, create_lambda)
- ))
-
- column(newJObject("org.apache.spark.sql.Column", jexpr))
+ jcol <- handledCallJStatic(
+ "org.apache.spark.sql.api.python.PythonSQLUtils",
+ "fn",
+ name,
+ c(lapply(cols, as_col), lapply(funs, create_lambda))) # check varargs invocation
+ column(jcol)
}
#' @details
@@ -4068,7 +4053,7 @@ setMethod("array_aggregate",
signature(x = "characterOrColumn", initialValue = "Column", merge = "function"),
function(x, initialValue, merge, finish = NULL) {
invoke_higher_order_function(
- "ArrayAggregate",
+ "aggregate",
cols = list(x, initialValue),
funs = if (is.null(finish)) {
list(merge)
@@ -4129,7 +4114,7 @@ setMethod("array_exists",
signature(x = "characterOrColumn", f = "function"),
function(x, f) {
invoke_higher_order_function(
- "ArrayExists",
+ "exists",
cols = list(x),
funs = list(f)
)
@@ -4145,7 +4130,7 @@ setMethod("array_filter",
signature(x = "characterOrColumn", f = "function"),
function(x, f) {
invoke_higher_order_function(
- "ArrayFilter",
+ "filter",
cols = list(x),
funs = list(f)
)
@@ -4161,7 +4146,7 @@ setMethod("array_forall",
signature(x = "characterOrColumn", f = "function"),
function(x, f) {
invoke_higher_order_function(
- "ArrayForAll",
+ "forall",
cols = list(x),
funs = list(f)
)
@@ -4291,7 +4276,7 @@ setMethod("array_sort",
column(callJStatic("org.apache.spark.sql.functions", "array_sort", x@jc))
} else {
invoke_higher_order_function(
- "ArraySort",
+ "array_sort",
cols = list(x),
funs = list(comparator)
)
@@ -4309,7 +4294,7 @@ setMethod("array_transform",
signature(x = "characterOrColumn", f = "function"),
function(x, f) {
invoke_higher_order_function(
- "ArrayTransform",
+ "transform",
cols = list(x),
funs = list(f)
)
@@ -4374,7 +4359,7 @@ setMethod("arrays_zip_with",
signature(x = "characterOrColumn", y = "characterOrColumn", f = "function"),
function(x, y, f) {
invoke_higher_order_function(
- "ZipWith",
+ "zip_with",
cols = list(x, y),
funs = list(f)
)
@@ -4447,7 +4432,7 @@ setMethod("map_filter",
signature(x = "characterOrColumn", f = "function"),
function(x, f) {
invoke_higher_order_function(
- "MapFilter",
+ "map_filter",
cols = list(x),
funs = list(f))
})
@@ -4504,7 +4489,7 @@ setMethod("transform_keys",
signature(x = "characterOrColumn", f = "function"),
function(x, f) {
invoke_higher_order_function(
- "TransformKeys",
+ "transform_keys",
cols = list(x),
funs = list(f)
)
@@ -4521,7 +4506,7 @@ setMethod("transform_values",
signature(x = "characterOrColumn", f = "function"),
function(x, f) {
invoke_higher_order_function(
- "TransformValues",
+ "transform_values",
cols = list(x),
funs = list(f)
)
@@ -4552,7 +4537,7 @@ setMethod("map_zip_with",
signature(x = "characterOrColumn", y = "characterOrColumn", f = "function"),
function(x, y, f) {
invoke_higher_order_function(
- "MapZipWith",
+ "map_zip_with",
cols = list(x, y),
funs = list(f)
)
diff --git a/R/pkg/R/sparkR.R b/R/pkg/R/sparkR.R
index 0be7e5da24d23..1b5faad376eaa 100644
--- a/R/pkg/R/sparkR.R
+++ b/R/pkg/R/sparkR.R
@@ -403,7 +403,6 @@ sparkR.session <- function(
sparkPackages = "",
enableHiveSupport = TRUE,
...) {
-
sparkConfigMap <- convertNamedListToEnv(sparkConfig)
namedParams <- list(...)
if (length(namedParams) > 0) {
diff --git a/connector/docker/spark-test/base/Dockerfile b/R/pkg/R/zzz.R
similarity index 64%
rename from connector/docker/spark-test/base/Dockerfile
rename to R/pkg/R/zzz.R
index c397abc211e24..947bd543b75e0 100644
--- a/connector/docker/spark-test/base/Dockerfile
+++ b/R/pkg/R/zzz.R
@@ -14,15 +14,17 @@
# See the License for the specific language governing permissions and
# limitations under the License.
#
+# zzz.R - package startup message
-FROM ubuntu:20.04
-
-# Upgrade package index
-# install a few other useful packages plus Open Java 17
-# Remove unneeded /var/lib/apt/lists/* after install to reduce the
-# docker image size (by ~30MB)
-RUN apt-get update && \
- apt-get install -y less openjdk-17-jre-headless iproute2 vim-tiny sudo openssh-server && \
- rm -rf /var/lib/apt/lists/*
-
-ENV SPARK_HOME /opt/spark
+.onAttach <- function(...) {
+ if (Sys.getenv("SPARKR_SUPPRESS_DEPRECATION_WARNING") == "") {
+ packageStartupMessage(
+ paste0(
+ "Warning: ",
+ "SparkR is deprecated in Apache Spark 4.0.0 and will be removed in a future release. ",
+ "To continue using Spark in R, we recommend using sparklyr instead: ",
+ "https://spark.posit.co/get-started/"
+ )
+ )
+ }
+}
diff --git a/R/pkg/README.md b/R/pkg/README.md
index da9f042b4fded..c05a75812245c 100644
--- a/R/pkg/README.md
+++ b/R/pkg/README.md
@@ -1,4 +1,4 @@
-# R on Spark
+# R on Spark (deprecated)
SparkR is an R package that provides a light-weight frontend to use Spark from R.
diff --git a/R/pkg/tests/fulltests/test_sparkSQL.R b/R/pkg/tests/fulltests/test_sparkSQL.R
index c44924e55087f..c93b92edbff8e 100644
--- a/R/pkg/tests/fulltests/test_sparkSQL.R
+++ b/R/pkg/tests/fulltests/test_sparkSQL.R
@@ -4152,7 +4152,8 @@ test_that("catalog APIs, listTables, getTable, listColumns, listFunctions, funct
c <- listColumns("cars")
expect_equal(nrow(c), 2)
expect_equal(colnames(c),
- c("name", "description", "dataType", "nullable", "isPartition", "isBucket"))
+ c("name", "description", "dataType", "nullable", "isPartition", "isBucket",
+ "isCluster"))
expect_equal(collect(c)[[1]][[1]], "speed")
expect_error(listColumns("zxwtyswklpf", "default"),
"[TABLE_OR_VIEW_NOT_FOUND]*`spark_catalog`.`default`.`zxwtyswklpf`*")
diff --git a/R/pkg/tests/fulltests/test_streaming.R b/R/pkg/tests/fulltests/test_streaming.R
index 88114f8bd82b8..74734746c129f 100644
--- a/R/pkg/tests/fulltests/test_streaming.R
+++ b/R/pkg/tests/fulltests/test_streaming.R
@@ -147,8 +147,7 @@ test_that("Unsupported operation", {
# memory sink without aggregation
df <- read.stream("json", path = jsonDir, schema = schema, maxFilesPerTrigger = 1)
expect_error(write.stream(df, "memory", queryName = "people", outputMode = "complete"),
- paste0(".*(start : analysis error - Complete output mode not supported when there ",
- "are no streaming aggregations on streaming DataFrames/Datasets).*"))
+ ".*analysis error.*complete.*not supported.*no streaming aggregations*")
})
test_that("Terminated by error", {
diff --git a/R/pkg/vignettes/sparkr-vignettes.Rmd b/R/pkg/vignettes/sparkr-vignettes.Rmd
index 4a510763afb6c..338b74110fb65 100644
--- a/R/pkg/vignettes/sparkr-vignettes.Rmd
+++ b/R/pkg/vignettes/sparkr-vignettes.Rmd
@@ -52,6 +52,8 @@ old_java_opt <- Sys.getenv("_JAVA_OPTIONS")
Sys.setenv("_JAVA_OPTIONS" = paste("-XX:-UsePerfData", old_java_opt, sep = " "))
```
+SparkR is deprecated from Apache Spark 4.0.0 and will be removed in a future version.
+
## Overview
SparkR is an R package that provides a light-weight frontend to use Apache Spark from R. With Spark `r packageVersion("SparkR")`, SparkR provides a distributed data frame implementation that supports data processing operations like selection, filtering, aggregation etc. and distributed machine learning using [MLlib](https://spark.apache.org/mllib/).
diff --git a/R/run-tests.sh b/R/run-tests.sh
index 90a60eda03871..3a90b44c2b659 100755
--- a/R/run-tests.sh
+++ b/R/run-tests.sh
@@ -30,9 +30,9 @@ if [[ $(echo $SPARK_AVRO_JAR_PATH | wc -l) -eq 1 ]]; then
fi
if [ -z "$SPARK_JARS" ]; then
- SPARK_TESTING=1 NOT_CRAN=true $FWDIR/../bin/spark-submit --driver-java-options "-Dlog4j.configurationFile=file:$FWDIR/log4j2.properties" --conf spark.hadoop.fs.defaultFS="file:///" --conf spark.driver.extraJavaOptions="-Dio.netty.tryReflectionSetAccessible=true -Xss4M" --conf spark.executor.extraJavaOptions="-Dio.netty.tryReflectionSetAccessible=true -Xss4M" $FWDIR/pkg/tests/run-all.R 2>&1 | tee -a $LOGFILE
+ SPARKR_SUPPRESS_DEPRECATION_WARNING=1 SPARK_TESTING=1 NOT_CRAN=true $FWDIR/../bin/spark-submit --driver-java-options "-Dlog4j.configurationFile=file:$FWDIR/log4j2.properties" --conf spark.hadoop.fs.defaultFS="file:///" --conf spark.driver.extraJavaOptions="-Dio.netty.tryReflectionSetAccessible=true -Xss4M" --conf spark.executor.extraJavaOptions="-Dio.netty.tryReflectionSetAccessible=true -Xss4M" $FWDIR/pkg/tests/run-all.R 2>&1 | tee -a $LOGFILE
else
- SPARK_TESTING=1 NOT_CRAN=true $FWDIR/../bin/spark-submit --jars $SPARK_JARS --driver-java-options "-Dlog4j.configurationFile=file:$FWDIR/log4j2.properties" --conf spark.hadoop.fs.defaultFS="file:///" --conf spark.driver.extraJavaOptions="-Dio.netty.tryReflectionSetAccessible=true -Xss4M" --conf spark.executor.extraJavaOptions="-Dio.netty.tryReflectionSetAccessible=true -Xss4M" $FWDIR/pkg/tests/run-all.R 2>&1 | tee -a $LOGFILE
+ SPARKR_SUPPRESS_DEPRECATION_WARNING=1 SPARK_TESTING=1 NOT_CRAN=true $FWDIR/../bin/spark-submit --jars $SPARK_JARS --driver-java-options "-Dlog4j.configurationFile=file:$FWDIR/log4j2.properties" --conf spark.hadoop.fs.defaultFS="file:///" --conf spark.driver.extraJavaOptions="-Dio.netty.tryReflectionSetAccessible=true -Xss4M" --conf spark.executor.extraJavaOptions="-Dio.netty.tryReflectionSetAccessible=true -Xss4M" $FWDIR/pkg/tests/run-all.R 2>&1 | tee -a $LOGFILE
fi
FAILED=$((PIPESTATUS[0]||$FAILED))
diff --git a/README.md b/README.md
index b9a20075f6a17..552b71215cb92 100644
--- a/README.md
+++ b/README.md
@@ -7,7 +7,8 @@ rich set of higher-level tools including Spark SQL for SQL and DataFrames,
pandas API on Spark for pandas workloads, MLlib for machine learning, GraphX for graph processing,
and Structured Streaming for stream processing.
-
+- Official version:
+- Development version:
[![GitHub Actions Build](https://github.com/apache/spark/actions/workflows/build_main.yml/badge.svg)](https://github.com/apache/spark/actions/workflows/build_main.yml)
[![PySpark Coverage](https://codecov.io/gh/apache/spark/branch/master/graph/badge.svg)](https://codecov.io/gh/apache/spark)
diff --git a/assembly/README b/assembly/README
index ad1305c5b4d56..10c8254ae153e 100644
--- a/assembly/README
+++ b/assembly/README
@@ -9,4 +9,4 @@ This module is off by default. To activate it specify the profile in the command
If you need to build an assembly for a different version of Hadoop the
hadoop-version system property needs to be set as in this example:
- -Dhadoop.version=3.4.0
+ -Dhadoop.version=3.4.1
diff --git a/assembly/pom.xml b/assembly/pom.xml
index 9377849cf1cdc..17bb81fa023ba 100644
--- a/assembly/pom.xml
+++ b/assembly/pom.xml
@@ -109,11 +109,27 @@
${project.version}provided
+
+
+ org.apache.spark
+ spark-connect-client-jvm_${scala.binary.version}
+ ${project.version}
+
+
+ org.apache.spark
+ spark-connect-shims_${scala.binary.version}
+
+
+ provided
+ com.google.guava
@@ -159,6 +175,44 @@
+
+
+ org.codehaus.mojo
+ exec-maven-plugin
+
+
+ copy-connect-client-repl-jars
+ package
+
+ exec
+
+
+ cp
+
+ -r
+ ${basedir}/../connector/connect/client/jvm/target/connect-repl
+ ${basedir}/target/scala-${scala.binary.version}/jars/
+
+
+
+
+ copy-connect-client-jar
+ package
+
+ exec
+
+
+ cp
+
+ ${basedir}/../connector/connect/client/jvm/target/spark-connect-client-jvm_${scala.binary.version}-${project.version}.jar
+ ${basedir}/target/scala-${scala.binary.version}/jars/connect-repl
+
+
+
+
+
@@ -291,6 +345,14 @@
+
+
+ jjwt
+
+ compile
+
+
+
diff --git a/bin/docker-image-tool.sh b/bin/docker-image-tool.sh
index a137a2fba52ee..efbd63a3037d6 100755
--- a/bin/docker-image-tool.sh
+++ b/bin/docker-image-tool.sh
@@ -261,18 +261,18 @@ Examples:
$0 -m -t testing build
- Build PySpark docker image
- $0 -r docker.io/myrepo -t v3.4.0 -p kubernetes/dockerfiles/spark/bindings/python/Dockerfile build
+ $0 -r docker.io/myrepo -t v4.0.0 -p kubernetes/dockerfiles/spark/bindings/python/Dockerfile build
- - Build and push image with tag "v3.4.0" to docker.io/myrepo
- $0 -r docker.io/myrepo -t v3.4.0 build
- $0 -r docker.io/myrepo -t v3.4.0 push
+ - Build and push image with tag "v4.0.0" to docker.io/myrepo
+ $0 -r docker.io/myrepo -t v4.0.0 build
+ $0 -r docker.io/myrepo -t v4.0.0 push
- - Build and push Java11-based image with tag "v3.4.0" to docker.io/myrepo
- $0 -r docker.io/myrepo -t v3.4.0 -b java_image_tag=11-jre build
- $0 -r docker.io/myrepo -t v3.4.0 push
+ - Build and push Java17-based image with tag "v4.0.0" to docker.io/myrepo
+ $0 -r docker.io/myrepo -t v4.0.0 -b java_image_tag=17 build
+ $0 -r docker.io/myrepo -t v4.0.0 push
- Build and push image for multiple archs to docker.io/myrepo
- $0 -r docker.io/myrepo -t v3.4.0 -X build
+ $0 -r docker.io/myrepo -t v4.0.0 -X build
# Note: buildx, which does cross building, needs to do the push during build
# So there is no separate push step with -X
diff --git a/bin/spark-shell b/bin/spark-shell
index e920137974980..8411158187260 100755
--- a/bin/spark-shell
+++ b/bin/spark-shell
@@ -34,7 +34,7 @@ fi
export _SPARK_CMD_USAGE="Usage: ./bin/spark-shell [options]
-Scala REPL options:
+Scala REPL options, Spark Classic only:
-I preload , enforcing line-by-line interpretation"
# SPARK-4161: scala does not assume use of the java classpath,
@@ -45,6 +45,9 @@ Scala REPL options:
SPARK_SUBMIT_OPTS="$SPARK_SUBMIT_OPTS -Dscala.usejavacp=true"
function main() {
+ export SPARK_SCALA_SHELL=1
+ # In case of Spark Connect shell, the main class (and resource) is replaced in
+ # SparkSubmitCommandBuilder.
if $cygwin; then
# Workaround for issue involving JLine and Cygwin
# (see http://sourceforge.net/p/jline/bugs/40/).
diff --git a/binder/Dockerfile b/binder/Dockerfile
new file mode 100644
index 0000000000000..6e3dd9155fb7a
--- /dev/null
+++ b/binder/Dockerfile
@@ -0,0 +1,43 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+FROM python:3.10-slim
+# install the notebook package
+RUN pip install --no-cache notebook jupyterlab
+
+# create user with a home directory
+ARG NB_USER
+ARG NB_UID
+ENV USER ${NB_USER}
+ENV HOME /home/${NB_USER}
+
+RUN adduser --disabled-password \
+ --gecos "Default user" \
+ --uid ${NB_UID} \
+ ${NB_USER}
+WORKDIR ${HOME}
+USER ${USER}
+
+# Make sure the contents of our repo are in ${HOME}
+COPY . ${HOME}
+USER root
+RUN chown -R ${NB_UID} ${HOME}
+RUN apt-get update && apt-get install -y openjdk-17-jre git coreutils
+USER ${NB_USER}
+
+RUN binder/postBuild
+
diff --git a/binder/apt.txt b/binder/apt.txt
deleted file mode 100644
index aa441a15db3bd..0000000000000
--- a/binder/apt.txt
+++ /dev/null
@@ -1,2 +0,0 @@
-openjdk-17-jre
-git
diff --git a/binder/postBuild b/binder/postBuild
index 26e23b5a940ae..0b326f62e8f51 100644
--- a/binder/postBuild
+++ b/binder/postBuild
@@ -26,7 +26,7 @@ set -o pipefail
set -e
VERSION=$(python -c "exec(open('python/pyspark/version.py').read()); print(__version__)")
-TAG=$(git describe --tags --exact-match 2>/dev/null)
+TAG=$(git describe --tags --exact-match 2> /dev/null || true)
# If a commit is tagged, exactly specified version of pyspark should be installed to avoid
# a kind of accident that an old version of pyspark is installed in the live notebook environment.
diff --git a/build/mvn b/build/mvn
index 3735461562e54..fef589fc03476 100755
--- a/build/mvn
+++ b/build/mvn
@@ -56,9 +56,9 @@ install_app() {
local binary="${_DIR}/$6"
local remote_tarball="${mirror_host}/${url_path}${url_query}"
local local_checksum="${local_tarball}.${checksum_suffix}"
- local remote_checksum="https://archive.apache.org/dist/${url_path}.${checksum_suffix}"
+ local remote_checksum="${mirror_host}/${url_path}.${checksum_suffix}${url_query}"
- local curl_opts="--silent --show-error -L"
+ local curl_opts="--retry 3 --silent --show-error -L"
local wget_opts="--no-verbose"
if [ ! -f "$binary" ]; then
@@ -115,6 +115,10 @@ function version { echo "$@" | awk -F. '{ printf("%03d%03d%03d\n", $1,$2,$3); }'
# install maven under the build/ folder if needed.
install_mvn() {
local MVN_VERSION=`grep "" "${_DIR}/../pom.xml" | head -n1 | awk -F '[<>]' '{print $3}'`
+ MVN_BIN="${_DIR}/apache-maven-${MVN_VERSION}/bin/mvn"
+ if [ -f "$MVN_BIN" ]; then
+ return
+ fi
MVN_BIN="$(command -v mvn)"
if [ "$MVN_BIN" ]; then
local MVN_DETECTED_VERSION="$(mvn --version | head -n1 | awk '{print $3}')"
diff --git a/common/kvstore/src/main/java/org/apache/spark/util/kvstore/LevelDB.java b/common/kvstore/src/main/java/org/apache/spark/util/kvstore/LevelDB.java
index 13a9d89f4705c..7f8d6c58aec7e 100644
--- a/common/kvstore/src/main/java/org/apache/spark/util/kvstore/LevelDB.java
+++ b/common/kvstore/src/main/java/org/apache/spark/util/kvstore/LevelDB.java
@@ -255,7 +255,8 @@ public Iterator iterator() {
iteratorTracker.add(new WeakReference<>(it));
return it;
} catch (Exception e) {
- throw Throwables.propagate(e);
+ Throwables.throwIfUnchecked(e);
+ throw new RuntimeException(e);
}
}
};
diff --git a/common/kvstore/src/main/java/org/apache/spark/util/kvstore/LevelDBIterator.java b/common/kvstore/src/main/java/org/apache/spark/util/kvstore/LevelDBIterator.java
index 69757fdc65d68..29ed37ffa44e5 100644
--- a/common/kvstore/src/main/java/org/apache/spark/util/kvstore/LevelDBIterator.java
+++ b/common/kvstore/src/main/java/org/apache/spark/util/kvstore/LevelDBIterator.java
@@ -127,7 +127,7 @@ public boolean hasNext() {
try {
close();
} catch (IOException ioe) {
- throw Throwables.propagate(ioe);
+ throw new RuntimeException(ioe);
}
}
return next != null;
@@ -151,7 +151,8 @@ public T next() {
next = null;
return ret;
} catch (Exception e) {
- throw Throwables.propagate(e);
+ Throwables.throwIfUnchecked(e);
+ throw new RuntimeException(e);
}
}
diff --git a/common/kvstore/src/main/java/org/apache/spark/util/kvstore/RocksDB.java b/common/kvstore/src/main/java/org/apache/spark/util/kvstore/RocksDB.java
index dc7ad0be5c007..4bc2b233fe12d 100644
--- a/common/kvstore/src/main/java/org/apache/spark/util/kvstore/RocksDB.java
+++ b/common/kvstore/src/main/java/org/apache/spark/util/kvstore/RocksDB.java
@@ -287,7 +287,8 @@ public Iterator iterator() {
iteratorTracker.add(new WeakReference<>(it));
return it;
} catch (Exception e) {
- throw Throwables.propagate(e);
+ Throwables.throwIfUnchecked(e);
+ throw new RuntimeException(e);
}
}
};
diff --git a/common/kvstore/src/main/java/org/apache/spark/util/kvstore/RocksDBIterator.java b/common/kvstore/src/main/java/org/apache/spark/util/kvstore/RocksDBIterator.java
index a98b0482e35cc..e350ddc2d445a 100644
--- a/common/kvstore/src/main/java/org/apache/spark/util/kvstore/RocksDBIterator.java
+++ b/common/kvstore/src/main/java/org/apache/spark/util/kvstore/RocksDBIterator.java
@@ -113,7 +113,7 @@ public boolean hasNext() {
try {
close();
} catch (IOException ioe) {
- throw Throwables.propagate(ioe);
+ throw new RuntimeException(ioe);
}
}
return next != null;
@@ -137,7 +137,8 @@ public T next() {
next = null;
return ret;
} catch (Exception e) {
- throw Throwables.propagate(e);
+ Throwables.throwIfUnchecked(e);
+ throw new RuntimeException(e);
}
}
diff --git a/common/network-common/pom.xml b/common/network-common/pom.xml
index cdb5bd72158a1..cbe4836b58da5 100644
--- a/common/network-common/pom.xml
+++ b/common/network-common/pom.xml
@@ -194,6 +194,16 @@
mockito-coretest
+
+ net.bytebuddy
+ byte-buddy
+ test
+
+
+ net.bytebuddy
+ byte-buddy-agent
+ test
+
diff --git a/common/network-common/src/main/java/org/apache/spark/network/client/TransportClient.java b/common/network-common/src/main/java/org/apache/spark/network/client/TransportClient.java
index 4c144a73a9299..a9df47645d36f 100644
--- a/common/network-common/src/main/java/org/apache/spark/network/client/TransportClient.java
+++ b/common/network-common/src/main/java/org/apache/spark/network/client/TransportClient.java
@@ -290,9 +290,11 @@ public void onFailure(Throwable e) {
try {
return result.get(timeoutMs, TimeUnit.MILLISECONDS);
} catch (ExecutionException e) {
- throw Throwables.propagate(e.getCause());
+ Throwables.throwIfUnchecked(e.getCause());
+ throw new RuntimeException(e.getCause());
} catch (Exception e) {
- throw Throwables.propagate(e);
+ Throwables.throwIfUnchecked(e);
+ throw new RuntimeException(e);
}
}
diff --git a/common/network-common/src/main/java/org/apache/spark/network/client/TransportClientFactory.java b/common/network-common/src/main/java/org/apache/spark/network/client/TransportClientFactory.java
index e1f19f956cc0a..d64b8c8f838e9 100644
--- a/common/network-common/src/main/java/org/apache/spark/network/client/TransportClientFactory.java
+++ b/common/network-common/src/main/java/org/apache/spark/network/client/TransportClientFactory.java
@@ -342,7 +342,8 @@ public void operationComplete(final Future handshakeFuture) {
logger.error("Exception while bootstrapping client after {} ms", e,
MDC.of(LogKeys.BOOTSTRAP_TIME$.MODULE$, bootstrapTimeMs));
client.close();
- throw Throwables.propagate(e);
+ Throwables.throwIfUnchecked(e);
+ throw new RuntimeException(e);
}
long postBootstrap = System.nanoTime();
diff --git a/common/network-common/src/main/java/org/apache/spark/network/crypto/AuthClientBootstrap.java b/common/network-common/src/main/java/org/apache/spark/network/crypto/AuthClientBootstrap.java
index 08e2c084fe67b..2e9ccd0e0ad21 100644
--- a/common/network-common/src/main/java/org/apache/spark/network/crypto/AuthClientBootstrap.java
+++ b/common/network-common/src/main/java/org/apache/spark/network/crypto/AuthClientBootstrap.java
@@ -22,7 +22,6 @@
import java.security.GeneralSecurityException;
import java.util.concurrent.TimeoutException;
-import com.google.common.base.Throwables;
import io.netty.buffer.ByteBuf;
import io.netty.buffer.Unpooled;
import io.netty.channel.Channel;
@@ -80,7 +79,7 @@ public void doBootstrap(TransportClient client, Channel channel) {
doSparkAuth(client, channel);
client.setClientId(appId);
} catch (GeneralSecurityException | IOException e) {
- throw Throwables.propagate(e);
+ throw new RuntimeException(e);
} catch (RuntimeException e) {
// There isn't a good exception that can be caught here to know whether it's really
// OK to switch back to SASL (because the server doesn't speak the new protocol). So
diff --git a/common/network-common/src/main/java/org/apache/spark/network/crypto/AuthRpcHandler.java b/common/network-common/src/main/java/org/apache/spark/network/crypto/AuthRpcHandler.java
index 65367743e24f9..087e3d21e22bb 100644
--- a/common/network-common/src/main/java/org/apache/spark/network/crypto/AuthRpcHandler.java
+++ b/common/network-common/src/main/java/org/apache/spark/network/crypto/AuthRpcHandler.java
@@ -132,7 +132,8 @@ protected boolean doAuthChallenge(
try {
engine.close();
} catch (Exception e) {
- throw Throwables.propagate(e);
+ Throwables.throwIfUnchecked(e);
+ throw new RuntimeException(e);
}
}
}
diff --git a/common/network-common/src/main/java/org/apache/spark/network/crypto/TransportCipher.java b/common/network-common/src/main/java/org/apache/spark/network/crypto/TransportCipher.java
index 355c552720185..33494aee4444d 100644
--- a/common/network-common/src/main/java/org/apache/spark/network/crypto/TransportCipher.java
+++ b/common/network-common/src/main/java/org/apache/spark/network/crypto/TransportCipher.java
@@ -17,32 +17,12 @@
package org.apache.spark.network.crypto;
-import com.google.common.annotations.VisibleForTesting;
-import com.google.crypto.tink.subtle.Hex;
-import com.google.crypto.tink.subtle.Hkdf;
import io.netty.channel.Channel;
-import javax.crypto.spec.SecretKeySpec;
import java.io.IOException;
-import java.nio.charset.StandardCharsets;
import java.security.GeneralSecurityException;
interface TransportCipher {
String getKeyId() throws GeneralSecurityException;
void addToChannel(Channel channel) throws IOException, GeneralSecurityException;
}
-
-class TransportCipherUtil {
- /*
- * This method is used for testing to verify key derivation.
- */
- @VisibleForTesting
- static String getKeyId(SecretKeySpec key) throws GeneralSecurityException {
- byte[] keyIdBytes = Hkdf.computeHkdf("HmacSha256",
- key.getEncoded(),
- null,
- "keyID".getBytes(StandardCharsets.UTF_8),
- 32);
- return Hex.encode(keyIdBytes);
- }
-}
diff --git a/common/network-common/src/main/java/org/apache/spark/network/crypto/TransportCipherUtil.java b/common/network-common/src/main/java/org/apache/spark/network/crypto/TransportCipherUtil.java
new file mode 100644
index 0000000000000..1df2732f240cc
--- /dev/null
+++ b/common/network-common/src/main/java/org/apache/spark/network/crypto/TransportCipherUtil.java
@@ -0,0 +1,41 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.network.crypto;
+
+import java.nio.charset.StandardCharsets;
+import java.security.GeneralSecurityException;
+import javax.crypto.spec.SecretKeySpec;
+
+import com.google.common.annotations.VisibleForTesting;
+import com.google.crypto.tink.subtle.Hex;
+import com.google.crypto.tink.subtle.Hkdf;
+
+class TransportCipherUtil {
+ /**
+ * This method is used for testing to verify key derivation.
+ */
+ @VisibleForTesting
+ static String getKeyId(SecretKeySpec key) throws GeneralSecurityException {
+ byte[] keyIdBytes = Hkdf.computeHkdf("HmacSha256",
+ key.getEncoded(),
+ null,
+ "keyID".getBytes(StandardCharsets.UTF_8),
+ 32);
+ return Hex.encode(keyIdBytes);
+ }
+}
diff --git a/common/network-common/src/main/java/org/apache/spark/network/sasl/SparkSaslClient.java b/common/network-common/src/main/java/org/apache/spark/network/sasl/SparkSaslClient.java
index 3600c1045dbf4..a61b1c3c0c416 100644
--- a/common/network-common/src/main/java/org/apache/spark/network/sasl/SparkSaslClient.java
+++ b/common/network-common/src/main/java/org/apache/spark/network/sasl/SparkSaslClient.java
@@ -29,7 +29,6 @@
import javax.security.sasl.SaslClient;
import javax.security.sasl.SaslException;
-import com.google.common.base.Throwables;
import com.google.common.collect.ImmutableMap;
import org.apache.spark.internal.SparkLogger;
@@ -62,7 +61,7 @@ public SparkSaslClient(String secretKeyId, SecretKeyHolder secretKeyHolder, bool
this.saslClient = Sasl.createSaslClient(new String[] { DIGEST }, null, null, DEFAULT_REALM,
saslProps, new ClientCallbackHandler());
} catch (SaslException e) {
- throw Throwables.propagate(e);
+ throw new RuntimeException(e);
}
}
@@ -72,7 +71,7 @@ public synchronized byte[] firstToken() {
try {
return saslClient.evaluateChallenge(new byte[0]);
} catch (SaslException e) {
- throw Throwables.propagate(e);
+ throw new RuntimeException(e);
}
} else {
return new byte[0];
@@ -98,7 +97,7 @@ public synchronized byte[] response(byte[] token) {
try {
return saslClient != null ? saslClient.evaluateChallenge(token) : new byte[0];
} catch (SaslException e) {
- throw Throwables.propagate(e);
+ throw new RuntimeException(e);
}
}
diff --git a/common/network-common/src/main/java/org/apache/spark/network/sasl/SparkSaslServer.java b/common/network-common/src/main/java/org/apache/spark/network/sasl/SparkSaslServer.java
index b897650afe832..f32fd5145c7c5 100644
--- a/common/network-common/src/main/java/org/apache/spark/network/sasl/SparkSaslServer.java
+++ b/common/network-common/src/main/java/org/apache/spark/network/sasl/SparkSaslServer.java
@@ -31,7 +31,6 @@
import java.util.Map;
import com.google.common.base.Preconditions;
-import com.google.common.base.Throwables;
import com.google.common.collect.ImmutableMap;
import io.netty.buffer.ByteBuf;
import io.netty.buffer.Unpooled;
@@ -94,7 +93,7 @@ public SparkSaslServer(
this.saslServer = Sasl.createSaslServer(DIGEST, null, DEFAULT_REALM, saslProps,
new DigestCallbackHandler());
} catch (SaslException e) {
- throw Throwables.propagate(e);
+ throw new RuntimeException(e);
}
}
@@ -119,7 +118,7 @@ public synchronized byte[] response(byte[] token) {
try {
return saslServer != null ? saslServer.evaluateResponse(token) : new byte[0];
} catch (SaslException e) {
- throw Throwables.propagate(e);
+ throw new RuntimeException(e);
}
}
diff --git a/common/network-common/src/main/java/org/apache/spark/network/shuffledb/LevelDBIterator.java b/common/network-common/src/main/java/org/apache/spark/network/shuffledb/LevelDBIterator.java
index 5796e34a6f05e..2ac549775449a 100644
--- a/common/network-common/src/main/java/org/apache/spark/network/shuffledb/LevelDBIterator.java
+++ b/common/network-common/src/main/java/org/apache/spark/network/shuffledb/LevelDBIterator.java
@@ -17,8 +17,6 @@
package org.apache.spark.network.shuffledb;
-import com.google.common.base.Throwables;
-
import java.io.IOException;
import java.util.Map;
import java.util.NoSuchElementException;
@@ -47,7 +45,7 @@ public boolean hasNext() {
try {
close();
} catch (IOException ioe) {
- throw Throwables.propagate(ioe);
+ throw new RuntimeException(ioe);
}
}
return next != null;
diff --git a/common/network-common/src/main/java/org/apache/spark/network/shuffledb/RocksDB.java b/common/network-common/src/main/java/org/apache/spark/network/shuffledb/RocksDB.java
index d33895d6c2d62..2737ab8ed754c 100644
--- a/common/network-common/src/main/java/org/apache/spark/network/shuffledb/RocksDB.java
+++ b/common/network-common/src/main/java/org/apache/spark/network/shuffledb/RocksDB.java
@@ -19,7 +19,6 @@
import java.io.IOException;
-import com.google.common.base.Throwables;
import org.rocksdb.RocksDBException;
/**
@@ -37,7 +36,7 @@ public void put(byte[] key, byte[] value) {
try {
db.put(key, value);
} catch (RocksDBException e) {
- throw Throwables.propagate(e);
+ throw new RuntimeException(e);
}
}
@@ -46,7 +45,7 @@ public byte[] get(byte[] key) {
try {
return db.get(key);
} catch (RocksDBException e) {
- throw Throwables.propagate(e);
+ throw new RuntimeException(e);
}
}
@@ -55,7 +54,7 @@ public void delete(byte[] key) {
try {
db.delete(key);
} catch (RocksDBException e) {
- throw Throwables.propagate(e);
+ throw new RuntimeException(e);
}
}
diff --git a/common/network-common/src/main/java/org/apache/spark/network/shuffledb/RocksDBIterator.java b/common/network-common/src/main/java/org/apache/spark/network/shuffledb/RocksDBIterator.java
index 78562f91a4b75..829a7ded6330b 100644
--- a/common/network-common/src/main/java/org/apache/spark/network/shuffledb/RocksDBIterator.java
+++ b/common/network-common/src/main/java/org/apache/spark/network/shuffledb/RocksDBIterator.java
@@ -22,7 +22,6 @@
import java.util.Map;
import java.util.NoSuchElementException;
-import com.google.common.base.Throwables;
import org.rocksdb.RocksIterator;
/**
@@ -52,7 +51,7 @@ public boolean hasNext() {
try {
close();
} catch (IOException ioe) {
- throw Throwables.propagate(ioe);
+ throw new RuntimeException(ioe);
}
}
return next != null;
diff --git a/common/network-shuffle/pom.xml b/common/network-shuffle/pom.xml
index 0f7036ef746cc..49e6e08476151 100644
--- a/common/network-shuffle/pom.xml
+++ b/common/network-shuffle/pom.xml
@@ -113,6 +113,16 @@
mockito-coretest
+
+ net.bytebuddy
+ byte-buddy
+ test
+
+
+ net.bytebuddy
+ byte-buddy-agent
+ test
+ commons-iocommons-io
diff --git a/common/network-shuffle/src/main/java/org/apache/spark/network/shuffle/checksum/ShuffleChecksumHelper.java b/common/network-shuffle/src/main/java/org/apache/spark/network/shuffle/checksum/ShuffleChecksumHelper.java
index f9c0c60c2f2c6..62fcda701d948 100644
--- a/common/network-shuffle/src/main/java/org/apache/spark/network/shuffle/checksum/ShuffleChecksumHelper.java
+++ b/common/network-shuffle/src/main/java/org/apache/spark/network/shuffle/checksum/ShuffleChecksumHelper.java
@@ -19,10 +19,7 @@
import java.io.*;
import java.util.concurrent.TimeUnit;
-import java.util.zip.Adler32;
-import java.util.zip.CRC32;
-import java.util.zip.CheckedInputStream;
-import java.util.zip.Checksum;
+import java.util.zip.*;
import com.google.common.io.ByteStreams;
@@ -66,6 +63,13 @@ private static Checksum[] getChecksumsByAlgorithm(int num, String algorithm) {
}
}
+ case "CRC32C" -> {
+ checksums = new CRC32C[num];
+ for (int i = 0; i < num; i++) {
+ checksums[i] = new CRC32C();
+ }
+ }
+
default -> throw new UnsupportedOperationException(
"Unsupported shuffle checksum algorithm: " + algorithm);
}
diff --git a/common/unsafe/pom.xml b/common/unsafe/pom.xml
index a5ef9847859a7..cf15301273303 100644
--- a/common/unsafe/pom.xml
+++ b/common/unsafe/pom.xml
@@ -104,6 +104,16 @@
mockito-coretest
+
+ net.bytebuddy
+ byte-buddy
+ test
+
+
+ net.bytebuddy
+ byte-buddy-agent
+ test
+ org.scalacheckscalacheck_${scala.binary.version}
diff --git a/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java b/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java
index b9868ca665a65..97c8bbe562aff 100644
--- a/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java
+++ b/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java
@@ -17,6 +17,7 @@
package org.apache.spark.sql.catalyst.util;
import com.ibm.icu.lang.UCharacter;
+import com.ibm.icu.lang.UProperty;
import com.ibm.icu.text.BreakIterator;
import com.ibm.icu.text.Collator;
import com.ibm.icu.text.RuleBasedCollator;
@@ -26,15 +27,15 @@
import org.apache.spark.unsafe.UTF8StringBuilder;
import org.apache.spark.unsafe.types.UTF8String;
-import static org.apache.spark.unsafe.Platform.BYTE_ARRAY_OFFSET;
-import static org.apache.spark.unsafe.Platform.copyMemory;
import static org.apache.spark.unsafe.types.UTF8String.CodePointIteratorType;
import java.text.CharacterIterator;
import java.text.StringCharacterIterator;
+import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
+import java.util.List;
import java.util.Map;
/**
@@ -48,19 +49,28 @@ public class CollationAwareUTF8String {
*/
private static final int MATCH_NOT_FOUND = -1;
+ /**
+ * `COMBINED_ASCII_SMALL_I_COMBINING_DOT` is an internal representation of the combined
+ * lowercase code point for ASCII lowercase letter i with an additional combining dot character
+ * (U+0307). This integer value is not a valid code point itself, but rather an artificial code
+ * point marker used to represent the two lowercase characters that are the result of converting
+ * the uppercase Turkish dotted letter I with a combining dot character (U+0130) to lowercase.
+ */
+ private static final int COMBINED_ASCII_SMALL_I_COMBINING_DOT =
+ SpecialCodePointConstants.ASCII_SMALL_I << 16 | SpecialCodePointConstants.COMBINING_DOT;
+
/**
* Returns whether the target string starts with the specified prefix, starting from the
* specified position (0-based index referring to character position in UTF8String), with respect
- * to the UTF8_LCASE collation. The method assumes that the prefix is already lowercased
- * prior to method call to avoid the overhead of calling .toLowerCase() multiple times on the
- * same prefix string.
+ * to the UTF8_LCASE collation. The method assumes that the prefix is already lowercased prior
+ * to method call to avoid the overhead of lowercasing the same prefix string multiple times.
*
* @param target the string to be searched in
* @param lowercasePattern the string to be searched for
* @param startPos the start position for searching (in the target string)
* @return whether the target string starts with the specified prefix in UTF8_LCASE
*/
- public static boolean lowercaseMatchFrom(
+ private static boolean lowercaseMatchFrom(
final UTF8String target,
final UTF8String lowercasePattern,
int startPos) {
@@ -86,12 +96,44 @@ private static int lowercaseMatchLengthFrom(
final UTF8String lowercasePattern,
int startPos) {
assert startPos >= 0;
- for (int len = 0; len <= target.numChars() - startPos; ++len) {
- if (target.substring(startPos, startPos + len).toLowerCase().equals(lowercasePattern)) {
- return len;
+ // Use code point iterators for efficient string search.
+ Iterator targetIterator = target.codePointIterator();
+ Iterator patternIterator = lowercasePattern.codePointIterator();
+ // Skip to startPos in the target string.
+ for (int i = 0; i < startPos; ++i) {
+ if (targetIterator.hasNext()) {
+ targetIterator.next();
+ } else {
+ return MATCH_NOT_FOUND;
}
}
- return MATCH_NOT_FOUND;
+ // Compare the characters in the target and pattern strings.
+ int matchLength = 0, codePointBuffer = -1, targetCodePoint, patternCodePoint;
+ while ((targetIterator.hasNext() || codePointBuffer != -1) && patternIterator.hasNext()) {
+ if (codePointBuffer != -1) {
+ targetCodePoint = codePointBuffer;
+ codePointBuffer = -1;
+ } else {
+ // Use buffered lowercase code point iteration to handle one-to-many case mappings.
+ targetCodePoint = getLowercaseCodePoint(targetIterator.next());
+ if (targetCodePoint == COMBINED_ASCII_SMALL_I_COMBINING_DOT) {
+ targetCodePoint = SpecialCodePointConstants.ASCII_SMALL_I;
+ codePointBuffer = SpecialCodePointConstants.COMBINING_DOT;
+ }
+ ++matchLength;
+ }
+ patternCodePoint = patternIterator.next();
+ if (targetCodePoint != patternCodePoint) {
+ return MATCH_NOT_FOUND;
+ }
+ }
+ // If the pattern string has more characters, or the match is found at the middle of a
+ // character that maps to multiple characters in lowercase, then match is not found.
+ if (patternIterator.hasNext() || codePointBuffer != -1) {
+ return MATCH_NOT_FOUND;
+ }
+ // If all characters are equal, return the length of the match in the target string.
+ return matchLength;
}
/**
@@ -123,15 +165,14 @@ private static int lowercaseFind(
* Returns whether the target string ends with the specified suffix, ending at the specified
* position (0-based index referring to character position in UTF8String), with respect to the
* UTF8_LCASE collation. The method assumes that the suffix is already lowercased prior
- * to method call to avoid the overhead of calling .toLowerCase() multiple times on the same
- * suffix string.
+ * to method call to avoid the overhead of lowercasing the same suffix string multiple times.
*
* @param target the string to be searched in
* @param lowercasePattern the string to be searched for
* @param endPos the end position for searching (in the target string)
* @return whether the target string ends with the specified suffix in lowercase
*/
- public static boolean lowercaseMatchUntil(
+ private static boolean lowercaseMatchUntil(
final UTF8String target,
final UTF8String lowercasePattern,
int endPos) {
@@ -156,13 +197,45 @@ private static int lowercaseMatchLengthUntil(
final UTF8String target,
final UTF8String lowercasePattern,
int endPos) {
- assert endPos <= target.numChars();
- for (int len = 0; len <= endPos; ++len) {
- if (target.substring(endPos - len, endPos).toLowerCase().equals(lowercasePattern)) {
- return len;
+ assert endPos >= 0;
+ // Use code point iterators for efficient string search.
+ Iterator targetIterator = target.reverseCodePointIterator();
+ Iterator patternIterator = lowercasePattern.reverseCodePointIterator();
+ // Skip to startPos in the target string.
+ for (int i = endPos; i < target.numChars(); ++i) {
+ if (targetIterator.hasNext()) {
+ targetIterator.next();
+ } else {
+ return MATCH_NOT_FOUND;
}
}
- return MATCH_NOT_FOUND;
+ // Compare the characters in the target and pattern strings.
+ int matchLength = 0, codePointBuffer = -1, targetCodePoint, patternCodePoint;
+ while ((targetIterator.hasNext() || codePointBuffer != -1) && patternIterator.hasNext()) {
+ if (codePointBuffer != -1) {
+ targetCodePoint = codePointBuffer;
+ codePointBuffer = -1;
+ } else {
+ // Use buffered lowercase code point iteration to handle one-to-many case mappings.
+ targetCodePoint = getLowercaseCodePoint(targetIterator.next());
+ if (targetCodePoint == COMBINED_ASCII_SMALL_I_COMBINING_DOT) {
+ targetCodePoint = SpecialCodePointConstants.COMBINING_DOT;
+ codePointBuffer = SpecialCodePointConstants.ASCII_SMALL_I;
+ }
+ ++matchLength;
+ }
+ patternCodePoint = patternIterator.next();
+ if (targetCodePoint != patternCodePoint) {
+ return MATCH_NOT_FOUND;
+ }
+ }
+ // If the pattern string has more characters, or the match is found at the middle of a
+ // character that maps to multiple characters in lowercase, then match is not found.
+ if (patternIterator.hasNext() || codePointBuffer != -1) {
+ return MATCH_NOT_FOUND;
+ }
+ // If all characters are equal, return the length of the match in the target string.
+ return matchLength;
}
/**
@@ -191,10 +264,9 @@ private static int lowercaseRFind(
}
/**
- * Lowercase UTF8String comparison used for UTF8_LCASE collation. While the default
- * UTF8String comparison is equivalent to a.toLowerCase().binaryCompare(b.toLowerCase()), this
- * method uses code points to compare the strings in a case-insensitive manner using ICU rules,
- * as well as handling special rules for one-to-many case mappings (see: lowerCaseCodePoints).
+ * Lowercase UTF8String comparison used for UTF8_LCASE collation. This method uses lowercased
+ * code points to compare the strings in a case-insensitive manner using ICU rules, taking into
+ * account special rules for one-to-many case mappings (see: lowerCaseCodePoints).
*
* @param left The first UTF8String to compare.
* @param right The second UTF8String to compare.
@@ -238,115 +310,82 @@ private static int compareLowerCaseSlow(final UTF8String left, final UTF8String
return lowerCaseCodePoints(left).binaryCompare(lowerCaseCodePoints(right));
}
- /*
+ /**
* Performs string replacement for ICU collations by searching for instances of the search
- * string in the `src` string, with respect to the specified collation, and then replacing
+ * string in the `target` string, with respect to the specified collation, and then replacing
* them with the replace string. The method returns a new UTF8String with all instances of the
* search string replaced using the replace string. Similar to UTF8String.findInSet behavior
- * used for UTF8_BINARY, the method returns the `src` string if the `search` string is empty.
+ * used for UTF8_BINARY, the method returns the `target` string if the `search` string is empty.
*
- * @param src the string to be searched in
+ * @param target the string to be searched in
* @param search the string to be searched for
* @param replace the string to be used as replacement
* @param collationId the collation ID to use for string search
* @return the position of the first occurrence of `match` in `set`
*/
- public static UTF8String replace(final UTF8String src, final UTF8String search,
+ public static UTF8String replace(final UTF8String target, final UTF8String search,
final UTF8String replace, final int collationId) {
// This collation aware implementation is based on existing implementation on UTF8String
- if (src.numBytes() == 0 || search.numBytes() == 0) {
- return src;
- }
-
- StringSearch stringSearch = CollationFactory.getStringSearch(src, search, collationId);
-
- // Find the first occurrence of the search string.
- int end = stringSearch.next();
- if (end == StringSearch.DONE) {
- // Search string was not found, so string is unchanged.
- return src;
+ if (target.numBytes() == 0 || search.numBytes() == 0) {
+ return target;
}
- // Initialize byte positions
- int c = 0;
- int byteStart = 0; // position in byte
- int byteEnd = 0; // position in byte
- while (byteEnd < src.numBytes() && c < end) {
- byteEnd += UTF8String.numBytesForFirstByte(src.getByte(byteEnd));
- c += 1;
- }
+ String targetStr = target.toValidString();
+ String searchStr = search.toValidString();
+ StringSearch stringSearch = CollationFactory.getStringSearch(targetStr, searchStr, collationId);
- // At least one match was found. Estimate space needed for result.
- // The 16x multiplier here is chosen to match commons-lang3's implementation.
- int increase = Math.max(0, Math.abs(replace.numBytes() - search.numBytes())) * 16;
- final UTF8StringBuilder buf = new UTF8StringBuilder(src.numBytes() + increase);
- while (end != StringSearch.DONE) {
- buf.appendBytes(src.getBaseObject(), src.getBaseOffset() + byteStart, byteEnd - byteStart);
- buf.append(replace);
-
- // Move byteStart to the beginning of the current match
- byteStart = byteEnd;
- int cs = c;
- // Move cs to the end of the current match
- // This is necessary because the search string may contain 'multi-character' characters
- while (byteStart < src.numBytes() && cs < c + stringSearch.getMatchLength()) {
- byteStart += UTF8String.numBytesForFirstByte(src.getByte(byteStart));
- cs += 1;
- }
- // Go to next match
- end = stringSearch.next();
- // Update byte positions
- while (byteEnd < src.numBytes() && c < end) {
- byteEnd += UTF8String.numBytesForFirstByte(src.getByte(byteEnd));
- c += 1;
- }
+ StringBuilder sb = new StringBuilder();
+ int start = 0;
+ int matchStart = stringSearch.first();
+ while (matchStart != StringSearch.DONE) {
+ sb.append(targetStr, start, matchStart);
+ sb.append(replace.toValidString());
+ start = matchStart + stringSearch.getMatchLength();
+ matchStart = stringSearch.next();
}
- buf.appendBytes(src.getBaseObject(), src.getBaseOffset() + byteStart,
- src.numBytes() - byteStart);
- return buf.build();
+ sb.append(targetStr, start, targetStr.length());
+ return UTF8String.fromString(sb.toString());
}
- /*
+ /**
* Performs string replacement for UTF8_LCASE collation by searching for instances of the search
- * string in the src string, with respect to lowercased string versions, and then replacing
+ * string in the target string, with respect to lowercased string versions, and then replacing
* them with the replace string. The method returns a new UTF8String with all instances of the
* search string replaced using the replace string. Similar to UTF8String.findInSet behavior
- * used for UTF8_BINARY, the method returns the `src` string if the `search` string is empty.
+ * used for UTF8_BINARY, the method returns the `target` string if the `search` string is empty.
*
- * @param src the string to be searched in
+ * @param target the string to be searched in
* @param search the string to be searched for
* @param replace the string to be used as replacement
- * @param collationId the collation ID to use for string search
* @return the position of the first occurrence of `match` in `set`
*/
- public static UTF8String lowercaseReplace(final UTF8String src, final UTF8String search,
+ public static UTF8String lowercaseReplace(final UTF8String target, final UTF8String search,
final UTF8String replace) {
- if (src.numBytes() == 0 || search.numBytes() == 0) {
- return src;
+ if (target.numBytes() == 0 || search.numBytes() == 0) {
+ return target;
}
- // TODO(SPARK-48725): Use lowerCaseCodePoints instead of UTF8String.toLowerCase.
- UTF8String lowercaseSearch = search.toLowerCase();
+ UTF8String lowercaseSearch = lowerCaseCodePoints(search);
int start = 0;
- int end = lowercaseFind(src, lowercaseSearch, start);
+ int end = lowercaseFind(target, lowercaseSearch, start);
if (end == -1) {
// Search string was not found, so string is unchanged.
- return src;
+ return target;
}
// At least one match was found. Estimate space needed for result.
// The 16x multiplier here is chosen to match commons-lang3's implementation.
int increase = Math.max(0, replace.numBytes() - search.numBytes()) * 16;
- final UTF8StringBuilder buf = new UTF8StringBuilder(src.numBytes() + increase);
+ final UTF8StringBuilder buf = new UTF8StringBuilder(target.numBytes() + increase);
while (end != -1) {
- buf.append(src.substring(start, end));
+ buf.append(target.substring(start, end));
buf.append(replace);
// Update character positions
- start = end + lowercaseMatchLengthFrom(src, lowercaseSearch, end);
- end = lowercaseFind(src, lowercaseSearch, start);
+ start = end + lowercaseMatchLengthFrom(target, lowercaseSearch, end);
+ end = lowercaseFind(target, lowercaseSearch, start);
}
- buf.append(src.substring(start, src.numChars()));
+ buf.append(target.substring(start, target.numChars()));
return buf.build();
}
@@ -433,28 +472,16 @@ private static UTF8String toLowerCaseSlow(final UTF8String target, final int col
*/
private static void appendLowercaseCodePoint(final int codePoint, final StringBuilder sb) {
int lowercaseCodePoint = getLowercaseCodePoint(codePoint);
- if (lowercaseCodePoint == CODE_POINT_COMBINED_LOWERCASE_I_DOT) {
+ if (lowercaseCodePoint == COMBINED_ASCII_SMALL_I_COMBINING_DOT) {
// Latin capital letter I with dot above is mapped to 2 lowercase characters.
- sb.appendCodePoint(0x0069);
- sb.appendCodePoint(0x0307);
+ sb.appendCodePoint(SpecialCodePointConstants.ASCII_SMALL_I);
+ sb.appendCodePoint(SpecialCodePointConstants.COMBINING_DOT);
} else {
// All other characters should follow context-unaware ICU single-code point case mapping.
sb.appendCodePoint(lowercaseCodePoint);
}
}
- /**
- * `CODE_POINT_COMBINED_LOWERCASE_I_DOT` is an internal representation of the combined lowercase
- * code point for ASCII lowercase letter i with an additional combining dot character (U+0307).
- * This integer value is not a valid code point itself, but rather an artificial code point
- * marker used to represent the two lowercase characters that are the result of converting the
- * uppercase Turkish dotted letter I with a combining dot character (U+0130) to lowercase.
- */
- private static final int CODE_POINT_LOWERCASE_I = 0x69;
- private static final int CODE_POINT_COMBINING_DOT = 0x307;
- private static final int CODE_POINT_COMBINED_LOWERCASE_I_DOT =
- CODE_POINT_LOWERCASE_I << 16 | CODE_POINT_COMBINING_DOT;
-
/**
* Returns the lowercase version of the provided code point, with special handling for
* one-to-many case mappings (i.e. characters that map to multiple characters in lowercase) and
@@ -462,15 +489,15 @@ private static void appendLowercaseCodePoint(final int codePoint, final StringBu
* the position in the string relative to other characters in lowercase).
*/
private static int getLowercaseCodePoint(final int codePoint) {
- if (codePoint == 0x0130) {
+ if (codePoint == SpecialCodePointConstants.CAPITAL_I_WITH_DOT_ABOVE) {
// Latin capital letter I with dot above is mapped to 2 lowercase characters.
- return CODE_POINT_COMBINED_LOWERCASE_I_DOT;
+ return COMBINED_ASCII_SMALL_I_COMBINING_DOT;
}
- else if (codePoint == 0x03C2) {
+ else if (codePoint == SpecialCodePointConstants.GREEK_FINAL_SIGMA) {
// Greek final and non-final letter sigma should be mapped the same. This is achieved by
// mapping Greek small final sigma (U+03C2) to Greek small non-final sigma (U+03C3). Capital
// letter sigma (U+03A3) is mapped to small non-final sigma (U+03C3) in the `else` branch.
- return 0x03C3;
+ return SpecialCodePointConstants.GREEK_SMALL_SIGMA;
}
else {
// All other characters should follow context-unaware ICU single-code point case mapping.
@@ -522,6 +549,152 @@ public static UTF8String toTitleCase(final UTF8String target, final int collatio
BreakIterator.getWordInstance(locale)));
}
+ /**
+ * This 'HashMap' is introduced as a performance speedup. Since title-casing a codepoint can
+ * result in more than a single codepoint, for correctness, we would use
+ * 'UCharacter.toTitleCase(String)' which returns a 'String'. If we use
+ * 'UCharacter.toTitleCase(int)' (the version of the same function which converts a single
+ * codepoint to its title-case codepoint), it would be faster than the previously mentioned
+ * version, but the problem here is that we don't handle when title-casing a codepoint yields more
+ * than 1 codepoint. Since there are only 48 codepoints that are mapped to more than 1 codepoint
+ * when title-cased, they are precalculated here, so that the faster function for title-casing
+ * could be used in combination with this 'HashMap' in the method 'appendCodepointToTitleCase'.
+ */
+ private static final HashMap<Integer, String> codepointOneToManyTitleCaseLookupTable =
+ new HashMap<>(){{
+ StringBuilder sb = new StringBuilder();
+ for (int i = Character.MIN_CODE_POINT; i <= Character.MAX_CODE_POINT; ++i) {
+ sb.appendCodePoint(i);
+ String titleCase = UCharacter.toTitleCase(sb.toString(), null);
+ if (titleCase.codePointCount(0, titleCase.length()) > 1) {
+ put(i, titleCase);
+ }
+ sb.setLength(0);
+ }
+ }};
+
+ /**
+ * Title-casing a string using ICU case mappings. Iterates over the string and title-cases
+ * the first character in each word, and lowercases every other character. Handles lowercasing
+ * capital Greek letter sigma ('Σ') separately, taking into account if it should be a small final
+ * Greek sigma ('ς') or small non-final Greek sigma ('σ'). Words are separated by ASCII
+ * space(\u0020).
+ *
+ * @param source UTF8String to be title cased
+ * @return title cased source
+ */
+ public static UTF8String toTitleCaseICU(UTF8String source) {
+ // In the default UTF8String implementation, `toLowerCase` method implicitly does UTF8String
+ // validation (replacing invalid UTF-8 byte sequences with Unicode replacement character
+ // U+FFFD), but now we have to do the validation manually.
+ source = source.makeValid();
+
+ // Building the title cased source with 'sb'.
+ UTF8StringBuilder sb = new UTF8StringBuilder();
+
+ // 'isNewWord' is true if the current character is the beginning of a word, false otherwise.
+ boolean isNewWord = true;
+ // We are maintaining if the current character is preceded by a cased letter.
+ // This is used when lowercasing capital Greek letter sigma ('Σ'), to figure out if it should be
+ // lowercased into σ or ς.
+ boolean precededByCasedLetter = false;
+
+ // 'offset' is a byte offset in source's byte array pointing to the beginning of the character
+ // that we need to process next.
+ int offset = 0;
+ int len = source.numBytes();
+
+ while (offset < len) {
+ // We will actually call 'codePointFrom()' 2 times for each character in the worst case (once
+ // here, and once in 'followedByCasedLetter'). Example of a string where we call it 2 times
+ // for almost every character is 'ΣΣΣΣΣ' (a string consisting only of Greek capital sigma)
+ // and 'Σ`````' (a string consisting of a Greek capital sigma, followed by case-ignorable
+ // characters).
+ int codepoint = source.codePointFrom(offset);
+ // Appending the correctly cased character onto 'sb'.
+ appendTitleCasedCodepoint(sb, codepoint, isNewWord, precededByCasedLetter, source, offset);
+ // Updating 'isNewWord', 'precededByCasedLetter' and 'offset' to be ready for the next
+ // character that we will process.
+ isNewWord = (codepoint == SpecialCodePointConstants.ASCII_SPACE);
+ if (!UCharacter.hasBinaryProperty(codepoint, UProperty.CASE_IGNORABLE)) {
+ precededByCasedLetter = UCharacter.hasBinaryProperty(codepoint, UProperty.CASED);
+ }
+ offset += UTF8String.numBytesForFirstByte(source.getByte(offset));
+ }
+ return sb.build();
+ }
+
+ private static void appendTitleCasedCodepoint(
+ UTF8StringBuilder sb,
+ int codepoint,
+ boolean isAfterAsciiSpace,
+ boolean precededByCasedLetter,
+ UTF8String source,
+ int offset) {
+ if (isAfterAsciiSpace) {
+ // Title-casing a character if it is in the beginning of a new word.
+ appendCodepointToTitleCase(sb, codepoint);
+ return;
+ }
+ if (codepoint == SpecialCodePointConstants.GREEK_CAPITAL_SIGMA) {
+ // Handling capital Greek letter sigma ('Σ').
+ appendLowerCasedGreekCapitalSigma(sb, precededByCasedLetter, source, offset);
+ return;
+ }
+ // If it's not the beginning of a word, or a capital Greek letter sigma ('Σ'), we lowercase the
+ // character. We specially handle 'CAPITAL_I_WITH_DOT_ABOVE'.
+ if (codepoint == SpecialCodePointConstants.CAPITAL_I_WITH_DOT_ABOVE) {
+ sb.appendCodePoint(SpecialCodePointConstants.ASCII_SMALL_I);
+ sb.appendCodePoint(SpecialCodePointConstants.COMBINING_DOT);
+ return;
+ }
+ sb.appendCodePoint(UCharacter.toLowerCase(codepoint));
+ }
+
+ private static void appendLowerCasedGreekCapitalSigma(
+ UTF8StringBuilder sb,
+ boolean precededByCasedLetter,
+ UTF8String source,
+ int offset) {
+ int codepoint = (!followedByCasedLetter(source, offset) && precededByCasedLetter)
+ ? SpecialCodePointConstants.GREEK_FINAL_SIGMA
+ : SpecialCodePointConstants.GREEK_SMALL_SIGMA;
+ sb.appendCodePoint(codepoint);
+ }
+
+ /**
+ * Checks if the character beginning at 'offset' (in 'source' byte array) is followed by a cased
+ * letter.
+ */
+ private static boolean followedByCasedLetter(UTF8String source, int offset) {
+ // Moving the offset one character forward, so we could start the linear search from there.
+ offset += UTF8String.numBytesForFirstByte(source.getByte(offset));
+ int len = source.numBytes();
+
+ while (offset < len) {
+ int codepoint = source.codePointFrom(offset);
+
+ if (UCharacter.hasBinaryProperty(codepoint, UProperty.CASE_IGNORABLE)) {
+ offset += UTF8String.numBytesForFirstByte(source.getByte(offset));
+ continue;
+ }
+ return UCharacter.hasBinaryProperty(codepoint, UProperty.CASED);
+ }
+ return false;
+ }
+
+ /**
+ * Appends title-case of a single character to a 'StringBuilder' using the ICU root locale rules.
+ */
+ private static void appendCodepointToTitleCase(UTF8StringBuilder sb, int codepoint) {
+ String toTitleCase = codepointOneToManyTitleCaseLookupTable.get(codepoint);
+ if (toTitleCase == null) {
+ sb.appendCodePoint(UCharacter.toTitleCase(codepoint));
+ } else {
+ sb.append(toTitleCase);
+ }
+ }
+
/*
* Returns the position of the first occurrence of the match string in the set string,
* counting ASCII commas as delimiters. The match string is compared in a collation-aware manner,
@@ -559,6 +732,58 @@ public static int findInSet(final UTF8String match, final UTF8String set, int co
return 0;
}
+ /**
+ * Checks whether the target string contains the pattern string, with respect to the UTF8_LCASE
+ * collation. This method generally works with respect to code-point based comparison logic.
+ *
+ * @param target the string to be searched in
+ * @param pattern the string to be searched for
+ * @return whether the target string contains the pattern string
+ */
+ public static boolean lowercaseContains(final UTF8String target, final UTF8String pattern) {
+ // Fast path for ASCII-only strings.
+ if (target.isFullAscii() && pattern.isFullAscii()) {
+ return target.toLowerCase().contains(pattern.toLowerCase());
+ }
+ // Slow path for non-ASCII strings.
+ return CollationAwareUTF8String.lowercaseIndexOfSlow(target, pattern, 0) >= 0;
+ }
+
+ /**
+ * Checks whether the target string starts with the pattern string, with respect to the UTF8_LCASE
+ * collation. This method generally works with respect to code-point based comparison logic.
+ *
+ * @param target the string to be searched in
+ * @param pattern the string to be searched for
+ * @return whether the target string starts with the pattern string
+ */
+ public static boolean lowercaseStartsWith(final UTF8String target, final UTF8String pattern) {
+ // Fast path for ASCII-only strings.
+ if (target.isFullAscii() && pattern.isFullAscii()) {
+ return target.toLowerCase().startsWith(pattern.toLowerCase());
+ }
+ // Slow path for non-ASCII strings.
+ return CollationAwareUTF8String.lowercaseMatchFrom(target, lowerCaseCodePointsSlow(pattern), 0);
+ }
+
+ /**
+ * Checks whether the target string ends with the pattern string, with respect to the UTF8_LCASE
+ * collation. This method generally works with respect to code-point based comparison logic.
+ *
+ * @param target the string to be searched in
+ * @param pattern the string to be searched for
+ * @return whether the target string ends with the pattern string
+ */
+ public static boolean lowercaseEndsWith(final UTF8String target, final UTF8String pattern) {
+ // Fast path for ASCII-only strings.
+ if (target.isFullAscii() && pattern.isFullAscii()) {
+ return target.toLowerCase().endsWith(pattern.toLowerCase());
+ }
+ // Slow path for non-ASCII strings.
+ return CollationAwareUTF8String.lowercaseMatchUntil(target, lowerCaseCodePointsSlow(pattern),
+ target.numChars());
+ }
+
/**
* Returns the position of the first occurrence of the pattern string in the target string,
* starting from the specified position (0-based index referring to character position in
@@ -573,30 +798,76 @@ public static int findInSet(final UTF8String match, final UTF8String set, int co
public static int lowercaseIndexOf(final UTF8String target, final UTF8String pattern,
final int start) {
if (pattern.numChars() == 0) return target.indexOfEmpty(start);
- return lowercaseFind(target, pattern.toLowerCase(), start);
+ if (target.isFullAscii() && pattern.isFullAscii()) {
+ return target.toLowerCase().indexOf(pattern.toLowerCase(), start);
+ }
+ return lowercaseIndexOfSlow(target, pattern, start);
+ }
+
+ private static int lowercaseIndexOfSlow(final UTF8String target, final UTF8String pattern,
+ final int start) {
+ return lowercaseFind(target, lowerCaseCodePoints(pattern), start);
}
public static int indexOf(final UTF8String target, final UTF8String pattern,
final int start, final int collationId) {
if (pattern.numBytes() == 0) return target.indexOfEmpty(start);
if (target.numBytes() == 0) return MATCH_NOT_FOUND;
-
- StringSearch stringSearch = CollationFactory.getStringSearch(target, pattern, collationId);
- stringSearch.setIndex(start);
-
- return stringSearch.next();
+ // Initialize the string search with respect to the specified ICU collation.
+ String targetStr = target.toValidString();
+ String patternStr = pattern.toValidString();
+ // Check if `start` is out of bounds. The provided offset `start` is given in number of
+ // codepoints, so a simple `targetStr.length` check is not sufficient here. This check is
+ // needed because `String.offsetByCodePoints` throws an `IndexOutOfBoundsException`
+ // exception when the offset is out of bounds.
+ if (targetStr.codePointCount(0, targetStr.length()) <= start) return MATCH_NOT_FOUND;
+ StringSearch stringSearch =
+ CollationFactory.getStringSearch(targetStr, patternStr, collationId);
+ stringSearch.setOverlapping(true);
+ // Start the search from `start`-th code point (NOT necessarily from the `start`-th character).
+ int startIndex = targetStr.offsetByCodePoints(0, start);
+ stringSearch.setIndex(startIndex);
+ // Perform the search and return the next result, starting from the specified position.
+ int searchIndex = stringSearch.next();
+ if (searchIndex == StringSearch.DONE) {
+ return MATCH_NOT_FOUND;
+ }
+ // Convert the search index from character count to code point count.
+ int indexOf = targetStr.codePointCount(0, searchIndex);
+ if (indexOf < start) {
+ return MATCH_NOT_FOUND;
+ }
+ return indexOf;
}
- public static int find(UTF8String target, UTF8String pattern, int start,
- int collationId) {
- assert (pattern.numBytes() > 0);
-
- StringSearch stringSearch = CollationFactory.getStringSearch(target, pattern, collationId);
- // Set search start position (start from character at start position)
- stringSearch.setIndex(target.bytePosToChar(start));
+ private static int findIndex(final StringSearch stringSearch, int count) {
+ assert(count >= 0);
+ int index = 0;
+ while (count > 0) {
+ int nextIndex = stringSearch.next();
+ if (nextIndex == StringSearch.DONE) {
+ return MATCH_NOT_FOUND;
+ } else if (nextIndex == index && index != 0) {
+ stringSearch.setIndex(stringSearch.getIndex() + stringSearch.getMatchLength());
+ } else {
+ count--;
+ index = nextIndex;
+ }
+ }
+ return index;
+ }
- // Return either the byte position or -1 if not found
- return target.charPosToByte(stringSearch.next());
+ private static int findIndexReverse(final StringSearch stringSearch, int count) {
+ assert(count >= 0);
+ int index = 0;
+ while (count > 0) {
+ index = stringSearch.previous();
+ if (index == StringSearch.DONE) {
+ return MATCH_NOT_FOUND;
+ }
+ count--;
+ }
+ return index + stringSearch.getMatchLength();
}
public static UTF8String subStringIndex(final UTF8String string, final UTF8String delimiter,
@@ -604,63 +875,30 @@ public static UTF8String subStringIndex(final UTF8String string, final UTF8Strin
if (delimiter.numBytes() == 0 || count == 0 || string.numBytes() == 0) {
return UTF8String.EMPTY_UTF8;
}
+ String str = string.toValidString();
+ String delim = delimiter.toValidString();
+ StringSearch stringSearch = CollationFactory.getStringSearch(str, delim, collationId);
+ stringSearch.setOverlapping(true);
if (count > 0) {
- int idx = -1;
- while (count > 0) {
- idx = find(string, delimiter, idx + 1, collationId);
- if (idx >= 0) {
- count --;
- } else {
- // can not find enough delim
- return string;
- }
- }
- if (idx == 0) {
+ // If the count is positive, we search for the count-th delimiter from the left.
+ int searchIndex = findIndex(stringSearch, count);
+ if (searchIndex == MATCH_NOT_FOUND) {
+ return string;
+ } else if (searchIndex == 0) {
return UTF8String.EMPTY_UTF8;
+ } else {
+ return UTF8String.fromString(str.substring(0, searchIndex));
}
- byte[] bytes = new byte[idx];
- copyMemory(string.getBaseObject(), string.getBaseOffset(), bytes, BYTE_ARRAY_OFFSET, idx);
- return UTF8String.fromBytes(bytes);
-
} else {
- count = -count;
-
- StringSearch stringSearch = CollationFactory
- .getStringSearch(string, delimiter, collationId);
-
- int start = string.numChars() - 1;
- int lastMatchLength = 0;
- int prevStart = -1;
- while (count > 0) {
- stringSearch.reset();
- prevStart = -1;
- int matchStart = stringSearch.next();
- lastMatchLength = stringSearch.getMatchLength();
- while (matchStart <= start) {
- if (matchStart != StringSearch.DONE) {
- // Found a match, update the start position
- prevStart = matchStart;
- matchStart = stringSearch.next();
- } else {
- break;
- }
- }
-
- if (prevStart == -1) {
- // can not find enough delim
+ // If the count is negative, we search for the count-th delimiter from the right.
+ int searchIndex = findIndexReverse(stringSearch, -count);
+ if (searchIndex == MATCH_NOT_FOUND) {
return string;
- } else {
- start = prevStart - 1;
- count--;
- }
- }
-
- int resultStart = prevStart + lastMatchLength;
- if (resultStart == string.numChars()) {
- return UTF8String.EMPTY_UTF8;
+ } else if (searchIndex == str.length()) {
+ return UTF8String.EMPTY_UTF8;
+ } else {
+ return UTF8String.fromString(str.substring(searchIndex));
}
-
- return string.substring(resultStart, string.numChars());
}
}
@@ -670,7 +908,7 @@ public static UTF8String lowercaseSubStringIndex(final UTF8String string,
return UTF8String.EMPTY_UTF8;
}
- UTF8String lowercaseDelimiter = delimiter.toLowerCase();
+ UTF8String lowercaseDelimiter = lowerCaseCodePoints(delimiter);
if (count > 0) {
// Search left to right (note: the start code point is inclusive).
@@ -750,11 +988,11 @@ public static UTF8String lowercaseTranslate(final UTF8String input,
}
// Special handling for letter i (U+0069) followed by a combining dot (U+0307). By ensuring
// that `CODE_POINT_LOWERCASE_I` is buffered, we guarantee finding a max-length match.
- if (lowercaseDict.containsKey(CODE_POINT_COMBINED_LOWERCASE_I_DOT) &&
- codePoint == CODE_POINT_LOWERCASE_I && inputIter.hasNext()) {
+ if (lowercaseDict.containsKey(COMBINED_ASCII_SMALL_I_COMBINING_DOT)
+ && codePoint == SpecialCodePointConstants.ASCII_SMALL_I && inputIter.hasNext()) {
int nextCodePoint = inputIter.next();
- if (nextCodePoint == CODE_POINT_COMBINING_DOT) {
- codePoint = CODE_POINT_COMBINED_LOWERCASE_I_DOT;
+ if (nextCodePoint == SpecialCodePointConstants.COMBINING_DOT) {
+ codePoint = COMBINED_ASCII_SMALL_I_COMBINING_DOT;
} else {
codePointBuffer = nextCodePoint;
}
@@ -842,6 +1080,24 @@ public static UTF8String translate(final UTF8String input,
return UTF8String.fromString(sb.toString());
}
+ /**
+ * Trims the `srcString` string from both ends of the string using the specified `trimString`
+ * characters, with respect to the UTF8_BINARY trim collation. String trimming is performed by
+ * first trimming the left side of the string, and then trimming the right side of the string.
+ * The method returns the trimmed string. If the `trimString` is null, the method returns null.
+ *
+ * @param srcString the input string to be trimmed from both ends of the string
+ * @param trimString the trim string characters to trim
+ * @param collationId the collation ID to use for string trim
+ * @return the trimmed string (for UTF8_BINARY collation)
+ */
+ public static UTF8String binaryTrim(
+ final UTF8String srcString,
+ final UTF8String trimString,
+ final int collationId) {
+ return binaryTrimRight(srcString.trimLeft(trimString), trimString, collationId);
+ }
+
/**
* Trims the `srcString` string from both ends of the string using the specified `trimString`
* characters, with respect to the UTF8_LCASE collation. String trimming is performed by
@@ -850,12 +1106,14 @@ public static UTF8String translate(final UTF8String input,
*
* @param srcString the input string to be trimmed from both ends of the string
* @param trimString the trim string characters to trim
+ * @param collationId the collation ID to use for string trim
* @return the trimmed string (for UTF8_LCASE collation)
*/
public static UTF8String lowercaseTrim(
final UTF8String srcString,
- final UTF8String trimString) {
- return lowercaseTrimRight(lowercaseTrimLeft(srcString, trimString), trimString);
+ final UTF8String trimString,
+ final int collationId) {
+ return lowercaseTrimRight(lowercaseTrimLeft(srcString, trimString), trimString, collationId);
}
/**
@@ -883,7 +1141,8 @@ public static UTF8String trim(
* the left side, until reaching a character whose lowercased code point is not in the hash set.
* Finally, the method returns the substring from that position to the end of `srcString`.
* If `trimString` is null, null is returned. If `trimString` is empty, `srcString` is returned.
- *
+ * Note: since RTRIM is currently the only supported trim collation, trimLeft is not modified
+ * to support other trim collations; this should be done when adding TRIM and LTRIM collations.
* @param srcString the input string to be trimmed from the left end of the string
* @param trimString the trim string characters to trim
* @return the trimmed string (for UTF8_LCASE collation)
@@ -902,20 +1161,29 @@ public static UTF8String lowercaseTrimLeft(
while (trimIter.hasNext()) trimChars.add(getLowercaseCodePoint(trimIter.next()));
// Iterate over `srcString` from the left to find the first character that is not in the set.
- int searchIndex = 0, codePoint;
+ int searchIndex = 0, codePoint, codePointBuffer = -1;
Iterator<Integer> srcIter = srcString.codePointIterator();
while (srcIter.hasNext()) {
- codePoint = getLowercaseCodePoint(srcIter.next());
+ // Get the next code point from either the buffer or the iterator.
+ if (codePointBuffer != -1) {
+ codePoint = codePointBuffer;
+ codePointBuffer = -1;
+ }
+ else {
+ codePoint = getLowercaseCodePoint(srcIter.next());
+ }
// Special handling for Turkish dotted uppercase letter I.
- if (codePoint == CODE_POINT_LOWERCASE_I && srcIter.hasNext() &&
- trimChars.contains(CODE_POINT_COMBINED_LOWERCASE_I_DOT)) {
- int nextCodePoint = getLowercaseCodePoint(srcIter.next());
- if ((trimChars.contains(codePoint) && trimChars.contains(nextCodePoint))
- || nextCodePoint == CODE_POINT_COMBINING_DOT) {
+ if (codePoint == SpecialCodePointConstants.ASCII_SMALL_I && srcIter.hasNext() &&
+ trimChars.contains(COMBINED_ASCII_SMALL_I_COMBINING_DOT)) {
+ codePointBuffer = codePoint;
+ codePoint = getLowercaseCodePoint(srcIter.next());
+ if (codePoint == SpecialCodePointConstants.COMBINING_DOT) {
searchIndex += 2;
- }
- else {
- if (trimChars.contains(codePoint)) ++searchIndex;
+ codePointBuffer = -1;
+ } else if (trimChars.contains(codePointBuffer)) {
+ ++searchIndex;
+ codePointBuffer = codePoint;
+ } else {
break;
}
} else if (trimChars.contains(codePoint)) {
@@ -937,7 +1205,9 @@ public static UTF8String lowercaseTrimLeft(
* character in `trimString`, until reaching a character that is not found in `trimString`.
* Finally, the method returns the substring from that position to the end of `srcString`.
* If `trimString` is null, null is returned. If `trimString` is empty, `srcString` is returned.
- *
+ * Note: since RTRIM is currently the only supported trim collation, trimLeft is not modified
+ * to support other trim collations; this should be done when adding TRIM and LTRIM
+ * collations.
* @param srcString the input string to be trimmed from the left end of the string
* @param trimString the trim string characters to trim
* @param collationId the collation ID to use for string trimming
@@ -957,7 +1227,7 @@ public static UTF8String trimLeft(
CodePointIteratorType.CODE_POINT_ITERATOR_MAKE_VALID);
while (trimIter.hasNext()) {
int codePoint = trimIter.next();
- trimChars.putIfAbsent(codePoint, String.valueOf((char) codePoint));
+ trimChars.putIfAbsent(codePoint, new String(Character.toChars(codePoint)));
}
// Iterate over srcString from the left and find the first character that is not in trimChars.
@@ -985,22 +1255,103 @@ public static UTF8String trimLeft(
// Return the substring from the calculated position until the end of the string.
return UTF8String.fromString(src.substring(charIndex));
}
+ /**
+ * Trims the `srcString` string from the right side using the specified `trimString` characters,
+ * with respect to the UTF8_BINARY trim collation. For UTF8_BINARY trim collation, the method has
+ * one special case to cover, compared to the trimRight function for the regular UTF8_BINARY collation.
+ * Trailing spaces should be ignored in case of trim collation (rtrim for example) and if
+ * trimString does not contain spaces. In this case, the method trims the string from the right
+ * and after call of regular trim functions returns back trimmed spaces as those should not get
+ * removed.
+ * @param srcString the input string to be trimmed from the right end of the string
+ * @param trimString the trim string characters to trim
+ * @param collationId the collation ID to use for string trim
+ * @return the trimmed string (for UTF_BINARY collation)
+ */
+ public static UTF8String binaryTrimRight(
+ final UTF8String srcString,
+ final UTF8String trimString,
+ final int collationId) {
+ // Matching the default UTF8String behavior for null `trimString`.
+ if (trimString == null) {
+ return null;
+ }
+
+ // Create a hash set of code points for all characters of `trimString`.
+ HashSet trimChars = new HashSet<>();
+ Iterator trimIter = trimString.codePointIterator();
+ while (trimIter.hasNext()) trimChars.add(trimIter.next());
+
+ // Iterate over `srcString` from the right to find the first character that is not in the set.
+ int searchIndex = srcString.numChars(), codePoint, codePointBuffer = -1;
+
+ // In cases of trim collation (rtrim for example) trailing spaces should be ignored.
+ // If trimString contains spaces this behaviour is not important as they would get trimmed
+ // anyway. However, if it is not the case they should be ignored and then appended after
+ // trimming other characters.
+ int lastNonSpaceByteIdx = srcString.numBytes(), lastNonSpaceCharacterIdx = srcString.numChars();
+ if (!trimChars.contains(SpecialCodePointConstants.ASCII_SPACE) &&
+ CollationFactory.ignoresSpacesInTrimFunctions(
+ collationId, /*isLTrim=*/ false, /*isRTrim=*/true)) {
+ while (lastNonSpaceByteIdx > 0 &&
+ srcString.getByte(lastNonSpaceByteIdx - 1) == ' ') {
+ --lastNonSpaceByteIdx;
+ }
+ // If the src string contains only spaces there is no need to do any trimming, since it's
+ // already checked that the trim string does not contain any spaces.
+ if (lastNonSpaceByteIdx == 0) {
+ return srcString;
+ }
+ searchIndex = lastNonSpaceCharacterIdx =
+ srcString.numChars() - (srcString.numBytes() - lastNonSpaceByteIdx);
+ }
+ Iterator srcIter = srcString.reverseCodePointIterator();
+ for (int i = lastNonSpaceCharacterIdx; i < srcString.numChars(); i++) {
+ srcIter.next();
+ }
+
+ while (srcIter.hasNext()) {
+ codePoint = srcIter.next();
+ if (trimChars.contains(codePoint)) {
+ --searchIndex;
+ }
+ else {
+ break;
+ }
+ }
+
+ // Return the substring from the start of the string to the calculated position and append
+ // trailing spaces if they were ignored
+ if (searchIndex == srcString.numChars()) {
+ return srcString;
+ }
+ if (lastNonSpaceCharacterIdx == srcString.numChars()) {
+ return srcString.substring(0, searchIndex);
+ }
+ return UTF8String.concat(
+ srcString.substring(0, searchIndex),
+ srcString.substring(lastNonSpaceCharacterIdx, srcString.numChars()));
+ }
/**
* Trims the `srcString` string from the right side using the specified `trimString` characters,
* with respect to the UTF8_LCASE collation. For UTF8_LCASE, the method first creates a hash
* set of lowercased code points in `trimString`, and then iterates over the `srcString` from
* the right side, until reaching a character whose lowercased code point is not in the hash set.
+ * In case of UTF8_LCASE trim collation and when trimString does not contain spaces, trailing
+ * spaces should be ignored. However, after trimming function call they should be appended back.
* Finally, the method returns the substring from the start of `srcString` until that position.
* If `trimString` is null, null is returned. If `trimString` is empty, `srcString` is returned.
*
* @param srcString the input string to be trimmed from the right end of the string
* @param trimString the trim string characters to trim
+ * @param collationId the collation ID to use for string trim
* @return the trimmed string (for UTF8_LCASE collation)
*/
public static UTF8String lowercaseTrimRight(
final UTF8String srcString,
- final UTF8String trimString) {
+ final UTF8String trimString,
+ final int collationId) {
// Matching the default UTF8String behavior for null `trimString`.
if (trimString == null) {
return null;
@@ -1012,20 +1363,53 @@ public static UTF8String lowercaseTrimRight(
while (trimIter.hasNext()) trimChars.add(getLowercaseCodePoint(trimIter.next()));
// Iterate over `srcString` from the right to find the first character that is not in the set.
- int searchIndex = srcString.numChars(), codePoint;
+ int searchIndex = srcString.numChars(), codePoint, codePointBuffer = -1;
+
+ // In cases of trim collation (rtrim for example) trailing spaces should be ignored.
+ // If trimString contains spaces this behaviour is not important as they would get trimmed
+ // anyway. However, if it is not the case they should be ignored and then appended after
+ // trimming other characters.
+ int lastNonSpaceByteIdx = srcString.numBytes(), lastNonSpaceCharacterIdx = srcString.numChars();
+ if (!trimChars.contains(SpecialCodePointConstants.ASCII_SPACE) &&
+ CollationFactory.ignoresSpacesInTrimFunctions(
+ collationId, /*isLTrim=*/ false, /*isRTrim=*/true)) {
+ while (lastNonSpaceByteIdx > 0 &&
+ srcString.getByte(lastNonSpaceByteIdx - 1) == ' ') {
+ --lastNonSpaceByteIdx;
+ }
+ // If the src string contains only spaces there is no need to do any trimming, since it's
+ // already checked that the trim string does not contain any spaces.
+ if (lastNonSpaceByteIdx == 0) {
+ return srcString;
+ }
+ searchIndex = lastNonSpaceCharacterIdx =
+ srcString.numChars() - (srcString.numBytes() - lastNonSpaceByteIdx);
+ }
Iterator srcIter = srcString.reverseCodePointIterator();
+ for (int i = lastNonSpaceCharacterIdx; i < srcString.numChars(); i++) {
+ srcIter.next();
+ }
+
while (srcIter.hasNext()) {
- codePoint = getLowercaseCodePoint(srcIter.next());
+ if (codePointBuffer != -1) {
+ codePoint = codePointBuffer;
+ codePointBuffer = -1;
+ }
+ else {
+ codePoint = getLowercaseCodePoint(srcIter.next());
+ }
// Special handling for Turkish dotted uppercase letter I.
- if (codePoint == CODE_POINT_COMBINING_DOT && srcIter.hasNext() &&
- trimChars.contains(CODE_POINT_COMBINED_LOWERCASE_I_DOT)) {
- int nextCodePoint = getLowercaseCodePoint(srcIter.next());
- if ((trimChars.contains(codePoint) && trimChars.contains(nextCodePoint))
- || nextCodePoint == CODE_POINT_LOWERCASE_I) {
+ if (codePoint == SpecialCodePointConstants.COMBINING_DOT && srcIter.hasNext() &&
+ trimChars.contains(COMBINED_ASCII_SMALL_I_COMBINING_DOT)) {
+ codePointBuffer = codePoint;
+ codePoint = getLowercaseCodePoint(srcIter.next());
+ if (codePoint == SpecialCodePointConstants.ASCII_SMALL_I) {
searchIndex -= 2;
- }
- else {
- if (trimChars.contains(codePoint)) --searchIndex;
+ codePointBuffer = -1;
+ } else if (trimChars.contains(codePointBuffer)) {
+ --searchIndex;
+ codePointBuffer = codePoint;
+ } else {
break;
}
} else if (trimChars.contains(codePoint)) {
@@ -1036,8 +1420,17 @@ public static UTF8String lowercaseTrimRight(
}
}
- // Return the substring from the start of the string to the calculated position.
- return searchIndex == srcString.numChars() ? srcString : srcString.substring(0, searchIndex);
+ // Return the substring from the start of the string to the calculated position and append
+ // trailing spaces if they were ignored
+ if (searchIndex == srcString.numChars()) {
+ return srcString;
+ }
+ if (lastNonSpaceCharacterIdx == srcString.numChars()) {
+ return srcString.substring(0, searchIndex);
+ }
+ return UTF8String.concat(
+ srcString.substring(0, searchIndex),
+ srcString.substring(lastNonSpaceCharacterIdx, srcString.numChars()));
}
/**
@@ -1067,14 +1460,33 @@ public static UTF8String trimRight(
CodePointIteratorType.CODE_POINT_ITERATOR_MAKE_VALID);
while (trimIter.hasNext()) {
int codePoint = trimIter.next();
- trimChars.putIfAbsent(codePoint, String.valueOf((char) codePoint));
+ trimChars.putIfAbsent(codePoint, new String(Character.toChars(codePoint)));
}
// Iterate over srcString from the left and find the first character that is not in trimChars.
String src = srcString.toValidString();
CharacterIterator target = new StringCharacterIterator(src);
Collator collator = CollationFactory.fetchCollation(collationId).collator;
- int charIndex = src.length(), longestMatchLen;
+ int charIndex = src.length(), longestMatchLen, lastNonSpacePosition = src.length();
+
+ // In cases of trim collation (rtrim for example) trailing spaces should be ignored.
+ // If trimString contains spaces this behaviour is not important as they would get trimmed
+ // anyway. However, if it is not the case they should be ignored and then appended after
+ // trimming other characters.
+ if (!trimChars.containsKey(SpecialCodePointConstants.ASCII_SPACE) &&
+ CollationFactory.ignoresSpacesInTrimFunctions(
+ collationId, /*isLTrim=*/ false, /*isRTrim=*/true)) {
+ while (lastNonSpacePosition > 0 && src.charAt(lastNonSpacePosition - 1) == ' ') {
+ --lastNonSpacePosition;
+ }
+ // If the src string contains only spaces there is no need to do any trimming, since it's
+ // already checked that the trim string does not contain any spaces.
+ if (lastNonSpacePosition == 0) {
+ return UTF8String.fromString(src);
+ }
+ charIndex = lastNonSpacePosition;
+ }
+
while (charIndex >= 0) {
longestMatchLen = 0;
for (String trim : trimChars.values()) {
@@ -1102,8 +1514,91 @@ public static UTF8String trimRight(
else charIndex -= longestMatchLen;
}
- // Return the substring from the start of the string until that position.
- return UTF8String.fromString(src.substring(0, charIndex));
+ // Return the substring from the start of the string until that position and append
+ // trailing spaces if they were ignored
+ if (charIndex == src.length()) {
+ return srcString;
+ }
+ if (lastNonSpacePosition == srcString.numChars()) {
+ return UTF8String.fromString(src.substring(0, charIndex));
+ }
+ return UTF8String.fromString(
+ src.substring(0, charIndex) +
+ src.substring(lastNonSpacePosition)
+ );
+ }
+
+ public static UTF8String[] splitSQL(final UTF8String input, final UTF8String delim,
+ final int limit, final int collationId) {
+ if (CollationFactory.fetchCollation(collationId).isUtf8BinaryType) {
+ return input.split(delim, limit);
+ } else if (CollationFactory.fetchCollation(collationId).isUtf8LcaseType) {
+ return lowercaseSplitSQL(input, delim, limit);
+ } else {
+ return icuSplitSQL(input, delim, limit, collationId);
+ }
+ }
+
+ public static UTF8String[] lowercaseSplitSQL(final UTF8String string, final UTF8String delimiter,
+ final int limit) {
+ if (delimiter.numBytes() == 0) return new UTF8String[] { string };
+ if (string.numBytes() == 0) return new UTF8String[] { UTF8String.EMPTY_UTF8 };
+
+ List strings = new ArrayList<>();
+ UTF8String lowercaseDelimiter = lowerCaseCodePoints(delimiter);
+ int startIndex = 0, nextMatch = 0, nextMatchLength;
+ while (nextMatch != MATCH_NOT_FOUND) {
+ if (limit > 0 && strings.size() == limit - 1) {
+ break;
+ }
+ nextMatch = lowercaseFind(string, lowercaseDelimiter, startIndex);
+ if (nextMatch != MATCH_NOT_FOUND) {
+ nextMatchLength = lowercaseMatchLengthFrom(string, lowercaseDelimiter, nextMatch);
+ strings.add(string.substring(startIndex, nextMatch));
+ startIndex = nextMatch + nextMatchLength;
+ }
+ }
+ if (startIndex <= string.numChars()) {
+ strings.add(string.substring(startIndex, string.numChars()));
+ }
+ if (limit == 0) {
+ // Remove trailing empty strings
+ int i = strings.size() - 1;
+ while (i >= 0 && strings.get(i).numBytes() == 0) {
+ strings.remove(i);
+ i--;
+ }
+ }
+ return strings.toArray(new UTF8String[0]);
+ }
+
+ public static UTF8String[] icuSplitSQL(final UTF8String string, final UTF8String delimiter,
+ final int limit, final int collationId) {
+ if (delimiter.numBytes() == 0) return new UTF8String[] { string };
+ if (string.numBytes() == 0) return new UTF8String[] { UTF8String.EMPTY_UTF8 };
+ List strings = new ArrayList<>();
+ String target = string.toValidString(), pattern = delimiter.toValidString();
+ StringSearch stringSearch = CollationFactory.getStringSearch(target, pattern, collationId);
+ int start = 0, end;
+ while ((end = stringSearch.next()) != StringSearch.DONE) {
+ if (limit > 0 && strings.size() == limit - 1) {
+ break;
+ }
+ strings.add(UTF8String.fromString(target.substring(start, end)));
+ start = end + stringSearch.getMatchLength();
+ }
+ if (start <= target.length()) {
+ strings.add(UTF8String.fromString(target.substring(start)));
+ }
+ if (limit == 0) {
+ // Remove trailing empty strings
+ int i = strings.size() - 1;
+ while (i >= 0 && strings.get(i).numBytes() == 0) {
+ strings.remove(i);
+ i--;
+ }
+ }
+ return strings.toArray(new UTF8String[0]);
}
// TODO: Add more collation-aware UTF8String operations here.
diff --git a/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationFactory.java b/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationFactory.java
index f13f66e384e0f..4064f830e92d8 100644
--- a/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationFactory.java
+++ b/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationFactory.java
@@ -23,12 +23,14 @@
import java.util.function.Function;
import java.util.function.BiFunction;
import java.util.function.ToLongFunction;
+import java.util.stream.Stream;
+import com.ibm.icu.text.CollationKey;
+import com.ibm.icu.text.Collator;
import com.ibm.icu.text.RuleBasedCollator;
import com.ibm.icu.text.StringSearch;
import com.ibm.icu.util.ULocale;
-import com.ibm.icu.text.CollationKey;
-import com.ibm.icu.text.Collator;
+import com.ibm.icu.util.VersionInfo;
import org.apache.spark.SparkException;
import org.apache.spark.unsafe.types.UTF8String;
@@ -88,6 +90,18 @@ public Optional getVersion() {
}
}
+ public record CollationMeta(
+ String catalog,
+ String schema,
+ String collationName,
+ String language,
+ String country,
+ String icuVersion,
+ String padAttribute,
+ boolean accentSensitivity,
+ boolean caseSensitivity,
+ String spaceTrimming) { }
+
/**
* Entry encapsulating all information about a collation.
*/
@@ -99,7 +113,8 @@ public static class Collation {
/**
* Version of the collation. This is the version of the ICU library Collator.
- * For non-ICU collations (e.g. UTF8 Binary) the version is set to "1.0".
+ * For UTF8 Binary the version is set to "1.0". For ICU collations and UTF8_LCASE
+ * (because it uses ICU mappings) the version is set to the version of the ICU library.
* When using ICU Collator this version is exposed through collator.getVersion().
* Whenever the collation is updated, the version should be updated as well or kept
* for backwards compatibility.
@@ -133,13 +148,31 @@ public static class Collation {
public final boolean supportsBinaryOrdering;
/**
- * Support for Lowercase Equality implies that it is possible to check equality on
- * byte by byte level, but only after calling "UTF8String.toLowerCase" on both arguments.
+ * Support for Lowercase Equality implies that it is possible to check equality on byte by
+ * byte level, but only after calling "UTF8String.lowerCaseCodePoints" on both arguments.
* This allows custom collation support for UTF8_LCASE collation in various Spark
* expressions, as this particular collation is not supported by the external ICU library.
*/
public final boolean supportsLowercaseEquality;
+ /**
+ * Support for Space Trimming implies that, based on the specifier (for now only right trim),
+ * leading, trailing or both spaces are removed from the input string before comparison.
+ */
+ public final boolean supportsSpaceTrimming;
+
+ /**
+ * Is Utf8 binary type as indicator if collation base type is UTF8 binary. Note currently only
+ * collations Utf8_Binary and Utf8_Binary_RTRIM are considered as Utf8 binary type.
+ */
+ public final boolean isUtf8BinaryType;
+
+ /**
+ * Is Utf8 lcase type as indicator if collation base type is UTF8 lcase. Note currently only
+ * collations Utf8_Lcase and Utf8_Lcase_RTRIM are considered as Utf8 Lcase type.
+ */
+ public final boolean isUtf8LcaseType;
+
public Collation(
String collationName,
String provider,
@@ -147,31 +180,27 @@ public Collation(
Comparator comparator,
String version,
ToLongFunction hashFunction,
- boolean supportsBinaryEquality,
- boolean supportsBinaryOrdering,
- boolean supportsLowercaseEquality) {
+ BiFunction equalsFunction,
+ boolean isUtf8BinaryType,
+ boolean isUtf8LcaseType,
+ boolean supportsSpaceTrimming) {
this.collationName = collationName;
this.provider = provider;
this.collator = collator;
this.comparator = comparator;
this.version = version;
this.hashFunction = hashFunction;
- this.supportsBinaryEquality = supportsBinaryEquality;
- this.supportsBinaryOrdering = supportsBinaryOrdering;
- this.supportsLowercaseEquality = supportsLowercaseEquality;
-
- // De Morgan's Law to check supportsBinaryOrdering => supportsBinaryEquality
- assert(!supportsBinaryOrdering || supportsBinaryEquality);
+ this.isUtf8BinaryType = isUtf8BinaryType;
+ this.isUtf8LcaseType = isUtf8LcaseType;
+ this.equalsFunction = equalsFunction;
+ this.supportsSpaceTrimming = supportsSpaceTrimming;
+ this.supportsBinaryEquality = !supportsSpaceTrimming && isUtf8BinaryType;
+ this.supportsBinaryOrdering = !supportsSpaceTrimming && isUtf8BinaryType;
+ this.supportsLowercaseEquality = !supportsSpaceTrimming && isUtf8LcaseType;
// No Collation can simultaneously support binary equality and lowercase equality
assert(!supportsBinaryEquality || !supportsLowercaseEquality);
assert(SUPPORTED_PROVIDERS.contains(provider));
-
- if (supportsBinaryEquality) {
- this.equalsFunction = UTF8String::equals;
- } else {
- this.equalsFunction = (s1, s2) -> this.comparator.compare(s1, s2) == 0;
- }
}
/**
@@ -186,7 +215,8 @@ public Collation(
* bit 29: 0 for UTF8_BINARY, 1 for ICU collations.
* bit 28-24: Reserved.
* bit 23-22: Reserved for version.
- * bit 21-18: Reserved for space trimming.
+ * bit 21-19: Zeroes, reserved for future trimmings.
+ * bit 18: 0 = none, 1 = right trim.
* bit 17-0: Depend on collation family.
* ---
* INDETERMINATE collation ID binary layout:
@@ -201,7 +231,8 @@ public Collation(
* UTF8_BINARY collation ID binary layout:
* bit 31-24: Zeroes.
* bit 23-22: Zeroes, reserved for version.
- * bit 21-18: Zeroes, reserved for space trimming.
+ * bit 21-19: Zeroes, reserved for future trimmings.
+ * bit 18: 0 = none, 1 = right trim.
* bit 17-3: Zeroes.
* bit 2: 0, reserved for accent sensitivity.
* bit 1: 0, reserved for uppercase and case-insensitive.
@@ -212,7 +243,8 @@ public Collation(
* bit 29: 1
* bit 28-24: Zeroes.
* bit 23-22: Zeroes, reserved for version.
- * bit 21-18: Zeroes, reserved for space trimming.
+ * bit 21-18: Reserved for space trimming.
+ * 0000 = none, 0001 = right trim. Bits 21-19 remain reserved and fixed to 0.
* bit 17: 0 = case-sensitive, 1 = case-insensitive.
* bit 16: 0 = accent-sensitive, 1 = accent-insensitive.
* bit 15-14: Zeroes, reserved for punctuation sensitivity.
@@ -220,14 +252,20 @@ public Collation(
* bit 11-0: Locale ID as specified in `ICULocaleToId` mapping.
* ---
* Some illustrative examples of collation name to ID mapping:
- * - UTF8_BINARY -> 0
- * - UTF8_LCASE -> 1
- * - UNICODE -> 0x20000000
- * - UNICODE_AI -> 0x20010000
- * - UNICODE_CI -> 0x20020000
- * - UNICODE_CI_AI -> 0x20030000
- * - af -> 0x20000001
- * - af_CI_AI -> 0x20030001
+ * - UTF8_BINARY -> 0
+ * - UTF8_BINARY_RTRIM -> 0x00040000
+ * - UTF8_LCASE -> 1
+ * - UTF8_LCASE_RTRIM -> 0x00040001
+ * - UNICODE -> 0x20000000
+ * - UNICODE_AI -> 0x20010000
+ * - UNICODE_CI -> 0x20020000
+ * - UNICODE_RTRIM -> 0x20040000
+ * - UNICODE_CI_AI -> 0x20030000
+ * - UNICODE_CI_RTRIM -> 0x20060000
+ * - UNICODE_AI_RTRIM -> 0x20050000
+ * - UNICODE_CI_AI_RTRIM-> 0x20070000
+ * - af -> 0x20000001
+ * - af_CI_AI -> 0x20030001
*/
private abstract static class CollationSpec {
@@ -246,6 +284,14 @@ protected enum ImplementationProvider {
UTF8_BINARY, ICU
}
+ /**
+ * Bit 18 in collation ID having value 0 for none and 1 for right trimming.
+ * Bits 21, 20, 19 remained reserved (and fixed to 0) for future use.
+ */
+ protected enum SpaceTrimming {
+ NONE, RTRIM
+ }
+
/**
* Offset in binary collation ID layout.
*/
@@ -266,6 +312,17 @@ protected enum ImplementationProvider {
*/
protected static final int IMPLEMENTATION_PROVIDER_MASK = 0b1;
+
+ /**
+ * Offset in binary collation ID layout.
+ */
+ protected static final int SPACE_TRIMMING_OFFSET = 18;
+
+ /**
+ * Bitmask corresponding to width in bits in binary collation ID layout.
+ */
+ protected static final int SPACE_TRIMMING_MASK = 0b1;
+
private static final int INDETERMINATE_COLLATION_ID = -1;
/**
@@ -290,6 +347,45 @@ private static DefinitionOrigin getDefinitionOrigin(int collationId) {
DEFINITION_ORIGIN_OFFSET, DEFINITION_ORIGIN_MASK)];
}
+ /**
+ * Utility function to retrieve `SpaceTrimming` enum instance from collation ID.
+ */
+ protected static SpaceTrimming getSpaceTrimming(int collationId) {
+ return SpaceTrimming.values()[SpecifierUtils.getSpecValue(collationId,
+ SPACE_TRIMMING_OFFSET, SPACE_TRIMMING_MASK)];
+ }
+
+ protected static UTF8String applyTrimmingPolicy(UTF8String s, int collationId) {
+ return applyTrimmingPolicy(s, getSpaceTrimming(collationId));
+ }
+
+ /**
+ * Returns if leading/trailing spaces should be ignored in trim string expressions. This is
+ * needed because space trimming collation directly changes behaviour of trim functions.
+ */
+ protected static boolean ignoresSpacesInTrimFunctions(
+ int collationId,
+ boolean isLTrim,
+ boolean isRTrim) {
+ if (isRTrim && getSpaceTrimming(collationId) == SpaceTrimming.RTRIM) {
+ return true;
+ }
+
+ // In case of adding new trimming collations in the future (LTRIM and TRIM) here logic
+ // should be added.
+ return false;
+ }
+
+ /**
+ * Utility function to trim spaces when collation uses space trimming.
+ */
+ protected static UTF8String applyTrimmingPolicy(UTF8String s, SpaceTrimming spaceTrimming) {
+ if(spaceTrimming == SpaceTrimming.RTRIM){
+ return s.trimRight();
+ }
+ return s; // No trimming.
+ }
+
/**
* Main entry point for retrieving `Collation` instance from collation ID.
*/
@@ -342,6 +438,25 @@ private static int collationNameToId(String collationName) throws SparkException
}
protected abstract Collation buildCollation();
+
+ protected abstract CollationMeta buildCollationMeta();
+
+ protected abstract String normalizedCollationName();
+
+ static List listCollations() {
+ return Stream.concat(
+ CollationSpecUTF8.listCollations().stream(),
+ CollationSpecICU.listCollations().stream()).toList();
+ }
+
+ static CollationMeta loadCollationMeta(CollationIdentifier collationIdentifier) {
+ CollationMeta collationSpecUTF8 =
+ CollationSpecUTF8.loadCollationMeta(collationIdentifier);
+ if (collationSpecUTF8 == null) {
+ return CollationSpecICU.loadCollationMeta(collationIdentifier);
+ }
+ return collationSpecUTF8;
+ }
}
private static class CollationSpecUTF8 extends CollationSpec {
@@ -364,68 +479,227 @@ private enum CaseSensitivity {
*/
private static final int CASE_SENSITIVITY_MASK = 0b1;
+ private static final String UTF8_BINARY_COLLATION_NAME = "UTF8_BINARY";
+ private static final String UTF8_LCASE_COLLATION_NAME = "UTF8_LCASE";
+
private static final int UTF8_BINARY_COLLATION_ID =
- new CollationSpecUTF8(CaseSensitivity.UNSPECIFIED).collationId;
+ new CollationSpecUTF8(CaseSensitivity.UNSPECIFIED, SpaceTrimming.NONE).collationId;
private static final int UTF8_LCASE_COLLATION_ID =
- new CollationSpecUTF8(CaseSensitivity.LCASE).collationId;
+ new CollationSpecUTF8(CaseSensitivity.LCASE, SpaceTrimming.NONE).collationId;
protected static Collation UTF8_BINARY_COLLATION =
- new CollationSpecUTF8(CaseSensitivity.UNSPECIFIED).buildCollation();
+ new CollationSpecUTF8(CaseSensitivity.UNSPECIFIED, SpaceTrimming.NONE).buildCollation();
protected static Collation UTF8_LCASE_COLLATION =
- new CollationSpecUTF8(CaseSensitivity.LCASE).buildCollation();
+ new CollationSpecUTF8(CaseSensitivity.LCASE, SpaceTrimming.NONE).buildCollation();
+ private final CaseSensitivity caseSensitivity;
+ private final SpaceTrimming spaceTrimming;
private final int collationId;
- private CollationSpecUTF8(CaseSensitivity caseSensitivity) {
- this.collationId =
+ private CollationSpecUTF8(
+ CaseSensitivity caseSensitivity,
+ SpaceTrimming spaceTrimming) {
+ this.caseSensitivity = caseSensitivity;
+ this.spaceTrimming = spaceTrimming;
+
+ int collationId =
SpecifierUtils.setSpecValue(0, CASE_SENSITIVITY_OFFSET, caseSensitivity);
+ this.collationId =
+ SpecifierUtils.setSpecValue(collationId, SPACE_TRIMMING_OFFSET, spaceTrimming);
}
private static int collationNameToId(String originalName, String collationName)
throws SparkException {
- if (UTF8_BINARY_COLLATION.collationName.equals(collationName)) {
- return UTF8_BINARY_COLLATION_ID;
- } else if (UTF8_LCASE_COLLATION.collationName.equals(collationName)) {
- return UTF8_LCASE_COLLATION_ID;
+
+ int baseId;
+ String collationNamePrefix;
+
+ if (collationName.startsWith(UTF8_BINARY_COLLATION.collationName)) {
+ baseId = UTF8_BINARY_COLLATION_ID;
+ collationNamePrefix = UTF8_BINARY_COLLATION.collationName;
+ } else if (collationName.startsWith(UTF8_LCASE_COLLATION.collationName)) {
+ baseId = UTF8_LCASE_COLLATION_ID;
+ collationNamePrefix = UTF8_LCASE_COLLATION.collationName;
} else {
// Throw exception with original (before case conversion) collation name.
throw collationInvalidNameException(originalName);
}
+
+ String remainingSpecifiers = collationName.substring(collationNamePrefix.length());
+ if(remainingSpecifiers.isEmpty()) {
+ return baseId;
+ }
+ if(!remainingSpecifiers.startsWith("_")){
+ throw collationInvalidNameException(originalName);
+ }
+
+ SpaceTrimming spaceTrimming = SpaceTrimming.NONE;
+ String remainingSpec = remainingSpecifiers.substring(1);
+ if (remainingSpec.equals("RTRIM")) {
+ spaceTrimming = SpaceTrimming.RTRIM;
+ } else {
+ throw collationInvalidNameException(originalName);
+ }
+
+ return SpecifierUtils.setSpecValue(baseId, SPACE_TRIMMING_OFFSET, spaceTrimming);
}
private static CollationSpecUTF8 fromCollationId(int collationId) {
// Extract case sensitivity from collation ID.
int caseConversionOrdinal = SpecifierUtils.getSpecValue(collationId,
CASE_SENSITIVITY_OFFSET, CASE_SENSITIVITY_MASK);
- // Verify only case sensitivity bits were set settable in UTF8_BINARY family of collations.
- assert (SpecifierUtils.removeSpec(collationId,
- CASE_SENSITIVITY_OFFSET, CASE_SENSITIVITY_MASK) == 0);
- return new CollationSpecUTF8(CaseSensitivity.values()[caseConversionOrdinal]);
+ // Extract space trimming from collation ID.
+ int spaceTrimmingOrdinal = getSpaceTrimming(collationId).ordinal();
+ assert(isValidCollationId(collationId));
+ return new CollationSpecUTF8(
+ CaseSensitivity.values()[caseConversionOrdinal],
+ SpaceTrimming.values()[spaceTrimmingOrdinal]);
+ }
+
+ private static boolean isValidCollationId(int collationId) {
+ collationId = SpecifierUtils.removeSpec(
+ collationId,
+ SPACE_TRIMMING_OFFSET,
+ SPACE_TRIMMING_MASK);
+ collationId = SpecifierUtils.removeSpec(
+ collationId,
+ CASE_SENSITIVITY_OFFSET,
+ CASE_SENSITIVITY_MASK);
+ return collationId == 0;
}
@Override
protected Collation buildCollation() {
- if (collationId == UTF8_BINARY_COLLATION_ID) {
+ if (caseSensitivity == CaseSensitivity.UNSPECIFIED) {
+ Comparator comparator;
+ ToLongFunction hashFunction;
+ BiFunction equalsFunction;
+ boolean supportsSpaceTrimming = spaceTrimming != SpaceTrimming.NONE;
+
+ if (spaceTrimming == SpaceTrimming.NONE) {
+ comparator = UTF8String::binaryCompare;
+ hashFunction = s -> (long) s.hashCode();
+ equalsFunction = UTF8String::equals;
+ } else {
+ comparator = (s1, s2) -> applyTrimmingPolicy(s1, spaceTrimming).binaryCompare(
+ applyTrimmingPolicy(s2, spaceTrimming));
+ hashFunction = s -> (long) applyTrimmingPolicy(s, spaceTrimming).hashCode();
+ equalsFunction = (s1, s2) -> applyTrimmingPolicy(s1, spaceTrimming).equals(
+ applyTrimmingPolicy(s2, spaceTrimming));
+ }
+
return new Collation(
- "UTF8_BINARY",
+ normalizedCollationName(),
PROVIDER_SPARK,
null,
- UTF8String::binaryCompare,
- "1.0",
- s -> (long) s.hashCode(),
- /* supportsBinaryEquality = */ true,
- /* supportsBinaryOrdering = */ true,
- /* supportsLowercaseEquality = */ false);
+ comparator,
+ CollationSpecICU.ICU_VERSION,
+ hashFunction,
+ equalsFunction,
+ /* isUtf8BinaryType = */ true,
+ /* isUtf8LcaseType = */ false,
+ spaceTrimming != SpaceTrimming.NONE);
} else {
+ Comparator comparator;
+ ToLongFunction hashFunction;
+
+ if (spaceTrimming == SpaceTrimming.NONE) {
+ comparator = CollationAwareUTF8String::compareLowerCase;
+ hashFunction = s ->
+ (long) CollationAwareUTF8String.lowerCaseCodePoints(s).hashCode();
+ } else {
+ comparator = (s1, s2) -> CollationAwareUTF8String.compareLowerCase(
+ applyTrimmingPolicy(s1, spaceTrimming),
+ applyTrimmingPolicy(s2, spaceTrimming));
+ hashFunction = s -> (long) CollationAwareUTF8String.lowerCaseCodePoints(
+ applyTrimmingPolicy(s, spaceTrimming)).hashCode();
+ }
+
return new Collation(
- "UTF8_LCASE",
+ normalizedCollationName(),
PROVIDER_SPARK,
null,
- CollationAwareUTF8String::compareLowerCase,
- "1.0",
- s -> (long) CollationAwareUTF8String.lowerCaseCodePoints(s).hashCode(),
- /* supportsBinaryEquality = */ false,
- /* supportsBinaryOrdering = */ false,
- /* supportsLowercaseEquality = */ true);
+ comparator,
+ CollationSpecICU.ICU_VERSION,
+ hashFunction,
+ (s1, s2) -> comparator.compare(s1, s2) == 0,
+ /* isUtf8BinaryType = */ false,
+ /* isUtf8LcaseType = */ true,
+ spaceTrimming != SpaceTrimming.NONE);
+ }
+ }
+
+ @Override
+ protected CollationMeta buildCollationMeta() {
+ if (caseSensitivity == CaseSensitivity.UNSPECIFIED) {
+ return new CollationMeta(
+ CATALOG,
+ SCHEMA,
+ normalizedCollationName(),
+ /* language = */ null,
+ /* country = */ null,
+ /* icuVersion = */ null,
+ COLLATION_PAD_ATTRIBUTE,
+ /* accentSensitivity = */ true,
+ /* caseSensitivity = */ true,
+ spaceTrimming.toString());
+ } else {
+ return new CollationMeta(
+ CATALOG,
+ SCHEMA,
+ normalizedCollationName(),
+ /* language = */ null,
+ /* country = */ null,
+ /* icuVersion = */ null,
+ COLLATION_PAD_ATTRIBUTE,
+ /* accentSensitivity = */ true,
+ /* caseSensitivity = */ false,
+ spaceTrimming.toString());
+ }
+ }
+
+ /**
+ * Compute normalized collation name. Components of collation name are given in order:
+ * - Base collation name (UTF8_BINARY or UTF8_LCASE)
+ * - Optional space trimming when non-default preceded by underscore
+ * Examples: UTF8_BINARY, UTF8_LCASE, UTF8_BINARY_RTRIM, UTF8_LCASE_RTRIM.
+ */
+ @Override
+ protected String normalizedCollationName() {
+ StringBuilder builder = new StringBuilder();
+ if(caseSensitivity == CaseSensitivity.UNSPECIFIED){
+ builder.append(UTF8_BINARY_COLLATION_NAME);
+ } else{
+ builder.append(UTF8_LCASE_COLLATION_NAME);
+ }
+ if (spaceTrimming != SpaceTrimming.NONE) {
+ builder.append('_');
+ builder.append(spaceTrimming.toString());
+ }
+ return builder.toString();
+ }
+
+ static List listCollations() {
+ CollationIdentifier UTF8_BINARY_COLLATION_IDENT = new CollationIdentifier(
+ PROVIDER_SPARK,
+ UTF8_BINARY_COLLATION_NAME,
+ CollationSpecICU.ICU_VERSION
+ );
+ CollationIdentifier UTF8_LCASE_COLLATION_IDENT = new CollationIdentifier(
+ PROVIDER_SPARK,
+ UTF8_LCASE_COLLATION_NAME,
+ CollationSpecICU.ICU_VERSION
+ );
+ return Arrays.asList(UTF8_BINARY_COLLATION_IDENT, UTF8_LCASE_COLLATION_IDENT);
+ }
+
+ static CollationMeta loadCollationMeta(CollationIdentifier collationIdentifier) {
+ try {
+ int collationId = CollationSpecUTF8.collationNameToId(
+ collationIdentifier.name, collationIdentifier.name.toUpperCase());
+ return CollationSpecUTF8.fromCollationId(collationId).buildCollationMeta();
+ } catch (SparkException ignored) {
+ // ignore
+ return null;
}
}
}
@@ -489,9 +763,11 @@ private enum AccentSensitivity {
private static final Map ICULocaleToId = new HashMap<>();
/**
- * ICU library Collator version passed to `Collation` instance.
+ * ICU library version.
*/
- private static final String ICU_COLLATOR_VERSION = "153.120.0.0";
+ private static final String ICU_VERSION = String.format("%d.%d",
+ VersionInfo.ICU_VERSION.getMajor(),
+ VersionInfo.ICU_VERSION.getMinor());
static {
ICULocaleMap.put("UNICODE", ULocale.ROOT);
@@ -541,21 +817,33 @@ private enum AccentSensitivity {
}
}
- private static final int UNICODE_COLLATION_ID =
- new CollationSpecICU("UNICODE", CaseSensitivity.CS, AccentSensitivity.AS).collationId;
- private static final int UNICODE_CI_COLLATION_ID =
- new CollationSpecICU("UNICODE", CaseSensitivity.CI, AccentSensitivity.AS).collationId;
+ private static final int UNICODE_COLLATION_ID = new CollationSpecICU(
+ "UNICODE",
+ CaseSensitivity.CS,
+ AccentSensitivity.AS,
+ SpaceTrimming.NONE).collationId;
+
+ private static final int UNICODE_CI_COLLATION_ID = new CollationSpecICU(
+ "UNICODE",
+ CaseSensitivity.CI,
+ AccentSensitivity.AS,
+ SpaceTrimming.NONE).collationId;
private final CaseSensitivity caseSensitivity;
private final AccentSensitivity accentSensitivity;
+ private final SpaceTrimming spaceTrimming;
private final String locale;
private final int collationId;
- private CollationSpecICU(String locale, CaseSensitivity caseSensitivity,
- AccentSensitivity accentSensitivity) {
+ private CollationSpecICU(
+ String locale,
+ CaseSensitivity caseSensitivity,
+ AccentSensitivity accentSensitivity,
+ SpaceTrimming spaceTrimming) {
this.locale = locale;
this.caseSensitivity = caseSensitivity;
this.accentSensitivity = accentSensitivity;
+ this.spaceTrimming = spaceTrimming;
// Construct collation ID from locale, case-sensitivity and accent-sensitivity specifiers.
int collationId = ICULocaleToId.get(locale);
// Mandatory ICU implementation provider.
@@ -565,6 +853,8 @@ private CollationSpecICU(String locale, CaseSensitivity caseSensitivity,
caseSensitivity);
collationId = SpecifierUtils.setSpecValue(collationId, ACCENT_SENSITIVITY_OFFSET,
accentSensitivity);
+ collationId = SpecifierUtils.setSpecValue(collationId, SPACE_TRIMMING_OFFSET,
+ spaceTrimming);
this.collationId = collationId;
}
@@ -582,58 +872,86 @@ private static int collationNameToId(
}
if (lastPos == -1) {
throw collationInvalidNameException(originalName);
- } else {
- String locale = collationName.substring(0, lastPos);
- int collationId = ICULocaleToId.get(ICULocaleMapUppercase.get(locale));
-
- // Try all combinations of AS/AI and CS/CI.
- CaseSensitivity caseSensitivity;
- AccentSensitivity accentSensitivity;
- if (collationName.equals(locale) ||
- collationName.equals(locale + "_AS") ||
- collationName.equals(locale + "_CS") ||
- collationName.equals(locale + "_AS_CS") ||
- collationName.equals(locale + "_CS_AS")
- ) {
- caseSensitivity = CaseSensitivity.CS;
- accentSensitivity = AccentSensitivity.AS;
- } else if (collationName.equals(locale + "_CI") ||
- collationName.equals(locale + "_AS_CI") ||
- collationName.equals(locale + "_CI_AS")) {
- caseSensitivity = CaseSensitivity.CI;
- accentSensitivity = AccentSensitivity.AS;
- } else if (collationName.equals(locale + "_AI") ||
- collationName.equals(locale + "_CS_AI") ||
- collationName.equals(locale + "_AI_CS")) {
- caseSensitivity = CaseSensitivity.CS;
- accentSensitivity = AccentSensitivity.AI;
- } else if (collationName.equals(locale + "_AI_CI") ||
- collationName.equals(locale + "_CI_AI")) {
- caseSensitivity = CaseSensitivity.CI;
- accentSensitivity = AccentSensitivity.AI;
- } else {
- throw collationInvalidNameException(originalName);
- }
+ }
+ String locale = collationName.substring(0, lastPos);
+ int collationId = ICULocaleToId.get(ICULocaleMapUppercase.get(locale));
+ collationId = SpecifierUtils.setSpecValue(collationId,
+ IMPLEMENTATION_PROVIDER_OFFSET, ImplementationProvider.ICU);
- // Build collation ID from computed specifiers.
- collationId = SpecifierUtils.setSpecValue(collationId,
- IMPLEMENTATION_PROVIDER_OFFSET, ImplementationProvider.ICU);
- collationId = SpecifierUtils.setSpecValue(collationId,
- CASE_SENSITIVITY_OFFSET, caseSensitivity);
- collationId = SpecifierUtils.setSpecValue(collationId,
- ACCENT_SENSITIVITY_OFFSET, accentSensitivity);
+ // No other specifiers present.
+ if(collationName.equals(locale)){
return collationId;
}
+ if(collationName.charAt(locale.length()) != '_'){
+ throw collationInvalidNameException(originalName);
+ }
+ // Extract remaining specifiers and trim "_" separator.
+ String remainingSpecifiers = collationName.substring(lastPos + 1);
+
+ // Initialize default specifier flags.
+ // Case sensitive, accent sensitive, no space trimming.
+ boolean isCaseSpecifierSet = false;
+ boolean isAccentSpecifierSet = false;
+ boolean isSpaceTrimmingSpecifierSet = false;
+ CaseSensitivity caseSensitivity = CaseSensitivity.CS;
+ AccentSensitivity accentSensitivity = AccentSensitivity.AS;
+ SpaceTrimming spaceTrimming = SpaceTrimming.NONE;
+
+ String[] specifiers = remainingSpecifiers.split("_");
+
+ // Iterate through specifiers and set corresponding flags
+ for (String specifier : specifiers) {
+ switch (specifier) {
+ case "CI":
+ case "CS":
+ if (isCaseSpecifierSet) {
+ throw collationInvalidNameException(originalName);
+ }
+ caseSensitivity = CaseSensitivity.valueOf(specifier);
+ isCaseSpecifierSet = true;
+ break;
+ case "AI":
+ case "AS":
+ if (isAccentSpecifierSet) {
+ throw collationInvalidNameException(originalName);
+ }
+ accentSensitivity = AccentSensitivity.valueOf(specifier);
+ isAccentSpecifierSet = true;
+ break;
+ case "RTRIM":
+ if (isSpaceTrimmingSpecifierSet) {
+ throw collationInvalidNameException(originalName);
+ }
+ spaceTrimming = SpaceTrimming.valueOf(specifier);
+ isSpaceTrimmingSpecifierSet = true;
+ break;
+ default:
+ throw collationInvalidNameException(originalName);
+ }
+ }
+
+ // Build collation ID from computed specifiers.
+ collationId = SpecifierUtils.setSpecValue(collationId,
+ CASE_SENSITIVITY_OFFSET, caseSensitivity);
+ collationId = SpecifierUtils.setSpecValue(collationId,
+ ACCENT_SENSITIVITY_OFFSET, accentSensitivity);
+ collationId = SpecifierUtils.setSpecValue(collationId,
+ SPACE_TRIMMING_OFFSET, spaceTrimming);
+ return collationId;
}
private static CollationSpecICU fromCollationId(int collationId) {
// Parse specifiers from collation ID.
+ int spaceTrimmingOrdinal = SpecifierUtils.getSpecValue(collationId,
+ SPACE_TRIMMING_OFFSET, SPACE_TRIMMING_MASK);
int caseSensitivityOrdinal = SpecifierUtils.getSpecValue(collationId,
CASE_SENSITIVITY_OFFSET, CASE_SENSITIVITY_MASK);
int accentSensitivityOrdinal = SpecifierUtils.getSpecValue(collationId,
ACCENT_SENSITIVITY_OFFSET, ACCENT_SENSITIVITY_MASK);
collationId = SpecifierUtils.removeSpec(collationId,
IMPLEMENTATION_PROVIDER_OFFSET, IMPLEMENTATION_PROVIDER_MASK);
+ collationId = SpecifierUtils.removeSpec(collationId,
+ SPACE_TRIMMING_OFFSET, SPACE_TRIMMING_MASK);
collationId = SpecifierUtils.removeSpec(collationId,
CASE_SENSITIVITY_OFFSET, CASE_SENSITIVITY_MASK);
collationId = SpecifierUtils.removeSpec(collationId,
@@ -644,8 +962,9 @@ private static CollationSpecICU fromCollationId(int collationId) {
assert(localeId >= 0 && localeId < ICULocaleNames.length);
CaseSensitivity caseSensitivity = CaseSensitivity.values()[caseSensitivityOrdinal];
AccentSensitivity accentSensitivity = AccentSensitivity.values()[accentSensitivityOrdinal];
+ SpaceTrimming spaceTrimming = SpaceTrimming.values()[spaceTrimmingOrdinal];
String locale = ICULocaleNames[localeId];
- return new CollationSpecICU(locale, caseSensitivity, accentSensitivity);
+ return new CollationSpecICU(locale, caseSensitivity, accentSensitivity, spaceTrimming);
}
@Override
@@ -672,16 +991,51 @@ protected Collation buildCollation() {
Collator collator = Collator.getInstance(resultLocale);
// Freeze ICU collator to ensure thread safety.
collator.freeze();
+
+ Comparator comparator;
+ ToLongFunction hashFunction;
+
+ if (spaceTrimming == SpaceTrimming.NONE) {
+ hashFunction = s -> (long) collator.getCollationKey(
+ s.toValidString()).hashCode();
+ comparator = (s1, s2) ->
+ collator.compare(s1.toValidString(), s2.toValidString());
+ } else {
+ comparator = (s1, s2) -> collator.compare(
+ applyTrimmingPolicy(s1, spaceTrimming).toValidString(),
+ applyTrimmingPolicy(s2, spaceTrimming).toValidString());
+ hashFunction = s -> (long) collator.getCollationKey(
+ applyTrimmingPolicy(s, spaceTrimming).toValidString()).hashCode();
+ }
+
return new Collation(
- collationName(),
+ normalizedCollationName(),
PROVIDER_ICU,
collator,
- (s1, s2) -> collator.compare(s1.toString(), s2.toString()),
- ICU_COLLATOR_VERSION,
- s -> (long) collator.getCollationKey(s.toString()).hashCode(),
- /* supportsBinaryEquality = */ false,
- /* supportsBinaryOrdering = */ false,
- /* supportsLowercaseEquality = */ false);
+ comparator,
+ ICU_VERSION,
+ hashFunction,
+ (s1, s2) -> comparator.compare(s1, s2) == 0,
+ /* isUtf8BinaryType = */ false,
+ /* isUtf8LcaseType = */ false,
+ spaceTrimming != SpaceTrimming.NONE);
+ }
+
+ @Override
+ protected CollationMeta buildCollationMeta() {
+ String language = ICULocaleMap.get(locale).getDisplayLanguage();
+ String country = ICULocaleMap.get(locale).getDisplayCountry();
+ return new CollationMeta(
+ CATALOG,
+ SCHEMA,
+ normalizedCollationName(),
+ language.isEmpty() ? null : language,
+ country.isEmpty() ? null : country,
+ VersionInfo.ICU_VERSION.toString(),
+ COLLATION_PAD_ATTRIBUTE,
+ accentSensitivity == AccentSensitivity.AS,
+ caseSensitivity == CaseSensitivity.CS,
+ spaceTrimming.toString());
}
/**
@@ -689,9 +1043,11 @@ protected Collation buildCollation() {
* - Locale name
* - Optional case sensitivity when non-default preceded by underscore
* - Optional accent sensitivity when non-default preceded by underscore
- * Examples: en, en_USA_CI_AI, sr_Cyrl_SRB_AI.
+ * - Optional space trimming when non-default preceded by underscore
+ * Examples: en, en_USA_CI_RTRIM, en_USA_CI_AI, en_USA_CI_AI_RTRIM, sr_Cyrl_SRB_AI.
*/
- private String collationName() {
+ @Override
+ protected String normalizedCollationName() {
StringBuilder builder = new StringBuilder();
builder.append(locale);
if (caseSensitivity != CaseSensitivity.CS) {
@@ -702,8 +1058,40 @@ private String collationName() {
builder.append('_');
builder.append(accentSensitivity.toString());
}
+ if(spaceTrimming != SpaceTrimming.NONE) {
+ builder.append('_');
+ builder.append(spaceTrimming.toString());
+ }
return builder.toString();
}
+
+ private static List allCollationNames() {
+ List collationNames = new ArrayList<>();
+ List caseAccentSpecifiers = Arrays.asList("", "_AI", "_CI", "_CI_AI");
+ for (String locale : ICULocaleToId.keySet()) {
+ for (String caseAccent : caseAccentSpecifiers) {
+ String collationName = locale + caseAccent;
+ collationNames.add(collationName);
+ }
+ }
+ return collationNames.stream().sorted().toList();
+ }
+
+ static List listCollations() {
+ return allCollationNames().stream().map(name ->
+ new CollationIdentifier(PROVIDER_ICU, name, VersionInfo.ICU_VERSION.toString())).toList();
+ }
+
+ static CollationMeta loadCollationMeta(CollationIdentifier collationIdentifier) {
+ try {
+ int collationId = CollationSpecICU.collationNameToId(
+ collationIdentifier.name, collationIdentifier.name.toUpperCase());
+ return CollationSpecICU.fromCollationId(collationId).buildCollationMeta();
+ } catch (SparkException ignored) {
+ // ignore
+ return null;
+ }
+ }
}
/**
@@ -730,9 +1118,12 @@ public CollationIdentifier identifier() {
}
}
+ public static final String CATALOG = "SYSTEM";
+ public static final String SCHEMA = "BUILTIN";
public static final String PROVIDER_SPARK = "spark";
public static final String PROVIDER_ICU = "icu";
public static final List SUPPORTED_PROVIDERS = List.of(PROVIDER_SPARK, PROVIDER_ICU);
+ public static final String COLLATION_PAD_ATTRIBUTE = "NO_PAD";
public static final int UTF8_BINARY_COLLATION_ID =
Collation.CollationSpecUTF8.UTF8_BINARY_COLLATION_ID;
@@ -749,12 +1140,15 @@ public CollationIdentifier identifier() {
* Returns a StringSearch object for the given pattern and target strings, under collation
* rules corresponding to the given collationId. The external ICU library StringSearch object can
* be used to find occurrences of the pattern in the target string, while respecting collation.
+ * When given invalid UTF8Strings, the method will first convert them to valid strings, and then
+ * instantiate the StringSearch object. However, original UTF8Strings will remain unchanged.
*/
public static StringSearch getStringSearch(
final UTF8String targetUTF8String,
final UTF8String patternUTF8String,
final int collationId) {
- return getStringSearch(targetUTF8String.toString(), patternUTF8String.toString(), collationId);
+ return getStringSearch(targetUTF8String.toValidString(), patternUTF8String.toValidString(),
+ collationId);
}
/**
@@ -763,9 +1157,9 @@ public static StringSearch getStringSearch(
* be used to find occurrences of the pattern in the target string, while respecting collation.
*/
public static StringSearch getStringSearch(
- final String targetString,
- final String patternString,
- final int collationId) {
+ final String targetString,
+ final String patternString,
+ final int collationId) {
CharacterIterator target = new StringCharacterIterator(targetString);
Collator collator = CollationFactory.fetchCollation(collationId).collator;
return new StringSearch(patternString, target, (RuleBasedCollator) collator);
@@ -775,11 +1169,13 @@ public static StringSearch getStringSearch(
* Returns a collation-unaware StringSearch object for the given pattern and target strings.
* While this object does not respect collation, it can be used to find occurrences of the pattern
* in the target string for UTF8_BINARY or UTF8_LCASE (if arguments are lowercased).
+ * When given invalid UTF8Strings, the method will first convert them to valid strings, and then
+ * instantiate the StringSearch object. However, original UTF8Strings will remain unchanged.
*/
public static StringSearch getStringSearch(
- final UTF8String targetUTF8String,
- final UTF8String patternUTF8String) {
- return new StringSearch(patternUTF8String.toString(), targetUTF8String.toString());
+ final UTF8String targetUTF8String,
+ final UTF8String patternUTF8String) {
+ return new StringSearch(patternUTF8String.toValidString(), targetUTF8String.toValidString());
}
/**
@@ -789,6 +1185,16 @@ public static int collationNameToId(String collationName) throws SparkException
return Collation.CollationSpec.collationNameToId(collationName);
}
+ public static boolean isCaseInsensitive(int collationId) {
+ return Collation.CollationSpecICU.fromCollationId(collationId).caseSensitivity ==
+ Collation.CollationSpecICU.CaseSensitivity.CI;
+ }
+
+ public static boolean isAccentInsensitive(int collationId) {
+ return Collation.CollationSpecICU.fromCollationId(collationId).accentSensitivity ==
+ Collation.CollationSpecICU.AccentSensitivity.AI;
+ }
+
public static void assertValidProvider(String provider) throws SparkException {
if (!SUPPORTED_PROVIDERS.contains(provider.toLowerCase())) {
Map params = Map.of(
@@ -813,26 +1219,52 @@ public static String[] getICULocaleNames() {
return Collation.CollationSpecICU.ICULocaleNames;
}
+ /**
+ * Applies the trimming policy corresponding to the given trim collation type.
+ */
+ public static UTF8String applyTrimmingPolicy(UTF8String input, int collationId) {
+ return Collation.CollationSpec.applyTrimmingPolicy(input, collationId);
+ }
+
+ /**
+ * Returns if leading/trailing spaces should be ignored in trim string expressions. This is needed
+ * because space trimming collation directly changes behaviour of trim functions.
+ */
+ public static boolean ignoresSpacesInTrimFunctions(
+ int collationId,
+ boolean isLTrim,
+ boolean isRTrim) {
+ return Collation.CollationSpec.ignoresSpacesInTrimFunctions(collationId, isLTrim, isRTrim);
+ }
+
public static UTF8String getCollationKey(UTF8String input, int collationId) {
Collation collation = fetchCollation(collationId);
- if (collation.supportsBinaryEquality) {
+ if (collation.supportsSpaceTrimming) {
+ input = Collation.CollationSpec.applyTrimmingPolicy(input, collationId);
+ }
+ if (collation.isUtf8BinaryType) {
return input;
- } else if (collation.supportsLowercaseEquality) {
- return input.toLowerCase();
+ } else if (collation.isUtf8LcaseType) {
+ return CollationAwareUTF8String.lowerCaseCodePoints(input);
} else {
- CollationKey collationKey = collation.collator.getCollationKey(input.toString());
+ CollationKey collationKey = collation.collator.getCollationKey(
+ input.toValidString());
return UTF8String.fromBytes(collationKey.toByteArray());
}
}
public static byte[] getCollationKeyBytes(UTF8String input, int collationId) {
Collation collation = fetchCollation(collationId);
- if (collation.supportsBinaryEquality) {
+ if (collation.supportsSpaceTrimming) {
+ input = Collation.CollationSpec.applyTrimmingPolicy(input, collationId);
+ }
+ if (collation.isUtf8BinaryType) {
return input.getBytes();
- } else if (collation.supportsLowercaseEquality) {
- return input.toLowerCase().getBytes();
+ } else if (collation.isUtf8LcaseType) {
+ return CollationAwareUTF8String.lowerCaseCodePoints(input).getBytes();
} else {
- return collation.collator.getCollationKey(input.toString()).toByteArray();
+ return collation.collator.getCollationKey(
+ input.toValidString()).toByteArray();
}
}
@@ -848,19 +1280,26 @@ public static String getClosestSuggestionsOnInvalidName(
Collation.CollationSpecUTF8.UTF8_BINARY_COLLATION.collationName,
Collation.CollationSpecUTF8.UTF8_LCASE_COLLATION.collationName
};
- validModifiers = new String[0];
+ validModifiers = new String[]{"_RTRIM"};
} else {
validRootNames = getICULocaleNames();
- validModifiers = new String[]{"_CI", "_AI", "_CS", "_AS"};
+ validModifiers = new String[]{"_CI", "_AI", "_CS", "_AS", "_RTRIM"};
}
// Split modifiers and locale name.
- final int MODIFIER_LENGTH = 3;
+ boolean foundModifier = true;
String localeName = collationName.toUpperCase();
List modifiers = new ArrayList<>();
- while (Arrays.stream(validModifiers).anyMatch(localeName::endsWith)) {
- modifiers.add(localeName.substring(localeName.length() - MODIFIER_LENGTH));
- localeName = localeName.substring(0, localeName.length() - MODIFIER_LENGTH);
+ while (foundModifier) {
+ foundModifier = false;
+ for (String modifier : validModifiers) {
+ if (localeName.endsWith(modifier)) {
+ modifiers.add(modifier);
+ localeName = localeName.substring(0, localeName.length() - modifier.length());
+ foundModifier = true;
+ break;
+ }
+ }
}
// Suggest version with unique modifiers.
@@ -918,4 +1357,12 @@ public static String getClosestSuggestionsOnInvalidName(
return String.join(", ", suggestions);
}
+
+ public static List listCollations() {
+ return Collation.CollationSpec.listCollations();
+ }
+
+ public static CollationMeta loadCollationMeta(CollationIdentifier collationIdentifier) {
+ return Collation.CollationSpec.loadCollationMeta(collationIdentifier);
+ }
}
diff --git a/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationSupport.java b/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationSupport.java
index 453423ddbc33d..135250e482b16 100644
--- a/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationSupport.java
+++ b/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationSupport.java
@@ -20,8 +20,6 @@
import org.apache.spark.unsafe.types.UTF8String;
-import java.util.ArrayList;
-import java.util.List;
import java.util.Map;
import java.util.regex.Pattern;
@@ -37,88 +35,67 @@ public final class CollationSupport {
*/
public static class StringSplitSQL {
- public static UTF8String[] exec(final UTF8String s, final UTF8String d, final int collationId) {
+ public static UTF8String[] exec(final UTF8String s, UTF8String d, final int collationId) {
CollationFactory.Collation collation = CollationFactory.fetchCollation(collationId);
- if (collation.supportsBinaryEquality) {
+ if (collation.supportsSpaceTrimming) {
+ d = CollationFactory.applyTrimmingPolicy(d, collationId);
+ }
+ if (collation.isUtf8BinaryType) {
return execBinary(s, d);
- } else if (collation.supportsLowercaseEquality) {
+ } else if (collation.isUtf8LcaseType) {
return execLowercase(s, d);
} else {
return execICU(s, d, collationId);
}
}
public static String genCode(final String s, final String d, final int collationId) {
- CollationFactory.Collation collation = CollationFactory.fetchCollation(collationId);
String expr = "CollationSupport.StringSplitSQL.exec";
- if (collation.supportsBinaryEquality) {
+ if (collationId == CollationFactory.UTF8_BINARY_COLLATION_ID) {
return String.format(expr + "Binary(%s, %s)", s, d);
- } else if (collation.supportsLowercaseEquality) {
- return String.format(expr + "Lowercase(%s, %s)", s, d);
} else {
- return String.format(expr + "ICU(%s, %s, %d)", s, d, collationId);
+ return String.format(expr + "(%s, %s, %d)", s, d, collationId);
}
}
public static UTF8String[] execBinary(final UTF8String string, final UTF8String delimiter) {
return string.splitSQL(delimiter, -1);
}
public static UTF8String[] execLowercase(final UTF8String string, final UTF8String delimiter) {
- if (delimiter.numBytes() == 0) return new UTF8String[] { string };
- if (string.numBytes() == 0) return new UTF8String[] { UTF8String.EMPTY_UTF8 };
- Pattern pattern = Pattern.compile(Pattern.quote(delimiter.toString()),
- CollationSupport.lowercaseRegexFlags);
- String[] splits = pattern.split(string.toString(), -1);
- UTF8String[] res = new UTF8String[splits.length];
- for (int i = 0; i < res.length; i++) {
- res[i] = UTF8String.fromString(splits[i]);
- }
- return res;
+ return CollationAwareUTF8String.lowercaseSplitSQL(string, delimiter, -1);
}
public static UTF8String[] execICU(final UTF8String string, final UTF8String delimiter,
final int collationId) {
- if (delimiter.numBytes() == 0) return new UTF8String[] { string };
- if (string.numBytes() == 0) return new UTF8String[] { UTF8String.EMPTY_UTF8 };
- List strings = new ArrayList<>();
- String target = string.toString(), pattern = delimiter.toString();
- StringSearch stringSearch = CollationFactory.getStringSearch(target, pattern, collationId);
- int start = 0, end;
- while ((end = stringSearch.next()) != StringSearch.DONE) {
- strings.add(UTF8String.fromString(target.substring(start, end)));
- start = end + stringSearch.getMatchLength();
- }
- if (start <= target.length()) {
- strings.add(UTF8String.fromString(target.substring(start)));
- }
- return strings.toArray(new UTF8String[0]);
+ return CollationAwareUTF8String.icuSplitSQL(string, delimiter, -1, collationId);
}
}
public static class Contains {
- public static boolean exec(final UTF8String l, final UTF8String r, final int collationId) {
+ public static boolean exec(UTF8String l, UTF8String r, final int collationId) {
CollationFactory.Collation collation = CollationFactory.fetchCollation(collationId);
- if (collation.supportsBinaryEquality) {
+ if (collation.supportsSpaceTrimming) {
+ l = CollationFactory.applyTrimmingPolicy(l, collationId);
+ r = CollationFactory.applyTrimmingPolicy(r, collationId);
+ }
+ if (collation.isUtf8BinaryType) {
return execBinary(l, r);
- } else if (collation.supportsLowercaseEquality) {
+ } else if (collation.isUtf8LcaseType) {
return execLowercase(l, r);
} else {
return execICU(l, r, collationId);
}
}
public static String genCode(final String l, final String r, final int collationId) {
- CollationFactory.Collation collation = CollationFactory.fetchCollation(collationId);
String expr = "CollationSupport.Contains.exec";
- if (collation.supportsBinaryEquality) {
+ if (collationId == CollationFactory.UTF8_BINARY_COLLATION_ID) {
return String.format(expr + "Binary(%s, %s)", l, r);
- } else if (collation.supportsLowercaseEquality) {
- return String.format(expr + "Lowercase(%s, %s)", l, r);
} else {
- return String.format(expr + "ICU(%s, %s, %d)", l, r, collationId);
+ return String.format(expr + "(%s, %s, %d)", l, r, collationId);
}
}
public static boolean execBinary(final UTF8String l, final UTF8String r) {
return l.contains(r);
}
public static boolean execLowercase(final UTF8String l, final UTF8String r) {
- return CollationAwareUTF8String.lowercaseIndexOf(l, r, 0) >= 0;
+ return CollationAwareUTF8String.lowercaseContains(l, r);
}
public static boolean execICU(final UTF8String l, final UTF8String r,
final int collationId) {
@@ -130,33 +107,35 @@ public static boolean execICU(final UTF8String l, final UTF8String r,
}
public static class StartsWith {
- public static boolean exec(final UTF8String l, final UTF8String r,
+ public static boolean exec(UTF8String l, UTF8String r,
final int collationId) {
CollationFactory.Collation collation = CollationFactory.fetchCollation(collationId);
- if (collation.supportsBinaryEquality) {
+ if (collation.supportsSpaceTrimming) {
+ l = CollationFactory.applyTrimmingPolicy(l, collationId);
+ r = CollationFactory.applyTrimmingPolicy(r, collationId);
+ }
+
+ if (collation.isUtf8BinaryType) {
return execBinary(l, r);
- } else if (collation.supportsLowercaseEquality) {
+ } else if (collation.isUtf8LcaseType) {
return execLowercase(l, r);
} else {
return execICU(l, r, collationId);
}
}
public static String genCode(final String l, final String r, final int collationId) {
- CollationFactory.Collation collation = CollationFactory.fetchCollation(collationId);
String expr = "CollationSupport.StartsWith.exec";
- if (collation.supportsBinaryEquality) {
+ if (collationId == CollationFactory.UTF8_BINARY_COLLATION_ID) {
return String.format(expr + "Binary(%s, %s)", l, r);
- } else if (collation.supportsLowercaseEquality) {
- return String.format(expr + "Lowercase(%s, %s)", l, r);
} else {
- return String.format(expr + "ICU(%s, %s, %d)", l, r, collationId);
+ return String.format(expr + "(%s, %s, %d)", l, r, collationId);
}
}
public static boolean execBinary(final UTF8String l, final UTF8String r) {
return l.startsWith(r);
}
public static boolean execLowercase(final UTF8String l, final UTF8String r) {
- return CollationAwareUTF8String.lowercaseMatchFrom(l, r.toLowerCase(), 0);
+ return CollationAwareUTF8String.lowercaseStartsWith(l, r);
}
public static boolean execICU(final UTF8String l, final UTF8String r,
final int collationId) {
@@ -168,32 +147,33 @@ public static boolean execICU(final UTF8String l, final UTF8String r,
}
public static class EndsWith {
- public static boolean exec(final UTF8String l, final UTF8String r, final int collationId) {
+ public static boolean exec(UTF8String l, UTF8String r, final int collationId) {
CollationFactory.Collation collation = CollationFactory.fetchCollation(collationId);
- if (collation.supportsBinaryEquality) {
+ if (collation.supportsSpaceTrimming) {
+ l = CollationFactory.applyTrimmingPolicy(l, collationId);
+ r = CollationFactory.applyTrimmingPolicy(r, collationId);
+ }
+ if (collation.isUtf8BinaryType) {
return execBinary(l, r);
- } else if (collation.supportsLowercaseEquality) {
+ } else if (collation.isUtf8LcaseType) {
return execLowercase(l, r);
} else {
return execICU(l, r, collationId);
}
}
public static String genCode(final String l, final String r, final int collationId) {
- CollationFactory.Collation collation = CollationFactory.fetchCollation(collationId);
String expr = "CollationSupport.EndsWith.exec";
- if (collation.supportsBinaryEquality) {
+ if (collationId == CollationFactory.UTF8_BINARY_COLLATION_ID) {
return String.format(expr + "Binary(%s, %s)", l, r);
- } else if (collation.supportsLowercaseEquality) {
- return String.format(expr + "Lowercase(%s, %s)", l, r);
} else {
- return String.format(expr + "ICU(%s, %s, %d)", l, r, collationId);
+ return String.format(expr + "(%s, %s, %d)", l, r, collationId);
}
}
public static boolean execBinary(final UTF8String l, final UTF8String r) {
return l.endsWith(r);
}
public static boolean execLowercase(final UTF8String l, final UTF8String r) {
- return CollationAwareUTF8String.lowercaseMatchUntil(l, r.toLowerCase(), l.numChars());
+ return CollationAwareUTF8String.lowercaseEndsWith(l, r);
}
public static boolean execICU(final UTF8String l, final UTF8String r,
final int collationId) {
@@ -208,9 +188,10 @@ public static boolean execICU(final UTF8String l, final UTF8String r,
public static class Upper {
public static UTF8String exec(final UTF8String v, final int collationId, boolean useICU) {
CollationFactory.Collation collation = CollationFactory.fetchCollation(collationId);
- if (collation.supportsBinaryEquality) {
+ // Space trimming does not affect the output of this expression.
+ if (collation.isUtf8BinaryType) {
return useICU ? execBinaryICU(v) : execBinary(v);
- } else if (collation.supportsLowercaseEquality) {
+ } else if (collation.isUtf8LcaseType) {
return execLowercase(v);
} else {
return execICU(v, collationId);
@@ -219,10 +200,10 @@ public static UTF8String exec(final UTF8String v, final int collationId, boolean
public static String genCode(final String v, final int collationId, boolean useICU) {
CollationFactory.Collation collation = CollationFactory.fetchCollation(collationId);
String expr = "CollationSupport.Upper.exec";
- if (collation.supportsBinaryEquality) {
+ if (collation.isUtf8BinaryType) {
String funcName = useICU ? "BinaryICU" : "Binary";
return String.format(expr + "%s(%s)", funcName, v);
- } else if (collation.supportsLowercaseEquality) {
+ } else if (collation.isUtf8LcaseType) {
return String.format(expr + "Lowercase(%s)", v);
} else {
return String.format(expr + "ICU(%s, %d)", v, collationId);
@@ -245,9 +226,10 @@ public static UTF8String execICU(final UTF8String v, final int collationId) {
public static class Lower {
public static UTF8String exec(final UTF8String v, final int collationId, boolean useICU) {
CollationFactory.Collation collation = CollationFactory.fetchCollation(collationId);
- if (collation.supportsBinaryEquality) {
+ // Space trimming does not affect the output of this expression.
+ if (collation.isUtf8BinaryType) {
return useICU ? execBinaryICU(v) : execBinary(v);
- } else if (collation.supportsLowercaseEquality) {
+ } else if (collation.isUtf8LcaseType) {
return execLowercase(v);
} else {
return execICU(v, collationId);
@@ -256,10 +238,10 @@ public static UTF8String exec(final UTF8String v, final int collationId, boolean
public static String genCode(final String v, final int collationId, boolean useICU) {
CollationFactory.Collation collation = CollationFactory.fetchCollation(collationId);
String expr = "CollationSupport.Lower.exec";
- if (collation.supportsBinaryEquality) {
+ if (collation.isUtf8BinaryType) {
String funcName = useICU ? "BinaryICU" : "Binary";
return String.format(expr + "%s(%s)", funcName, v);
- } else if (collation.supportsLowercaseEquality) {
+ } else if (collation.isUtf8LcaseType) {
return String.format(expr + "Lowercase(%s)", v);
} else {
return String.format(expr + "ICU(%s, %d)", v, collationId);
@@ -282,9 +264,10 @@ public static UTF8String execICU(final UTF8String v, final int collationId) {
public static class InitCap {
public static UTF8String exec(final UTF8String v, final int collationId, boolean useICU) {
CollationFactory.Collation collation = CollationFactory.fetchCollation(collationId);
- if (collation.supportsBinaryEquality) {
+ // Space trimming does not affect the output of this expression.
+ if (collation.isUtf8BinaryType) {
return useICU ? execBinaryICU(v) : execBinary(v);
- } else if (collation.supportsLowercaseEquality) {
+ } else if (collation.isUtf8LcaseType) {
return execLowercase(v);
} else {
return execICU(v, collationId);
@@ -294,10 +277,10 @@ public static UTF8String exec(final UTF8String v, final int collationId, boolean
public static String genCode(final String v, final int collationId, boolean useICU) {
CollationFactory.Collation collation = CollationFactory.fetchCollation(collationId);
String expr = "CollationSupport.InitCap.exec";
- if (collation.supportsBinaryEquality) {
+ if (collation.isUtf8BinaryType) {
String funcName = useICU ? "BinaryICU" : "Binary";
return String.format(expr + "%s(%s)", funcName, v);
- } else if (collation.supportsLowercaseEquality) {
+ } else if (collation.isUtf8LcaseType) {
return String.format(expr + "Lowercase(%s)", v);
} else {
return String.format(expr + "ICU(%s, %d)", v, collationId);
@@ -307,7 +290,7 @@ public static UTF8String execBinary(final UTF8String v) {
return v.toLowerCase().toTitleCase();
}
public static UTF8String execBinaryICU(final UTF8String v) {
- return CollationAwareUTF8String.toLowerCase(v).toTitleCaseICU();
+ return CollationAwareUTF8String.toTitleCaseICU(v);
}
public static UTF8String execLowercase(final UTF8String v) {
return CollationAwareUTF8String.toTitleCase(v);
@@ -319,17 +302,16 @@ public static UTF8String execICU(final UTF8String v, final int collationId) {
public static class FindInSet {
public static int exec(final UTF8String word, final UTF8String set, final int collationId) {
- CollationFactory.Collation collation = CollationFactory.fetchCollation(collationId);
- if (collation.supportsBinaryEquality) {
+ // FindInSet handles space-trimming collations here, as its comparison is space-trimming aware.
+ if (collationId == CollationFactory.UTF8_BINARY_COLLATION_ID) {
return execBinary(word, set);
} else {
return execCollationAware(word, set, collationId);
}
}
public static String genCode(final String word, final String set, final int collationId) {
- CollationFactory.Collation collation = CollationFactory.fetchCollation(collationId);
String expr = "CollationSupport.FindInSet.exec";
- if (collation.supportsBinaryEquality) {
+ if (collationId == CollationFactory.UTF8_BINARY_COLLATION_ID) {
return String.format(expr + "Binary(%s, %s)", word, set);
} else {
return String.format(expr + "CollationAware(%s, %s, %d)", word, set, collationId);
@@ -345,12 +327,15 @@ public static int execCollationAware(final UTF8String word, final UTF8String set
}
public static class StringInstr {
- public static int exec(final UTF8String string, final UTF8String substring,
+ public static int exec(final UTF8String string, UTF8String substring,
final int collationId) {
CollationFactory.Collation collation = CollationFactory.fetchCollation(collationId);
- if (collation.supportsBinaryEquality) {
+ if (collation.supportsSpaceTrimming) {
+ substring = CollationFactory.applyTrimmingPolicy(substring, collationId);
+ }
+ if (collation.isUtf8BinaryType) {
return execBinary(string, substring);
- } else if (collation.supportsLowercaseEquality) {
+ } else if (collation.isUtf8LcaseType) {
return execLowercase(string, substring);
} else {
return execICU(string, substring, collationId);
@@ -358,14 +343,11 @@ public static int exec(final UTF8String string, final UTF8String substring,
}
public static String genCode(final String string, final String substring,
final int collationId) {
- CollationFactory.Collation collation = CollationFactory.fetchCollation(collationId);
String expr = "CollationSupport.StringInstr.exec";
- if (collation.supportsBinaryEquality) {
+ if (collationId == CollationFactory.UTF8_BINARY_COLLATION_ID) {
return String.format(expr + "Binary(%s, %s)", string, substring);
- } else if (collation.supportsLowercaseEquality) {
- return String.format(expr + "Lowercase(%s, %s)", string, substring);
} else {
- return String.format(expr + "ICU(%s, %s, %d)", string, substring, collationId);
+ return String.format(expr + "(%s, %s, %d)", string, substring, collationId);
}
}
public static int execBinary(final UTF8String string, final UTF8String substring) {
@@ -384,9 +366,10 @@ public static class StringReplace {
public static UTF8String exec(final UTF8String src, final UTF8String search,
final UTF8String replace, final int collationId) {
CollationFactory.Collation collation = CollationFactory.fetchCollation(collationId);
- if (collation.supportsBinaryEquality) {
+ // Space trimming does not affect the output of this expression.
+ if (collation.isUtf8BinaryType) {
return execBinary(src, search, replace);
- } else if (collation.supportsLowercaseEquality) {
+ } else if (collation.isUtf8LcaseType) {
return execLowercase(src, search, replace);
} else {
return execICU(src, search, replace, collationId);
@@ -396,9 +379,9 @@ public static String genCode(final String src, final String search, final String
final int collationId) {
CollationFactory.Collation collation = CollationFactory.fetchCollation(collationId);
String expr = "CollationSupport.StringReplace.exec";
- if (collation.supportsBinaryEquality) {
+ if (collation.isUtf8BinaryType) {
return String.format(expr + "Binary(%s, %s, %s)", src, search, replace);
- } else if (collation.supportsLowercaseEquality) {
+ } else if (collation.isUtf8LcaseType) {
return String.format(expr + "Lowercase(%s, %s, %s)", src, search, replace);
} else {
return String.format(expr + "ICU(%s, %s, %s, %d)", src, search, replace, collationId);
@@ -419,12 +402,15 @@ public static UTF8String execICU(final UTF8String src, final UTF8String search,
}
public static class StringLocate {
- public static int exec(final UTF8String string, final UTF8String substring, final int start,
+ public static int exec(final UTF8String string, UTF8String substring, final int start,
final int collationId) {
CollationFactory.Collation collation = CollationFactory.fetchCollation(collationId);
- if (collation.supportsBinaryEquality) {
+ if (collation.supportsSpaceTrimming) {
+ substring = CollationFactory.applyTrimmingPolicy(substring, collationId);
+ }
+ if (collation.isUtf8BinaryType) {
return execBinary(string, substring, start);
- } else if (collation.supportsLowercaseEquality) {
+ } else if (collation.isUtf8LcaseType) {
return execLowercase(string, substring, start);
} else {
return execICU(string, substring, start, collationId);
@@ -432,14 +418,11 @@ public static int exec(final UTF8String string, final UTF8String substring, fina
}
public static String genCode(final String string, final String substring, final int start,
final int collationId) {
- CollationFactory.Collation collation = CollationFactory.fetchCollation(collationId);
String expr = "CollationSupport.StringLocate.exec";
- if (collation.supportsBinaryEquality) {
+ if (collationId == CollationFactory.UTF8_BINARY_COLLATION_ID) {
return String.format(expr + "Binary(%s, %s, %d)", string, substring, start);
- } else if (collation.supportsLowercaseEquality) {
- return String.format(expr + "Lowercase(%s, %s, %d)", string, substring, start);
} else {
- return String.format(expr + "ICU(%s, %s, %d, %d)", string, substring, start, collationId);
+ return String.format(expr + "(%s, %s, %d, %d)", string, substring, start, collationId);
}
}
public static int execBinary(final UTF8String string, final UTF8String substring,
@@ -457,27 +440,27 @@ public static int execICU(final UTF8String string, final UTF8String substring, f
}
public static class SubstringIndex {
- public static UTF8String exec(final UTF8String string, final UTF8String delimiter,
+ public static UTF8String exec(final UTF8String string, UTF8String delimiter,
final int count, final int collationId) {
CollationFactory.Collation collation = CollationFactory.fetchCollation(collationId);
- if (collation.supportsBinaryEquality) {
+ if (collation.supportsSpaceTrimming) {
+ delimiter = CollationFactory.applyTrimmingPolicy(delimiter, collationId);
+ }
+ if (collation.isUtf8BinaryType) {
return execBinary(string, delimiter, count);
- } else if (collation.supportsLowercaseEquality) {
+ } else if (collation.isUtf8LcaseType) {
return execLowercase(string, delimiter, count);
} else {
return execICU(string, delimiter, count, collationId);
}
}
public static String genCode(final String string, final String delimiter,
- final int count, final int collationId) {
- CollationFactory.Collation collation = CollationFactory.fetchCollation(collationId);
+ final String count, final int collationId) {
String expr = "CollationSupport.SubstringIndex.exec";
- if (collation.supportsBinaryEquality) {
- return String.format(expr + "Binary(%s, %s, %d)", string, delimiter, count);
- } else if (collation.supportsLowercaseEquality) {
- return String.format(expr + "Lowercase(%s, %s, %d)", string, delimiter, count);
+ if (collationId == CollationFactory.UTF8_BINARY_COLLATION_ID) {
+ return String.format(expr + "Binary(%s, %s, %s)", string, delimiter, count);
} else {
- return String.format(expr + "ICU(%s, %s, %d, %d)", string, delimiter, count, collationId);
+ return String.format(expr + "(%s, %s, %s, %d)", string, delimiter, count, collationId);
}
}
public static UTF8String execBinary(final UTF8String string, final UTF8String delimiter,
@@ -490,8 +473,7 @@ public static UTF8String execLowercase(final UTF8String string, final UTF8String
}
public static UTF8String execICU(final UTF8String string, final UTF8String delimiter,
final int count, final int collationId) {
- return CollationAwareUTF8String.subStringIndex(string, delimiter, count,
- collationId);
+ return CollationAwareUTF8String.subStringIndex(string, delimiter, count, collationId);
}
}
@@ -499,25 +481,15 @@ public static class StringTranslate {
public static UTF8String exec(final UTF8String source, Map dict,
final int collationId) {
CollationFactory.Collation collation = CollationFactory.fetchCollation(collationId);
- if (collation.supportsBinaryEquality) {
+ // Space trimming does not affect the output of this expression.
+ if (collation.isUtf8BinaryType) {
return execBinary(source, dict);
- } else if (collation.supportsLowercaseEquality) {
+ } else if (collation.isUtf8LcaseType) {
return execLowercase(source, dict);
} else {
return execICU(source, dict, collationId);
}
}
- public static String genCode(final String source, final String dict, final int collationId) {
- CollationFactory.Collation collation = CollationFactory.fetchCollation(collationId);
- String expr = "CollationSupport.EndsWith.exec";
- if (collation.supportsBinaryEquality) {
- return String.format(expr + "Binary(%s, %s)", source, dict);
- } else if (collation.supportsLowercaseEquality) {
- return String.format(expr + "Lowercase(%s, %s)", source, dict);
- } else {
- return String.format(expr + "ICU(%s, %s, %d)", source, dict, collationId);
- }
- }
public static UTF8String execBinary(final UTF8String source, Map dict) {
return source.translate(dict);
}
@@ -539,10 +511,15 @@ public static UTF8String exec(
final UTF8String trimString,
final int collationId) {
CollationFactory.Collation collation = CollationFactory.fetchCollation(collationId);
- if (collation.supportsBinaryEquality) {
+ if (collation.isUtf8BinaryType && !collation.supportsSpaceTrimming) {
return execBinary(srcString, trimString);
- } else if (collation.supportsLowercaseEquality) {
- return execLowercase(srcString, trimString);
+ }
+
+ if (collation.isUtf8BinaryType) {
+ // special handling needed for utf8_binary_rtrim collation.
+ return execBinaryTrim(srcString, trimString, collationId);
+ } else if (collation.isUtf8LcaseType) {
+ return execLowercase(srcString, trimString, collationId);
} else {
return execICU(srcString, trimString, collationId);
}
@@ -554,14 +531,11 @@ public static String genCode(
final String srcString,
final String trimString,
final int collationId) {
- CollationFactory.Collation collation = CollationFactory.fetchCollation(collationId);
String expr = "CollationSupport.StringTrim.exec";
- if (collation.supportsBinaryEquality) {
+ if (collationId == CollationFactory.UTF8_BINARY_COLLATION_ID) {
return String.format(expr + "Binary(%s, %s)", srcString, trimString);
- } else if (collation.supportsLowercaseEquality) {
- return String.format(expr + "Lowercase(%s, %s)", srcString, trimString);
} else {
- return String.format(expr + "ICU(%s, %s, %d)", srcString, trimString, collationId);
+ return String.format(expr + "(%s, %s, %d)", srcString, trimString, collationId);
}
}
public static UTF8String execBinary(
@@ -575,8 +549,9 @@ public static UTF8String execBinary(
}
public static UTF8String execLowercase(
final UTF8String srcString,
- final UTF8String trimString) {
- return CollationAwareUTF8String.lowercaseTrim(srcString, trimString);
+ final UTF8String trimString,
+ final int collationId) {
+ return CollationAwareUTF8String.lowercaseTrim(srcString, trimString, collationId);
}
public static UTF8String execICU(
final UTF8String srcString,
@@ -584,6 +559,12 @@ public static UTF8String execICU(
final int collationId) {
return CollationAwareUTF8String.trim(srcString, trimString, collationId);
}
+ public static UTF8String execBinaryTrim(
+ final UTF8String srcString,
+ final UTF8String trimString,
+ final int collationId) {
+ return CollationAwareUTF8String.binaryTrim(srcString, trimString, collationId);
+ }
}
public static class StringTrimLeft {
@@ -591,13 +572,15 @@ public static UTF8String exec(final UTF8String srcString) {
return execBinary(srcString);
}
public static UTF8String exec(
- final UTF8String srcString,
- final UTF8String trimString,
- final int collationId) {
+ final UTF8String srcString,
+ UTF8String trimString,
+ final int collationId) {
CollationFactory.Collation collation = CollationFactory.fetchCollation(collationId);
- if (collation.supportsBinaryEquality) {
+ // Space trimming does not affect the output of this expression, as the only currently
+ // supported space trimming is RTRIM.
+ if (collation.isUtf8BinaryType) {
return execBinary(srcString, trimString);
- } else if (collation.supportsLowercaseEquality) {
+ } else if (collation.isUtf8LcaseType) {
return execLowercase(srcString, trimString);
} else {
return execICU(srcString, trimString, collationId);
@@ -610,14 +593,11 @@ public static String genCode(
final String srcString,
final String trimString,
final int collationId) {
- CollationFactory.Collation collation = CollationFactory.fetchCollation(collationId);
String expr = "CollationSupport.StringTrimLeft.exec";
- if (collation.supportsBinaryEquality) {
+ if (collationId == CollationFactory.UTF8_BINARY_COLLATION_ID) {
return String.format(expr + "Binary(%s, %s)", srcString, trimString);
- } else if (collation.supportsLowercaseEquality) {
- return String.format(expr + "Lowercase(%s, %s)", srcString, trimString);
} else {
- return String.format(expr + "ICU(%s, %s, %d)", srcString, trimString, collationId);
+ return String.format(expr + "(%s, %s, %d)", srcString, trimString, collationId);
}
}
public static UTF8String execBinary(final UTF8String srcString) {
@@ -650,10 +630,15 @@ public static UTF8String exec(
final UTF8String trimString,
final int collationId) {
CollationFactory.Collation collation = CollationFactory.fetchCollation(collationId);
- if (collation.supportsBinaryEquality) {
+ if (collation.isUtf8BinaryType && !collation.supportsSpaceTrimming) {
return execBinary(srcString, trimString);
- } else if (collation.supportsLowercaseEquality) {
- return execLowercase(srcString, trimString);
+ }
+
+ if (collation.isUtf8BinaryType) {
+ // special handling needed for utf8_binary_rtrim collation.
+ return execBinaryTrim(srcString, trimString, collationId);
+ } else if (collation.isUtf8LcaseType) {
+ return execLowercase(srcString, trimString, collationId);
} else {
return execICU(srcString, trimString, collationId);
}
@@ -665,14 +650,11 @@ public static String genCode(
final String srcString,
final String trimString,
final int collationId) {
- CollationFactory.Collation collation = CollationFactory.fetchCollation(collationId);
String expr = "CollationSupport.StringTrimRight.exec";
- if (collation.supportsBinaryEquality) {
+ if (collationId == CollationFactory.UTF8_BINARY_COLLATION_ID) {
return String.format(expr + "Binary(%s, %s)", srcString, trimString);
- } else if (collation.supportsLowercaseEquality) {
- return String.format(expr + "Lowercase(%s, %s)", srcString, trimString);
} else {
- return String.format(expr + "ICU(%s, %s, %d)", srcString, trimString, collationId);
+ return String.format(expr + "(%s, %s, %d)", srcString, trimString, collationId);
}
}
public static UTF8String execBinary(final UTF8String srcString) {
@@ -685,8 +667,9 @@ public static UTF8String execBinary(
}
public static UTF8String execLowercase(
final UTF8String srcString,
- final UTF8String trimString) {
- return CollationAwareUTF8String.lowercaseTrimRight(srcString, trimString);
+ final UTF8String trimString,
+ final int collationId) {
+ return CollationAwareUTF8String.lowercaseTrimRight(srcString, trimString, collationId);
}
public static UTF8String execICU(
final UTF8String srcString,
@@ -694,6 +677,12 @@ public static UTF8String execICU(
final int collationId) {
return CollationAwareUTF8String.trimRight(srcString, trimString, collationId);
}
+ public static UTF8String execBinaryTrim(
+ final UTF8String srcString,
+ final UTF8String trimString,
+ final int collationId) {
+ return CollationAwareUTF8String.binaryTrimRight(srcString, trimString, collationId);
+ }
}
// TODO: Add more collation-aware string expressions.
@@ -705,10 +694,10 @@ public static UTF8String execICU(
public static boolean supportsLowercaseRegex(final int collationId) {
// for regex, only Unicode case-insensitive matching is possible,
// so UTF8_LCASE is treated as UNICODE_CI in this context
- return CollationFactory.fetchCollation(collationId).supportsLowercaseEquality;
+ return CollationFactory.fetchCollation(collationId).isUtf8LcaseType;
}
- private static final int lowercaseRegexFlags = Pattern.UNICODE_CASE | Pattern.CASE_INSENSITIVE;
+ static final int lowercaseRegexFlags = Pattern.UNICODE_CASE | Pattern.CASE_INSENSITIVE;
public static int collationAwareRegexFlags(final int collationId) {
return supportsLowercaseRegex(collationId) ? lowercaseRegexFlags : 0;
}
diff --git a/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/SpecialCodePointConstants.java b/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/SpecialCodePointConstants.java
new file mode 100644
index 0000000000000..db615d747910b
--- /dev/null
+++ b/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/SpecialCodePointConstants.java
@@ -0,0 +1,33 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.sql.catalyst.util;
+
+/**
+ * 'SpecialCodePointConstants' is introduced in order to keep the codepoints used in
+ * 'CollationAwareUTF8String' in one place.
+ */
+public class SpecialCodePointConstants {
+
+ public static final int COMBINING_DOT = 0x0307;
+ public static final int ASCII_SMALL_I = 0x0069;
+ public static final int ASCII_SPACE = 0x0020;
+ public static final int GREEK_CAPITAL_SIGMA = 0x03A3;
+ public static final int GREEK_SMALL_SIGMA = 0x03C3;
+ public static final int GREEK_FINAL_SIGMA = 0x03C2;
+ public static final int CAPITAL_I_WITH_DOT_ABOVE = 0x0130;
+}
diff --git a/common/unsafe/src/main/java/org/apache/spark/unsafe/UTF8StringBuilder.java b/common/unsafe/src/main/java/org/apache/spark/unsafe/UTF8StringBuilder.java
index 481ea89090b2a..3a697345ce1f5 100644
--- a/common/unsafe/src/main/java/org/apache/spark/unsafe/UTF8StringBuilder.java
+++ b/common/unsafe/src/main/java/org/apache/spark/unsafe/UTF8StringBuilder.java
@@ -96,4 +96,33 @@ public void appendBytes(Object base, long offset, int length) {
public UTF8String build() {
return UTF8String.fromBytes(buffer, 0, totalSize());
}
+
+ public void appendCodePoint(int codePoint) {
+ if (codePoint <= 0x7F) {
+ grow(1);
+ buffer[cursor - Platform.BYTE_ARRAY_OFFSET] = (byte) codePoint;
+ ++cursor;
+ } else if (codePoint <= 0x7FF) {
+ grow(2);
+ buffer[cursor - Platform.BYTE_ARRAY_OFFSET] = (byte) (0xC0 | (codePoint >> 6));
+ buffer[cursor + 1 - Platform.BYTE_ARRAY_OFFSET] = (byte) (0x80 | (codePoint & 0x3F));
+ cursor += 2;
+ } else if (codePoint <= 0xFFFF) {
+ grow(3);
+ buffer[cursor - Platform.BYTE_ARRAY_OFFSET] = (byte) (0xE0 | (codePoint >> 12));
+ buffer[cursor + 1 - Platform.BYTE_ARRAY_OFFSET] = (byte) (0x80 | ((codePoint >> 6) & 0x3F));
+ buffer[cursor + 2 - Platform.BYTE_ARRAY_OFFSET] = (byte) (0x80 | (codePoint & 0x3F));
+ cursor += 3;
+ } else if (codePoint <= 0x10FFFF) {
+ grow(4);
+ buffer[cursor - Platform.BYTE_ARRAY_OFFSET] = (byte) (0xF0 | (codePoint >> 18));
+ buffer[cursor + 1 - Platform.BYTE_ARRAY_OFFSET] = (byte) (0x80 | ((codePoint >> 12) & 0x3F));
+ buffer[cursor + 2 - Platform.BYTE_ARRAY_OFFSET] = (byte) (0x80 | ((codePoint >> 6) & 0x3F));
+ buffer[cursor + 3 - Platform.BYTE_ARRAY_OFFSET] = (byte) (0x80 | (codePoint & 0x3F));
+ cursor += 4;
+ } else {
+ throw new IllegalArgumentException("Invalid Unicode codePoint: " + codePoint);
+ }
+ }
+
}
diff --git a/common/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java b/common/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java
index e6bddb12da56b..caf8461b0b5d6 100644
--- a/common/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java
+++ b/common/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java
@@ -83,6 +83,20 @@ private enum UTF8StringValidity {
*/
private volatile int numBytesValid = -1;
+ /**
+ * The ASCII-ness of a UTF8String can be cached to avoid repeated checks, because that
+ * operation requires a full string scan. Full ASCII strings contain only ASCII characters.
+ */
+ private enum IsFullAscii {
+ UNKNOWN, FULL_ASCII, NOT_ASCII
+ }
+
+ /**
+ * Internal flag to indicate whether the string is full ASCII or not. Initially, the ASCII-ness
+ * is UNKNOWN, and will be set to either FULL_ASCII or NOT_ASCII after the first check.
+ */
+ private volatile IsFullAscii isFullAscii = IsFullAscii.UNKNOWN;
+
public Object getBaseObject() { return base; }
public long getBaseOffset() { return offset; }
@@ -127,6 +141,7 @@ private enum UTF8StringValidity {
private static final UTF8String COMMA_UTF8 = UTF8String.fromString(",");
public static final UTF8String EMPTY_UTF8 = UTF8String.fromString("");
public static final UTF8String ZERO_UTF8 = UTF8String.fromString("0");
+ public static final UTF8String SPACE_UTF8 = UTF8String.fromString(" ");
/**
@@ -788,12 +803,19 @@ public UTF8String toLowerCase() {
}
public boolean isFullAscii() {
+ if (isFullAscii == IsFullAscii.UNKNOWN) {
+ isFullAscii = getIsFullAscii();
+ }
+ return isFullAscii == IsFullAscii.FULL_ASCII;
+ }
+
+ private IsFullAscii getIsFullAscii() {
for (var i = 0; i < numBytes; i++) {
if (getByte(i) < 0) {
- return false;
+ return IsFullAscii.NOT_ASCII;
}
}
- return true;
+ return IsFullAscii.FULL_ASCII;
}
private UTF8String toLowerCaseSlow() {
diff --git a/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java b/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java
index b082ab21944f7..a696da8cf45b8 100644
--- a/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java
+++ b/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java
@@ -40,165 +40,224 @@ public class CollationSupportSuite {
{"UTF8_BINARY", "UTF8_LCASE", "UNICODE", "UNICODE_CI"};
/**
- * Collation-aware UTF8String comparison.
+ * Collation-aware UTF8String comparison and equality check.
*/
- private void assertStringCompare(String s1, String s2, String collationName, int expected)
+ private void assertCompare(String s1, String s2, String collationName, int expected)
throws SparkException {
UTF8String l = UTF8String.fromString(s1);
UTF8String r = UTF8String.fromString(s2);
+ // Test the comparator, which is the most general way to compare strings with collations.
int compare = CollationFactory.fetchCollation(collationName).comparator.compare(l, r);
assertEquals(Integer.signum(expected), Integer.signum(compare));
+ // Test the equals function, which may be faster than the comparator for equality checks.
+ boolean equals = CollationFactory.fetchCollation(collationName).equalsFunction.apply(l ,r);
+ assertEquals(expected == 0, equals);
}
@Test
public void testCompare() throws SparkException {
for (String collationName: testSupportedCollations) {
- // Edge cases
- assertStringCompare("", "", collationName, 0);
- assertStringCompare("a", "", collationName, 1);
- assertStringCompare("", "a", collationName, -1);
- // Basic tests
- assertStringCompare("a", "a", collationName, 0);
- assertStringCompare("a", "b", collationName, -1);
- assertStringCompare("b", "a", collationName, 1);
- assertStringCompare("A", "A", collationName, 0);
- assertStringCompare("A", "B", collationName, -1);
- assertStringCompare("B", "A", collationName, 1);
- assertStringCompare("aa", "a", collationName, 1);
- assertStringCompare("b", "bb", collationName, -1);
- assertStringCompare("abc", "a", collationName, 1);
- assertStringCompare("abc", "b", collationName, -1);
- assertStringCompare("abc", "ab", collationName, 1);
- assertStringCompare("abc", "abc", collationName, 0);
- // ASCII strings
- assertStringCompare("aaaa", "aaa", collationName, 1);
- assertStringCompare("hello", "world", collationName, -1);
- assertStringCompare("Spark", "Spark", collationName, 0);
- // Non-ASCII strings
- assertStringCompare("ü", "ü", collationName, 0);
- assertStringCompare("ü", "", collationName, 1);
- assertStringCompare("", "ü", collationName, -1);
- assertStringCompare("äü", "äü", collationName, 0);
- assertStringCompare("äxx", "äx", collationName, 1);
- assertStringCompare("a", "ä", collationName, -1);
+ // Empty strings.
+ assertCompare("", "", collationName, 0);
+ assertCompare("a", "", collationName, 1);
+ assertCompare("", "a", collationName, -1);
+ // Basic tests.
+ assertCompare("a", "a", collationName, 0);
+ assertCompare("a", "b", collationName, -1);
+ assertCompare("b", "a", collationName, 1);
+ assertCompare("A", "A", collationName, 0);
+ assertCompare("A", "B", collationName, -1);
+ assertCompare("B", "A", collationName, 1);
+ assertCompare("aa", "a", collationName, 1);
+ assertCompare("b", "bb", collationName, -1);
+ assertCompare("abc", "a", collationName, 1);
+ assertCompare("abc", "b", collationName, -1);
+ assertCompare("abc", "ab", collationName, 1);
+ assertCompare("abc", "abc", collationName, 0);
+ assertCompare("aaaa", "aaa", collationName, 1);
+ assertCompare("hello", "world", collationName, -1);
+ assertCompare("Spark", "Spark", collationName, 0);
+ assertCompare("ü", "ü", collationName, 0);
+ assertCompare("ü", "", collationName, 1);
+ assertCompare("", "ü", collationName, -1);
+ assertCompare("äü", "äü", collationName, 0);
+ assertCompare("äxx", "äx", collationName, 1);
+ assertCompare("a", "ä", collationName, -1);
}
- // Non-ASCII strings
- assertStringCompare("äü", "bü", "UTF8_BINARY", 1);
- assertStringCompare("bxx", "bü", "UTF8_BINARY", -1);
- assertStringCompare("äü", "bü", "UTF8_LCASE", 1);
- assertStringCompare("bxx", "bü", "UTF8_LCASE", -1);
- assertStringCompare("äü", "bü", "UNICODE", -1);
- assertStringCompare("bxx", "bü", "UNICODE", 1);
- assertStringCompare("äü", "bü", "UNICODE_CI", -1);
- assertStringCompare("bxx", "bü", "UNICODE_CI", 1);
- // Case variation
- assertStringCompare("AbCd", "aBcD", "UTF8_BINARY", -1);
- assertStringCompare("ABCD", "abcd", "UTF8_LCASE", 0);
- assertStringCompare("AbcD", "aBCd", "UNICODE", 1);
- assertStringCompare("abcd", "ABCD", "UNICODE_CI", 0);
- // Accent variation
- assertStringCompare("aBćD", "ABĆD", "UTF8_BINARY", 1);
- assertStringCompare("AbCδ", "ABCΔ", "UTF8_LCASE", 0);
- assertStringCompare("äBCd", "ÄBCD", "UNICODE", -1);
- assertStringCompare("Ab́cD", "AB́CD", "UNICODE_CI", 0);
- // Case-variable character length
- assertStringCompare("i\u0307", "İ", "UTF8_BINARY", -1);
- assertStringCompare("İ", "i\u0307", "UTF8_BINARY", 1);
- assertStringCompare("i\u0307", "İ", "UTF8_LCASE", 0);
- assertStringCompare("İ", "i\u0307", "UTF8_LCASE", 0);
- assertStringCompare("i\u0307", "İ", "UNICODE", -1);
- assertStringCompare("İ", "i\u0307", "UNICODE", 1);
- assertStringCompare("i\u0307", "İ", "UNICODE_CI", 0);
- assertStringCompare("İ", "i\u0307", "UNICODE_CI", 0);
- assertStringCompare("i\u0307İ", "i\u0307İ", "UTF8_LCASE", 0);
- assertStringCompare("i\u0307İ", "İi\u0307", "UTF8_LCASE", 0);
- assertStringCompare("İi\u0307", "i\u0307İ", "UTF8_LCASE", 0);
- assertStringCompare("İi\u0307", "İi\u0307", "UTF8_LCASE", 0);
- assertStringCompare("i\u0307İ", "i\u0307İ", "UNICODE_CI", 0);
- assertStringCompare("i\u0307İ", "İi\u0307", "UNICODE_CI", 0);
- assertStringCompare("İi\u0307", "i\u0307İ", "UNICODE_CI", 0);
- assertStringCompare("İi\u0307", "İi\u0307", "UNICODE_CI", 0);
- // Conditional case mapping
- assertStringCompare("ς", "σ", "UTF8_BINARY", -1);
- assertStringCompare("ς", "Σ", "UTF8_BINARY", 1);
- assertStringCompare("σ", "Σ", "UTF8_BINARY", 1);
- assertStringCompare("ς", "σ", "UTF8_LCASE", 0);
- assertStringCompare("ς", "Σ", "UTF8_LCASE", 0);
- assertStringCompare("σ", "Σ", "UTF8_LCASE", 0);
- assertStringCompare("ς", "σ", "UNICODE", 1);
- assertStringCompare("ς", "Σ", "UNICODE", 1);
- assertStringCompare("σ", "Σ", "UNICODE", -1);
- assertStringCompare("ς", "σ", "UNICODE_CI", 0);
- assertStringCompare("ς", "Σ", "UNICODE_CI", 0);
- assertStringCompare("σ", "Σ", "UNICODE_CI", 0);
+ // Advanced tests.
+ assertCompare("äü", "bü", "UTF8_BINARY", 1);
+ assertCompare("bxx", "bü", "UTF8_BINARY", -1);
+ assertCompare("äü", "bü", "UTF8_LCASE", 1);
+ assertCompare("bxx", "bü", "UTF8_LCASE", -1);
+ assertCompare("äü", "bü", "UNICODE", -1);
+ assertCompare("bxx", "bü", "UNICODE", 1);
+ assertCompare("äü", "bü", "UNICODE_CI", -1);
+ assertCompare("bxx", "bü", "UNICODE_CI", 1);
+ assertCompare("cČć", "ČćC", "SR_CI_AI", 0);
+ // Case variation.
+ assertCompare("AbCd", "aBcD", "UTF8_BINARY", -1);
+ assertCompare("ABCD", "abcd", "UTF8_LCASE", 0);
+ assertCompare("AbcD", "aBCd", "UNICODE", 1);
+ assertCompare("abcd", "ABCD", "UNICODE_CI", 0);
+ // Accent variation.
+ assertCompare("aBćD", "ABĆD", "UTF8_BINARY", 1);
+ assertCompare("AbCδ", "ABCΔ", "UTF8_LCASE", 0);
+ assertCompare("äBCd", "ÄBCD", "UNICODE", -1);
+ assertCompare("Ab́cD", "AB́CD", "UNICODE_CI", 0);
+ assertCompare("ÈÉÊË", "EeEe", "AF_CI_AI", 0);
+ // One-to-many case mapping (e.g. Turkish dotted I).
+ assertCompare("i\u0307", "İ", "UTF8_BINARY", -1);
+ assertCompare("İ", "i\u0307", "UTF8_BINARY", 1);
+ assertCompare("i\u0307", "İ", "UTF8_LCASE", 0);
+ assertCompare("İ", "i\u0307", "UTF8_LCASE", 0);
+ assertCompare("i\u0307", "İ", "UNICODE", -1);
+ assertCompare("İ", "i\u0307", "UNICODE", 1);
+ assertCompare("i\u0307", "İ", "UNICODE_CI", 0);
+ assertCompare("İ", "i\u0307", "UNICODE_CI", 0);
+ assertCompare("i\u0307İ", "i\u0307İ", "UTF8_LCASE", 0);
+ assertCompare("i\u0307İ", "İi\u0307", "UTF8_LCASE", 0);
+ assertCompare("İi\u0307", "i\u0307İ", "UTF8_LCASE", 0);
+ assertCompare("İi\u0307", "İi\u0307", "UTF8_LCASE", 0);
+ assertCompare("i\u0307İ", "i\u0307İ", "UNICODE_CI", 0);
+ assertCompare("i\u0307İ", "İi\u0307", "UNICODE_CI", 0);
+ assertCompare("İi\u0307", "i\u0307İ", "UNICODE_CI", 0);
+ assertCompare("İi\u0307", "İi\u0307", "UNICODE_CI", 0);
+ // Conditional case mapping (e.g. Greek sigmas).
+ assertCompare("ς", "σ", "UTF8_BINARY", -1);
+ assertCompare("ς", "Σ", "UTF8_BINARY", 1);
+ assertCompare("σ", "Σ", "UTF8_BINARY", 1);
+ assertCompare("ς", "σ", "UTF8_LCASE", 0);
+ assertCompare("ς", "Σ", "UTF8_LCASE", 0);
+ assertCompare("σ", "Σ", "UTF8_LCASE", 0);
+ assertCompare("ς", "σ", "UNICODE", 1);
+ assertCompare("ς", "Σ", "UNICODE", 1);
+ assertCompare("σ", "Σ", "UNICODE", -1);
+ assertCompare("ς", "σ", "UNICODE_CI", 0);
+ assertCompare("ς", "Σ", "UNICODE_CI", 0);
+ assertCompare("σ", "Σ", "UNICODE_CI", 0);
+ // Surrogate pairs.
+ assertCompare("a🙃b🙃c", "aaaaa", "UTF8_BINARY", 1);
+ assertCompare("a🙃b🙃c", "aaaaa", "UTF8_LCASE", 1);
+ assertCompare("a🙃b🙃c", "aaaaa", "UNICODE", -1); // != UTF8_BINARY
+ assertCompare("a🙃b🙃c", "aaaaa", "UNICODE_CI", -1); // != UTF8_LCASE
+ assertCompare("a🙃b🙃c", "a🙃b🙃c", "UTF8_BINARY", 0);
+ assertCompare("a🙃b🙃c", "a🙃b🙃c", "UTF8_LCASE", 0);
+ assertCompare("a🙃b🙃c", "a🙃b🙃c", "UNICODE", 0);
+ assertCompare("a🙃b🙃c", "a🙃b🙃c", "UNICODE_CI", 0);
+ assertCompare("a🙃b🙃c", "a🙃b🙃d", "UTF8_BINARY", -1);
+ assertCompare("a🙃b🙃c", "a🙃b🙃d", "UTF8_LCASE", -1);
+ assertCompare("a🙃b🙃c", "a🙃b🙃d", "UNICODE", -1);
+ assertCompare("a🙃b🙃c", "a🙃b🙃d", "UNICODE_CI", -1);
// Maximum code point.
int maxCodePoint = Character.MAX_CODE_POINT;
String maxCodePointStr = new String(Character.toChars(maxCodePoint));
for (int i = 0; i < maxCodePoint && Character.isValidCodePoint(i); ++i) {
- assertStringCompare(new String(Character.toChars(i)), maxCodePointStr, "UTF8_BINARY", -1);
- assertStringCompare(new String(Character.toChars(i)), maxCodePointStr, "UTF8_LCASE", -1);
+ assertCompare(new String(Character.toChars(i)), maxCodePointStr, "UTF8_BINARY", -1);
+ assertCompare(new String(Character.toChars(i)), maxCodePointStr, "UTF8_LCASE", -1);
}
// Minimum code point.
int minCodePoint = Character.MIN_CODE_POINT;
String minCodePointStr = new String(Character.toChars(minCodePoint));
for (int i = minCodePoint + 1; i <= maxCodePoint && Character.isValidCodePoint(i); ++i) {
- assertStringCompare(new String(Character.toChars(i)), minCodePointStr, "UTF8_BINARY", 1);
- assertStringCompare(new String(Character.toChars(i)), minCodePointStr, "UTF8_LCASE", 1);
+ assertCompare(new String(Character.toChars(i)), minCodePointStr, "UTF8_BINARY", 1);
+ assertCompare(new String(Character.toChars(i)), minCodePointStr, "UTF8_LCASE", 1);
}
}
- private void assertLowerCaseCodePoints(UTF8String target, UTF8String expected,
- Boolean useCodePoints) {
+ /**
+ * Collation-aware UTF8String lowercase conversion. Asserts that lowercasing {@code string} yields {@code expected}.
+ */
+
+ private void assertLowerCaseCodePoints(String string, String expected, Boolean useCodePoints) {
+ UTF8String str = UTF8String.fromString(string); // convert plain Java String to UTF8String under test
 if (useCodePoints) {
- assertEquals(expected, CollationAwareUTF8String.lowerCaseCodePoints(str);
+ UTF8String result = CollationAwareUTF8String.lowerCaseCodePoints(str); // code-point-level lowering (differs from toLowerCase for e.g. Greek final sigma)
+ assertEquals(UTF8String.fromString(expected), result);
 } else {
- assertEquals(expected, target.toLowerCase());
+ UTF8String result = str.toLowerCase(); // standard UTF8String lowercasing
+ assertEquals(UTF8String.fromString(expected), result);
 }
 }
@Test
public void testLowerCaseCodePoints() {
- // Edge cases
- assertLowerCaseCodePoints(UTF8String.fromString(""), UTF8String.fromString(""), false);
- assertLowerCaseCodePoints(UTF8String.fromString(""), UTF8String.fromString(""), true);
- // Basic tests
- assertLowerCaseCodePoints(UTF8String.fromString("abcd"), UTF8String.fromString("abcd"), false);
- assertLowerCaseCodePoints(UTF8String.fromString("AbCd"), UTF8String.fromString("abcd"), false);
- assertLowerCaseCodePoints(UTF8String.fromString("abcd"), UTF8String.fromString("abcd"), true);
- assertLowerCaseCodePoints(UTF8String.fromString("aBcD"), UTF8String.fromString("abcd"), true);
- // Accent variation
- assertLowerCaseCodePoints(UTF8String.fromString("AbĆd"), UTF8String.fromString("abćd"), false);
- assertLowerCaseCodePoints(UTF8String.fromString("aBcΔ"), UTF8String.fromString("abcδ"), true);
- // Case-variable character length
- assertLowerCaseCodePoints(
- UTF8String.fromString("İoDiNe"), UTF8String.fromString("i̇odine"), false);
- assertLowerCaseCodePoints(
- UTF8String.fromString("Abi̇o12"), UTF8String.fromString("abi̇o12"), false);
- assertLowerCaseCodePoints(
- UTF8String.fromString("İodInE"), UTF8String.fromString("i̇odine"), true);
- assertLowerCaseCodePoints(
- UTF8String.fromString("aBi̇o12"), UTF8String.fromString("abi̇o12"), true);
- // Conditional case mapping
- assertLowerCaseCodePoints(
- UTF8String.fromString("ΘΑΛΑΣΣΙΝΟΣ"), UTF8String.fromString("θαλασσινος"), false);
- assertLowerCaseCodePoints(
- UTF8String.fromString("ΘΑΛΑΣΣΙΝΟΣ"), UTF8String.fromString("θαλασσινοσ"), true);
- // Surrogate pairs are treated as invalid UTF8 sequences
- assertLowerCaseCodePoints(UTF8String.fromBytes(new byte[]
- {(byte) 0xED, (byte) 0xA0, (byte) 0x80, (byte) 0xED, (byte) 0xB0, (byte) 0x80}),
- UTF8String.fromString("\uFFFD\uFFFD"), false);
- assertLowerCaseCodePoints(UTF8String.fromBytes(new byte[]
- {(byte) 0xED, (byte) 0xA0, (byte) 0x80, (byte) 0xED, (byte) 0xB0, (byte) 0x80}),
- UTF8String.fromString("\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD"), true); // != Java toLowerCase
+ // Empty strings.
+ assertLowerCaseCodePoints("", "", false);
+ assertLowerCaseCodePoints("", "", true);
+ // Basic tests.
+ assertLowerCaseCodePoints("xyz", "xyz", false);
+ assertLowerCaseCodePoints("xyz", "xyz", true);
+ assertLowerCaseCodePoints("abcd", "abcd", false);
+ assertLowerCaseCodePoints("abcd", "abcd", true);
+ // Advanced tests.
+ assertLowerCaseCodePoints("你好", "你好", false);
+ assertLowerCaseCodePoints("你好", "你好", true);
+ assertLowerCaseCodePoints("Γειά", "γειά", false);
+ assertLowerCaseCodePoints("Γειά", "γειά", true);
+ assertLowerCaseCodePoints("Здраво", "здраво", false);
+ assertLowerCaseCodePoints("Здраво", "здраво", true);
+ // Case variation.
+ assertLowerCaseCodePoints("xYz", "xyz", false);
+ assertLowerCaseCodePoints("xYz", "xyz", true);
+ assertLowerCaseCodePoints("AbCd", "abcd", false);
+ assertLowerCaseCodePoints("aBcD", "abcd", true);
+ // Accent variation.
+ assertLowerCaseCodePoints("äbć", "äbć", false);
+ assertLowerCaseCodePoints("äbć", "äbć", true);
+ assertLowerCaseCodePoints("AbĆd", "abćd", false);
+ assertLowerCaseCodePoints("aBcΔ", "abcδ", true);
+ // One-to-many case mapping (e.g. Turkish dotted I).
+ assertLowerCaseCodePoints("i\u0307", "i\u0307", false);
+ assertLowerCaseCodePoints("i\u0307", "i\u0307", true);
+ assertLowerCaseCodePoints("I\u0307", "i\u0307", false);
+ assertLowerCaseCodePoints("I\u0307", "i\u0307", true);
+ assertLowerCaseCodePoints("İ", "i\u0307", false); // U+0130 lowercases to "i" + combining dot above (one code point -> two)
+ assertLowerCaseCodePoints("İ", "i\u0307", true);
+ assertLowerCaseCodePoints("İİİ", "i\u0307i\u0307i\u0307", false);
+ assertLowerCaseCodePoints("İİİ", "i\u0307i\u0307i\u0307", true);
+ assertLowerCaseCodePoints("İiIi\u0307", "i\u0307iii\u0307", false);
+ assertLowerCaseCodePoints("İiIi\u0307", "i\u0307iii\u0307", true);
+ assertLowerCaseCodePoints("İoDiNe", "i\u0307odine", false);
+ assertLowerCaseCodePoints("İodInE", "i\u0307odine", true);
+ assertLowerCaseCodePoints("Abi\u0307o12", "abi\u0307o12", false);
+ assertLowerCaseCodePoints("aBi\u0307o12", "abi\u0307o12", true);
+ // Conditional case mapping (e.g. Greek sigmas).
+ assertLowerCaseCodePoints("ς", "ς", false); // toLowerCase keeps final sigma as-is
+ assertLowerCaseCodePoints("ς", "σ", true); // code-point mode maps final sigma to σ (!= toLowerCase)
+ assertLowerCaseCodePoints("σ", "σ", false);
+ assertLowerCaseCodePoints("σ", "σ", true);
+ assertLowerCaseCodePoints("Σ", "σ", false);
+ assertLowerCaseCodePoints("Σ", "σ", true);
+ assertLowerCaseCodePoints("ςΑΛΑΤΑ", "ςαλατα", false);
+ assertLowerCaseCodePoints("ςΑΛΑΤΑ", "σαλατα", true);
+ assertLowerCaseCodePoints("σΑΛΑΤΑ", "σαλατα", false);
+ assertLowerCaseCodePoints("σΑΛΑΤΑ", "σαλατα", true);
+ assertLowerCaseCodePoints("ΣΑΛΑΤΑ", "σαλατα", false);
+ assertLowerCaseCodePoints("ΣΑΛΑΤΑ", "σαλατα", true);
+ assertLowerCaseCodePoints("ΘΑΛΑΣΣΙΝΟς", "θαλασσινος", false);
+ assertLowerCaseCodePoints("ΘΑΛΑΣΣΙΝΟς", "θαλασσινοσ", true);
+ assertLowerCaseCodePoints("ΘΑΛΑΣΣΙΝΟσ", "θαλασσινοσ", false);
+ assertLowerCaseCodePoints("ΘΑΛΑΣΣΙΝΟσ", "θαλασσινοσ", true);
+ assertLowerCaseCodePoints("ΘΑΛΑΣΣΙΝΟΣ", "θαλασσινος", false); // word-final capital sigma lowers to ς in toLowerCase
+ assertLowerCaseCodePoints("ΘΑΛΑΣΣΙΝΟΣ", "θαλασσινοσ", true); // but to σ in code-point mode (context-free)
+ // Surrogate pairs.
+ assertLowerCaseCodePoints("a🙃b🙃c", "a🙃b🙃c", false);
+ assertLowerCaseCodePoints("a🙃b🙃c", "a🙃b🙃c", true);
+ assertLowerCaseCodePoints("😀😆😃😄😄😆", "😀😆😃😄😄😆", false);
+ assertLowerCaseCodePoints("😀😆😃😄😄😆", "😀😆😃😄😄😆", true);
+ assertLowerCaseCodePoints("𐐅", "𐐭", false); // supplementary-plane letter (Deseret) with a real lowercase mapping
+ assertLowerCaseCodePoints("𐐅", "𐐭", true);
+ assertLowerCaseCodePoints("𝔸", "𝔸", false); // U+1D538 maps to itself (no lowercase form)
+ assertLowerCaseCodePoints("𝔸", "𝔸", true);
}
/**
- * Collation-aware string expressions.
+ * Verify the behaviour of the `Contains` collation support class.
*/
- private void assertContains(String pattern, String target, String collationName, boolean expected)
- throws SparkException {
+ private void assertContains(String pattern, String target, String collationName,
+ boolean expected) throws SparkException {
UTF8String l = UTF8String.fromString(pattern);
UTF8String r = UTF8String.fromString(target);
int collationId = CollationFactory.collationNameToId(collationName);
@@ -207,20 +266,42 @@ private void assertContains(String pattern, String target, String collationName,
@Test
public void testContains() throws SparkException {
- // Edge cases
- assertContains("", "", "UTF8_BINARY", true);
- assertContains("c", "", "UTF8_BINARY", true);
- assertContains("", "c", "UTF8_BINARY", false);
- assertContains("", "", "UNICODE", true);
- assertContains("c", "", "UNICODE", true);
- assertContains("", "c", "UNICODE", false);
- assertContains("", "", "UTF8_LCASE", true);
- assertContains("c", "", "UTF8_LCASE", true);
- assertContains("", "c", "UTF8_LCASE", false);
- assertContains("", "", "UNICODE_CI", true);
- assertContains("c", "", "UNICODE_CI", true);
- assertContains("", "c", "UNICODE_CI", false);
- // Basic tests
+ for (String collationName: testSupportedCollations) {
+ // Empty strings.
+ assertContains("", "", collationName, true);
+ assertContains("a", "", collationName, true);
+ assertContains("", "x", collationName, false);
+ // Basic tests.
+ assertContains("a", "a", collationName, true);
+ assertContains("_a_", "_a_", collationName, true);
+ assertContains("_a_", "a", collationName, true);
+ assertContains("%a%", "%a%", collationName, true);
+ assertContains("%a%", "a", collationName, true);
+ assertContains("*a*", "*a*", collationName, true);
+ assertContains("*a*", "a", collationName, true);
+ assertContains("?a?", "?a?", collationName, true);
+ assertContains("?a?", "a", collationName, true);
+ assertContains("/a/", "/a/", collationName, true);
+ assertContains("/a/", "a", collationName, true);
+ assertContains("abcde", "xyz", collationName, false);
+ assertContains("abcde", "bcd", collationName, true);
+ assertContains("abcde", "abc", collationName, true);
+ assertContains("abcde", "cde", collationName, true);
+ assertContains("abcde", "abcde", collationName, true);
+ assertContains("你好", "x", collationName, false);
+ assertContains("你好", "你", collationName, true);
+ assertContains("你好", "好", collationName, true);
+ assertContains("你好", "你好", collationName, true);
+ assertContains("Γειά", "x", collationName, false);
+ assertContains("Γειά", "ειά", collationName, true);
+ assertContains("Γειά", "Γει", collationName, true);
+ assertContains("Γειά", "Γειά", collationName, true);
+ assertContains("Здраво", "x", collationName, false);
+ assertContains("Здраво", "драво", collationName, true);
+ assertContains("Здраво", "Здрав", collationName, true);
+ assertContains("Здраво", "Здраво", collationName, true);
+ }
+ // Advanced tests.
assertContains("abcde", "bcd", "UTF8_BINARY", true);
assertContains("abcde", "bde", "UTF8_BINARY", false);
assertContains("abcde", "fgh", "UTF8_BINARY", false);
@@ -233,25 +314,6 @@ public void testContains() throws SparkException {
assertContains("abcde", "c", "UNICODE_CI", true);
assertContains("abcde", "bCD", "UNICODE_CI", true);
assertContains("abcde", "123", "UNICODE_CI", false);
- // Case variation
- assertContains("aBcDe", "bcd", "UTF8_BINARY", false);
- assertContains("aBcDe", "BcD", "UTF8_BINARY", true);
- assertContains("aBcDe", "abcde", "UNICODE", false);
- assertContains("aBcDe", "aBcDe", "UNICODE", true);
- assertContains("aBcDe", "bcd", "UTF8_LCASE", true);
- assertContains("aBcDe", "BCD", "UTF8_LCASE", true);
- assertContains("aBcDe", "abcde", "UNICODE_CI", true);
- assertContains("aBcDe", "AbCdE", "UNICODE_CI", true);
- // Accent variation
- assertContains("aBcDe", "bćd", "UTF8_BINARY", false);
- assertContains("aBcDe", "BćD", "UTF8_BINARY", false);
- assertContains("aBcDe", "abćde", "UNICODE", false);
- assertContains("aBcDe", "aBćDe", "UNICODE", false);
- assertContains("aBcDe", "bćd", "UTF8_LCASE", false);
- assertContains("aBcDe", "BĆD", "UTF8_LCASE", false);
- assertContains("aBcDe", "abćde", "UNICODE_CI", false);
- assertContains("aBcDe", "AbĆdE", "UNICODE_CI", false);
- // Variable byte length characters
assertContains("ab世De", "b世D", "UTF8_BINARY", true);
assertContains("ab世De", "B世d", "UTF8_BINARY", false);
assertContains("äbćδe", "bćδ", "UTF8_BINARY", true);
@@ -268,45 +330,181 @@ public void testContains() throws SparkException {
assertContains("ab世De", "AB世dE", "UNICODE_CI", true);
assertContains("äbćδe", "ÄbćδE", "UNICODE_CI", true);
assertContains("äbćδe", "ÄBcΔÉ", "UNICODE_CI", false);
- // Characters with the same binary lowercase representation
assertContains("The Kelvin.", "Kelvin", "UTF8_LCASE", true);
assertContains("The Kelvin.", "Kelvin", "UTF8_LCASE", true);
assertContains("The KKelvin.", "KKelvin", "UTF8_LCASE", true);
assertContains("2 Kelvin.", "2 Kelvin", "UTF8_LCASE", true);
assertContains("2 Kelvin.", "2 Kelvin", "UTF8_LCASE", true);
assertContains("The KKelvin.", "KKelvin,", "UTF8_LCASE", false);
- // Case-variable character length
- assertContains("i̇", "i", "UNICODE_CI", false);
- assertContains("i̇", "\u0307", "UNICODE_CI", false);
- assertContains("i̇", "İ", "UNICODE_CI", true);
+ assertContains("abčćd", "ABCCD", "SR_CI_AI", true);
+ // Case variation.
+ assertContains("aBcDe", "bcd", "UTF8_BINARY", false);
+ assertContains("aBcDe", "BcD", "UTF8_BINARY", true);
+ assertContains("aBcDe", "abcde", "UNICODE", false);
+ assertContains("aBcDe", "aBcDe", "UNICODE", true);
+ assertContains("aBcDe", "bcd", "UTF8_LCASE", true);
+ assertContains("aBcDe", "BCD", "UTF8_LCASE", true);
+ assertContains("aBcDe", "abcde", "UNICODE_CI", true);
+ assertContains("aBcDe", "AbCdE", "UNICODE_CI", true);
+ // Accent variation.
+ assertContains("aBcDe", "bćd", "UTF8_BINARY", false);
+ assertContains("aBcDe", "BćD", "UTF8_BINARY", false);
+ assertContains("aBcDe", "abćde", "UNICODE", false);
+ assertContains("aBcDe", "aBćDe", "UNICODE", false);
+ assertContains("aBcDe", "bćd", "UTF8_LCASE", false);
+ assertContains("aBcDe", "BĆD", "UTF8_LCASE", false);
+ assertContains("aBcDe", "abćde", "UNICODE_CI", false);
+ assertContains("aBcDe", "AbĆdE", "UNICODE_CI", false);
+ assertContains("abEEE", "Bèêë", "AF_CI_AI", true);
+ // One-to-many case mapping (e.g. Turkish dotted I).
+ assertContains("i\u0307", "i", "UNICODE_CI", false);
+ assertContains("i\u0307", "\u0307", "UNICODE_CI", false);
+ assertContains("i\u0307", "İ", "UNICODE_CI", true);
assertContains("İ", "i", "UNICODE_CI", false);
assertContains("adi̇os", "io", "UNICODE_CI", false);
assertContains("adi̇os", "Io", "UNICODE_CI", false);
- assertContains("adi̇os", "i̇o", "UNICODE_CI", true);
+ assertContains("adi̇os", "i\u0307o", "UNICODE_CI", true);
assertContains("adi̇os", "İo", "UNICODE_CI", true);
assertContains("adİos", "io", "UNICODE_CI", false);
assertContains("adİos", "Io", "UNICODE_CI", false);
- assertContains("adİos", "i̇o", "UNICODE_CI", true);
+ assertContains("adİos", "i\u0307o", "UNICODE_CI", true);
assertContains("adİos", "İo", "UNICODE_CI", true);
- assertContains("i̇", "i", "UTF8_LCASE", true); // != UNICODE_CI
+ assertContains("i\u0307", "i", "UTF8_LCASE", true); // != UNICODE_CI
assertContains("İ", "\u0307", "UTF8_LCASE", false);
assertContains("İ", "i", "UTF8_LCASE", false);
- assertContains("i̇", "\u0307", "UTF8_LCASE", true); // != UNICODE_CI
- assertContains("i̇", "İ", "UTF8_LCASE", true);
+ assertContains("i\u0307", "\u0307", "UTF8_LCASE", true); // != UNICODE_CI
+ assertContains("i\u0307", "İ", "UTF8_LCASE", true);
assertContains("İ", "i", "UTF8_LCASE", false);
assertContains("adi̇os", "io", "UTF8_LCASE", false);
assertContains("adi̇os", "Io", "UTF8_LCASE", false);
- assertContains("adi̇os", "i̇o", "UTF8_LCASE", true);
+ assertContains("adi̇os", "i\u0307o", "UTF8_LCASE", true);
assertContains("adi̇os", "İo", "UTF8_LCASE", true);
assertContains("adİos", "io", "UTF8_LCASE", false);
assertContains("adİos", "Io", "UTF8_LCASE", false);
- assertContains("adİos", "i̇o", "UTF8_LCASE", true);
+ assertContains("adİos", "i\u0307o", "UTF8_LCASE", true);
assertContains("adİos", "İo", "UTF8_LCASE", true);
+ // Conditional case mapping (e.g. Greek sigmas).
+ assertContains("σ", "σ", "UTF8_BINARY", true);
+ assertContains("σ", "ς", "UTF8_BINARY", false);
+ assertContains("σ", "Σ", "UTF8_BINARY", false);
+ assertContains("ς", "σ", "UTF8_BINARY", false);
+ assertContains("ς", "ς", "UTF8_BINARY", true);
+ assertContains("ς", "Σ", "UTF8_BINARY", false);
+ assertContains("Σ", "σ", "UTF8_BINARY", false);
+ assertContains("Σ", "ς", "UTF8_BINARY", false);
+ assertContains("Σ", "Σ", "UTF8_BINARY", true);
+ assertContains("σ", "σ", "UTF8_LCASE", true);
+ assertContains("σ", "ς", "UTF8_LCASE", true);
+ assertContains("σ", "Σ", "UTF8_LCASE", true);
+ assertContains("ς", "σ", "UTF8_LCASE", true);
+ assertContains("ς", "ς", "UTF8_LCASE", true);
+ assertContains("ς", "Σ", "UTF8_LCASE", true);
+ assertContains("Σ", "σ", "UTF8_LCASE", true);
+ assertContains("Σ", "ς", "UTF8_LCASE", true);
+ assertContains("Σ", "Σ", "UTF8_LCASE", true);
+ assertContains("σ", "σ", "UNICODE", true);
+ assertContains("σ", "ς", "UNICODE", false);
+ assertContains("σ", "Σ", "UNICODE", false);
+ assertContains("ς", "σ", "UNICODE", false);
+ assertContains("ς", "ς", "UNICODE", true);
+ assertContains("ς", "Σ", "UNICODE", false);
+ assertContains("Σ", "σ", "UNICODE", false);
+ assertContains("Σ", "ς", "UNICODE", false);
+ assertContains("Σ", "Σ", "UNICODE", true);
+ assertContains("σ", "σ", "UNICODE_CI", true);
+ assertContains("σ", "ς", "UNICODE_CI", true);
+ assertContains("σ", "Σ", "UNICODE_CI", true);
+ assertContains("ς", "σ", "UNICODE_CI", true);
+ assertContains("ς", "ς", "UNICODE_CI", true);
+ assertContains("ς", "Σ", "UNICODE_CI", true);
+ assertContains("Σ", "σ", "UNICODE_CI", true);
+ assertContains("Σ", "ς", "UNICODE_CI", true);
+ assertContains("Σ", "Σ", "UNICODE_CI", true);
+ assertContains("ΣΑΛΑΤΑ", "Σ", "UTF8_BINARY", true);
+ assertContains("ΣΑΛΑΤΑ", "σ", "UTF8_BINARY", false);
+ assertContains("ΣΑΛΑΤΑ", "ς", "UTF8_BINARY", false);
+ assertContains("ΘΑΛΑΣΣΙΝΟΣ", "Σ", "UTF8_BINARY", true);
+ assertContains("ΘΑΛΑΣΣΙΝΟΣ", "σ", "UTF8_BINARY", false);
+ assertContains("ΘΑΛΑΣΣΙΝΟΣ", "ς", "UTF8_BINARY", false);
+ assertContains("ΣΑΛΑΤΑ", "Σ", "UTF8_LCASE", true);
+ assertContains("ΣΑΛΑΤΑ", "σ", "UTF8_LCASE", true);
+ assertContains("ΣΑΛΑΤΑ", "ς", "UTF8_LCASE", true);
+ assertContains("ΘΑΛΑΣΣΙΝΟΣ", "Σ", "UTF8_LCASE", true);
+ assertContains("ΘΑΛΑΣΣΙΝΟΣ", "σ", "UTF8_LCASE", true);
+ assertContains("ΘΑΛΑΣΣΙΝΟΣ", "ς", "UTF8_LCASE", true);
+ assertContains("ΣΑΛΑΤΑ", "Σ", "UNICODE", true);
+ assertContains("ΣΑΛΑΤΑ", "σ", "UNICODE", false);
+ assertContains("ΣΑΛΑΤΑ", "ς", "UNICODE", false);
+ assertContains("ΘΑΛΑΣΣΙΝΟΣ", "Σ", "UNICODE", true);
+ assertContains("ΘΑΛΑΣΣΙΝΟΣ", "σ", "UNICODE", false);
+ assertContains("ΘΑΛΑΣΣΙΝΟΣ", "ς", "UNICODE", false);
+ assertContains("ΣΑΛΑΤΑ", "Σ", "UNICODE_CI", true);
+ assertContains("ΣΑΛΑΤΑ", "σ", "UNICODE_CI", true);
+ assertContains("ΣΑΛΑΤΑ", "ς", "UNICODE_CI", true);
+ assertContains("ΘΑΛΑΣΣΙΝΟΣ", "Σ", "UNICODE_CI", true);
+ assertContains("ΘΑΛΑΣΣΙΝΟΣ", "σ", "UNICODE_CI", true);
+ assertContains("ΘΑΛΑΣΣΙΝΟΣ", "ς", "UNICODE_CI", true);
+ // Surrogate pairs.
+ assertContains("a🙃b🙃c", "x", "UTF8_BINARY", false);
+ assertContains("a🙃b🙃c", "x", "UTF8_LCASE", false);
+ assertContains("a🙃b🙃c", "x", "UNICODE", false);
+ assertContains("a🙃b🙃c", "x", "UNICODE_CI", false);
+ assertContains("a🙃b🙃c", "b", "UTF8_BINARY", true);
+ assertContains("a🙃b🙃c", "b", "UTF8_LCASE", true);
+ assertContains("a🙃b🙃c", "b", "UNICODE", true);
+ assertContains("a🙃b🙃c", "b", "UNICODE_CI", true);
+ assertContains("a🙃b🙃c", "a🙃b", "UTF8_BINARY", true);
+ assertContains("a🙃b🙃c", "a🙃b", "UTF8_LCASE", true);
+ assertContains("a🙃b🙃c", "a🙃b", "UNICODE", true);
+ assertContains("a🙃b🙃c", "a🙃b", "UNICODE_CI", true);
+ assertContains("a🙃b🙃c", "b🙃c", "UTF8_BINARY", true);
+ assertContains("a🙃b🙃c", "b🙃c", "UTF8_LCASE", true);
+ assertContains("a🙃b🙃c", "b🙃c", "UNICODE", true);
+ assertContains("a🙃b🙃c", "b🙃c", "UNICODE_CI", true);
+ assertContains("a🙃b🙃c", "a🙃b🙃c", "UTF8_BINARY", true);
+ assertContains("a🙃b🙃c", "a🙃b🙃c", "UTF8_LCASE", true);
+ assertContains("a🙃b🙃c", "a🙃b🙃c", "UNICODE", true);
+ assertContains("a🙃b🙃c", "a🙃b🙃c", "UNICODE_CI", true);
+ assertContains("😀😆😃😄", "😄😆", "UTF8_BINARY", false);
+ assertContains("😀😆😃😄", "😄😆", "UTF8_LCASE", false);
+ assertContains("😀😆😃😄", "😄😆", "UNICODE", false);
+ assertContains("😀😆😃😄", "😄😆", "UNICODE_CI", false);
+ assertContains("😀😆😃😄", "😆😃", "UTF8_BINARY", true);
+ assertContains("😀😆😃😄", "😆😃", "UTF8_LCASE", true);
+ assertContains("😀😆😃😄", "😆😃", "UNICODE", true);
+ assertContains("😀😆😃😄", "😆😃", "UNICODE_CI", true);
+ assertContains("😀😆😃😄", "😀😆", "UTF8_BINARY", true);
+ assertContains("😀😆😃😄", "😀😆", "UTF8_LCASE", true);
+ assertContains("😀😆😃😄", "😀😆", "UNICODE", true);
+ assertContains("😀😆😃😄", "😀😆", "UNICODE_CI", true);
+ assertContains("😀😆😃😄", "😃😄", "UTF8_BINARY", true);
+ assertContains("😀😆😃😄", "😃😄", "UTF8_LCASE", true);
+ assertContains("😀😆😃😄", "😃😄", "UNICODE", true);
+ assertContains("😀😆😃😄", "😃😄", "UNICODE_CI", true);
+ assertContains("😀😆😃😄", "😀😆😃😄", "UTF8_BINARY", true);
+ assertContains("😀😆😃😄", "😀😆😃😄", "UTF8_LCASE", true);
+ assertContains("😀😆😃😄", "😀😆😃😄", "UNICODE", true);
+ assertContains("😀😆😃😄", "😀😆😃😄", "UNICODE_CI", true);
+ assertContains("𐐅", "𐐅", "UTF8_BINARY", true);
+ assertContains("𐐅", "𐐅", "UTF8_LCASE", true);
+ assertContains("𐐅", "𐐅", "UNICODE", true);
+ assertContains("𐐅", "𐐅", "UNICODE_CI", true);
+ assertContains("𐐅", "𐐭", "UTF8_BINARY", false);
+ assertContains("𐐅", "𐐭", "UTF8_LCASE", true);
+ assertContains("𐐅", "𐐭", "UNICODE", false);
+ assertContains("𐐅", "𐐭", "UNICODE_CI", true);
+ assertContains("𝔸", "𝔸", "UTF8_BINARY", true);
+ assertContains("𝔸", "𝔸", "UTF8_LCASE", true);
+ assertContains("𝔸", "𝔸", "UNICODE", true);
+ assertContains("𝔸", "𝔸", "UNICODE_CI", true);
}
- private void assertStartsWith(
- String pattern, String prefix, String collationName, boolean expected)
- throws SparkException {
+ /**
+ * Verify the behaviour of the `StartsWith` collation support class.
+ */
+
+ private void assertStartsWith(String pattern, String prefix, String collationName,
+ boolean expected) throws SparkException {
UTF8String l = UTF8String.fromString(pattern);
UTF8String r = UTF8String.fromString(prefix);
int collationId = CollationFactory.collationNameToId(collationName);
@@ -315,20 +513,42 @@ private void assertStartsWith(
@Test
public void testStartsWith() throws SparkException {
- // Edge cases
- assertStartsWith("", "", "UTF8_BINARY", true);
- assertStartsWith("c", "", "UTF8_BINARY", true);
- assertStartsWith("", "c", "UTF8_BINARY", false);
- assertStartsWith("", "", "UNICODE", true);
- assertStartsWith("c", "", "UNICODE", true);
- assertStartsWith("", "c", "UNICODE", false);
- assertStartsWith("", "", "UTF8_LCASE", true);
- assertStartsWith("c", "", "UTF8_LCASE", true);
- assertStartsWith("", "c", "UTF8_LCASE", false);
- assertStartsWith("", "", "UNICODE_CI", true);
- assertStartsWith("c", "", "UNICODE_CI", true);
- assertStartsWith("", "c", "UNICODE_CI", false);
- // Basic tests
+ for (String collationName: testSupportedCollations) {
+ // Empty strings.
+ assertStartsWith("", "", collationName, true);
+ assertStartsWith("a", "", collationName, true);
+ assertStartsWith("", "x", collationName, false);
+ // Basic tests.
+ assertStartsWith("a", "a", collationName, true);
+ assertStartsWith("_a_", "_a", collationName, true);
+ assertStartsWith("_a_", "a", collationName, false);
+ assertStartsWith("%a%", "%a", collationName, true);
+ assertStartsWith("%a%", "a", collationName, false);
+ assertStartsWith("*a*", "*a", collationName, true);
+ assertStartsWith("*a*", "a", collationName, false);
+ assertStartsWith("?a?", "?a", collationName, true);
+ assertStartsWith("?a?", "a", collationName, false);
+ assertStartsWith("/a/", "/a", collationName, true);
+ assertStartsWith("/a/", "a", collationName, false);
+ assertStartsWith("abcde", "xyz", collationName, false);
+ assertStartsWith("abcde", "bcd", collationName, false);
+ assertStartsWith("abcde", "abc", collationName, true);
+ assertStartsWith("abcde", "cde", collationName, false);
+ assertStartsWith("abcde", "abcde", collationName, true);
+ assertStartsWith("你好", "x", collationName, false);
+ assertStartsWith("你好", "你", collationName, true);
+ assertStartsWith("你好", "好", collationName, false);
+ assertStartsWith("你好", "你好", collationName, true);
+ assertStartsWith("Γειά", "x", collationName, false);
+ assertStartsWith("Γειά", "ειά", collationName, false);
+ assertStartsWith("Γειά", "Γει", collationName, true);
+ assertStartsWith("Γειά", "Γειά", collationName, true);
+ assertStartsWith("Здраво", "x", collationName, false);
+ assertStartsWith("Здраво", "драво", collationName, false);
+ assertStartsWith("Здраво", "Здрав", collationName, true);
+ assertStartsWith("Здраво", "Здраво", collationName, true);
+ }
+ // Advanced tests.
assertStartsWith("abcde", "abc", "UTF8_BINARY", true);
assertStartsWith("abcde", "abd", "UTF8_BINARY", false);
assertStartsWith("abcde", "fgh", "UTF8_BINARY", false);
@@ -342,25 +562,6 @@ public void testStartsWith() throws SparkException {
assertStartsWith("abcde", "aBC", "UNICODE_CI", true);
assertStartsWith("abcde", "bcd", "UNICODE_CI", false);
assertStartsWith("abcde", "123", "UNICODE_CI", false);
- // Case variation
- assertStartsWith("aBcDe", "abc", "UTF8_BINARY", false);
- assertStartsWith("aBcDe", "aBc", "UTF8_BINARY", true);
- assertStartsWith("aBcDe", "abcde", "UNICODE", false);
- assertStartsWith("aBcDe", "aBcDe", "UNICODE", true);
- assertStartsWith("aBcDe", "abc", "UTF8_LCASE", true);
- assertStartsWith("aBcDe", "ABC", "UTF8_LCASE", true);
- assertStartsWith("aBcDe", "abcde", "UNICODE_CI", true);
- assertStartsWith("aBcDe", "AbCdE", "UNICODE_CI", true);
- // Accent variation
- assertStartsWith("aBcDe", "abć", "UTF8_BINARY", false);
- assertStartsWith("aBcDe", "aBć", "UTF8_BINARY", false);
- assertStartsWith("aBcDe", "abćde", "UNICODE", false);
- assertStartsWith("aBcDe", "aBćDe", "UNICODE", false);
- assertStartsWith("aBcDe", "abć", "UTF8_LCASE", false);
- assertStartsWith("aBcDe", "ABĆ", "UTF8_LCASE", false);
- assertStartsWith("aBcDe", "abćde", "UNICODE_CI", false);
- assertStartsWith("aBcDe", "AbĆdE", "UNICODE_CI", false);
- // Variable byte length characters
assertStartsWith("ab世De", "ab世", "UTF8_BINARY", true);
assertStartsWith("ab世De", "aB世", "UTF8_BINARY", false);
assertStartsWith("äbćδe", "äbć", "UTF8_BINARY", true);
@@ -377,16 +578,38 @@ public void testStartsWith() throws SparkException {
assertStartsWith("ab世De", "AB世dE", "UNICODE_CI", true);
assertStartsWith("äbćδe", "ÄbćδE", "UNICODE_CI", true);
assertStartsWith("äbćδe", "ÄBcΔÉ", "UNICODE_CI", false);
- // Characters with the same binary lowercase representation
assertStartsWith("Kelvin.", "Kelvin", "UTF8_LCASE", true);
assertStartsWith("Kelvin.", "Kelvin", "UTF8_LCASE", true);
assertStartsWith("KKelvin.", "KKelvin", "UTF8_LCASE", true);
assertStartsWith("2 Kelvin.", "2 Kelvin", "UTF8_LCASE", true);
assertStartsWith("2 Kelvin.", "2 Kelvin", "UTF8_LCASE", true);
assertStartsWith("KKelvin.", "KKelvin,", "UTF8_LCASE", false);
- // Case-variable character length
- assertStartsWith("i̇", "i", "UNICODE_CI", false);
- assertStartsWith("i̇", "İ", "UNICODE_CI", true);
+ assertStartsWith("Ћао", "Ца", "sr_Cyrl_CI_AI", false);
+ assertStartsWith("Ћао", "ћа", "sr_Cyrl_CI_AI", true);
+ assertStartsWith("Ćao", "Ca", "SR_CI", false);
+ assertStartsWith("Ćao", "Ca", "SR_CI_AI", true);
+ assertStartsWith("Ćao", "Ća", "SR", true);
+ // Case variation.
+ assertStartsWith("aBcDe", "abc", "UTF8_BINARY", false);
+ assertStartsWith("aBcDe", "aBc", "UTF8_BINARY", true);
+ assertStartsWith("aBcDe", "abcde", "UNICODE", false);
+ assertStartsWith("aBcDe", "aBcDe", "UNICODE", true);
+ assertStartsWith("aBcDe", "abc", "UTF8_LCASE", true);
+ assertStartsWith("aBcDe", "ABC", "UTF8_LCASE", true);
+ assertStartsWith("aBcDe", "abcde", "UNICODE_CI", true);
+ assertStartsWith("aBcDe", "AbCdE", "UNICODE_CI", true);
+ // Accent variation.
+ assertStartsWith("aBcDe", "abć", "UTF8_BINARY", false);
+ assertStartsWith("aBcDe", "aBć", "UTF8_BINARY", false);
+ assertStartsWith("aBcDe", "abćde", "UNICODE", false);
+ assertStartsWith("aBcDe", "aBćDe", "UNICODE", false);
+ assertStartsWith("aBcDe", "abć", "UTF8_LCASE", false);
+ assertStartsWith("aBcDe", "ABĆ", "UTF8_LCASE", false);
+ assertStartsWith("aBcDe", "abćde", "UNICODE_CI", false);
+ assertStartsWith("aBcDe", "AbĆdE", "UNICODE_CI", false);
+ // One-to-many case mapping (e.g. Turkish dotted I).
+ assertStartsWith("i\u0307", "i", "UNICODE_CI", false);
+ assertStartsWith("i\u0307", "İ", "UNICODE_CI", true);
assertStartsWith("İ", "i", "UNICODE_CI", false);
assertStartsWith("İİİ", "i̇i̇", "UNICODE_CI", true);
assertStartsWith("İİİ", "i̇i", "UNICODE_CI", false);
@@ -394,14 +617,14 @@ public void testStartsWith() throws SparkException {
assertStartsWith("i̇İi̇i̇", "İi̇İi", "UNICODE_CI", false);
assertStartsWith("i̇onic", "io", "UNICODE_CI", false);
assertStartsWith("i̇onic", "Io", "UNICODE_CI", false);
- assertStartsWith("i̇onic", "i̇o", "UNICODE_CI", true);
+ assertStartsWith("i̇onic", "i\u0307o", "UNICODE_CI", true);
assertStartsWith("i̇onic", "İo", "UNICODE_CI", true);
assertStartsWith("İonic", "io", "UNICODE_CI", false);
assertStartsWith("İonic", "Io", "UNICODE_CI", false);
- assertStartsWith("İonic", "i̇o", "UNICODE_CI", true);
+ assertStartsWith("İonic", "i\u0307o", "UNICODE_CI", true);
assertStartsWith("İonic", "İo", "UNICODE_CI", true);
- assertStartsWith("i̇", "i", "UTF8_LCASE", true); // != UNICODE_CI
- assertStartsWith("i̇", "İ", "UTF8_LCASE", true);
+ assertStartsWith("i\u0307", "i", "UTF8_LCASE", true); // != UNICODE_CI
+ assertStartsWith("i\u0307", "İ", "UTF8_LCASE", true);
assertStartsWith("İ", "i", "UTF8_LCASE", false);
assertStartsWith("İİİ", "i̇i̇", "UTF8_LCASE", true);
assertStartsWith("İİİ", "i̇i", "UTF8_LCASE", false);
@@ -409,16 +632,136 @@ public void testStartsWith() throws SparkException {
assertStartsWith("i̇İi̇i̇", "İi̇İi", "UTF8_LCASE", true); // != UNICODE_CI
assertStartsWith("i̇onic", "io", "UTF8_LCASE", false);
assertStartsWith("i̇onic", "Io", "UTF8_LCASE", false);
- assertStartsWith("i̇onic", "i̇o", "UTF8_LCASE", true);
+ assertStartsWith("i̇onic", "i\u0307o", "UTF8_LCASE", true);
assertStartsWith("i̇onic", "İo", "UTF8_LCASE", true);
assertStartsWith("İonic", "io", "UTF8_LCASE", false);
assertStartsWith("İonic", "Io", "UTF8_LCASE", false);
- assertStartsWith("İonic", "i̇o", "UTF8_LCASE", true);
+ assertStartsWith("İonic", "i\u0307o", "UTF8_LCASE", true);
assertStartsWith("İonic", "İo", "UTF8_LCASE", true);
+ assertStartsWith("oİ", "oİ", "UTF8_LCASE", true);
+ assertStartsWith("oİ", "oi̇", "UTF8_LCASE", true);
+ // Conditional case mapping (e.g. Greek sigmas).
+ assertStartsWith("σ", "σ", "UTF8_BINARY", true);
+ assertStartsWith("σ", "ς", "UTF8_BINARY", false);
+ assertStartsWith("σ", "Σ", "UTF8_BINARY", false);
+ assertStartsWith("ς", "σ", "UTF8_BINARY", false);
+ assertStartsWith("ς", "ς", "UTF8_BINARY", true);
+ assertStartsWith("ς", "Σ", "UTF8_BINARY", false);
+ assertStartsWith("Σ", "σ", "UTF8_BINARY", false);
+ assertStartsWith("Σ", "ς", "UTF8_BINARY", false);
+ assertStartsWith("Σ", "Σ", "UTF8_BINARY", true);
+ assertStartsWith("σ", "σ", "UTF8_LCASE", true);
+ assertStartsWith("σ", "ς", "UTF8_LCASE", true);
+ assertStartsWith("σ", "Σ", "UTF8_LCASE", true);
+ assertStartsWith("ς", "σ", "UTF8_LCASE", true);
+ assertStartsWith("ς", "ς", "UTF8_LCASE", true);
+ assertStartsWith("ς", "Σ", "UTF8_LCASE", true);
+ assertStartsWith("Σ", "σ", "UTF8_LCASE", true);
+ assertStartsWith("Σ", "ς", "UTF8_LCASE", true);
+ assertStartsWith("Σ", "Σ", "UTF8_LCASE", true);
+ assertStartsWith("σ", "σ", "UNICODE", true);
+ assertStartsWith("σ", "ς", "UNICODE", false);
+ assertStartsWith("σ", "Σ", "UNICODE", false);
+ assertStartsWith("ς", "σ", "UNICODE", false);
+ assertStartsWith("ς", "ς", "UNICODE", true);
+ assertStartsWith("ς", "Σ", "UNICODE", false);
+ assertStartsWith("Σ", "σ", "UNICODE", false);
+ assertStartsWith("Σ", "ς", "UNICODE", false);
+ assertStartsWith("Σ", "Σ", "UNICODE", true);
+ assertStartsWith("σ", "σ", "UNICODE_CI", true);
+ assertStartsWith("σ", "ς", "UNICODE_CI", true);
+ assertStartsWith("σ", "Σ", "UNICODE_CI", true);
+ assertStartsWith("ς", "σ", "UNICODE_CI", true);
+ assertStartsWith("ς", "ς", "UNICODE_CI", true);
+ assertStartsWith("ς", "Σ", "UNICODE_CI", true);
+ assertStartsWith("Σ", "σ", "UNICODE_CI", true);
+ assertStartsWith("Σ", "ς", "UNICODE_CI", true);
+ assertStartsWith("Σ", "Σ", "UNICODE_CI", true);
+ assertStartsWith("ΣΑΛΑΤΑ", "Σ", "UTF8_BINARY", true);
+ assertStartsWith("ΣΑΛΑΤΑ", "σ", "UTF8_BINARY", false);
+ assertStartsWith("ΣΑΛΑΤΑ", "ς", "UTF8_BINARY", false);
+ assertStartsWith("ΘΑΛΑΣΣΙΝΟΣ", "Σ", "UTF8_BINARY", false);
+ assertStartsWith("ΘΑΛΑΣΣΙΝΟΣ", "σ", "UTF8_BINARY", false);
+ assertStartsWith("ΘΑΛΑΣΣΙΝΟΣ", "ς", "UTF8_BINARY", false);
+ assertStartsWith("ΣΑΛΑΤΑ", "Σ", "UTF8_LCASE", true);
+ assertStartsWith("ΣΑΛΑΤΑ", "σ", "UTF8_LCASE", true);
+ assertStartsWith("ΣΑΛΑΤΑ", "ς", "UTF8_LCASE", true);
+ assertStartsWith("ΘΑΛΑΣΣΙΝΟΣ", "Σ", "UTF8_LCASE", false);
+ assertStartsWith("ΘΑΛΑΣΣΙΝΟΣ", "σ", "UTF8_LCASE", false);
+ assertStartsWith("ΘΑΛΑΣΣΙΝΟΣ", "ς", "UTF8_LCASE", false);
+ assertStartsWith("ΣΑΛΑΤΑ", "Σ", "UNICODE", true);
+ assertStartsWith("ΣΑΛΑΤΑ", "σ", "UNICODE", false);
+ assertStartsWith("ΣΑΛΑΤΑ", "ς", "UNICODE", false);
+ assertStartsWith("ΘΑΛΑΣΣΙΝΟΣ", "Σ", "UNICODE", false);
+ assertStartsWith("ΘΑΛΑΣΣΙΝΟΣ", "σ", "UNICODE", false);
+ assertStartsWith("ΘΑΛΑΣΣΙΝΟΣ", "ς", "UNICODE", false);
+ assertStartsWith("ΣΑΛΑΤΑ", "Σ", "UNICODE_CI", true);
+ assertStartsWith("ΣΑΛΑΤΑ", "σ", "UNICODE_CI", true);
+ assertStartsWith("ΣΑΛΑΤΑ", "ς", "UNICODE_CI", true);
+ assertStartsWith("ΘΑΛΑΣΣΙΝΟΣ", "Σ", "UNICODE_CI", false);
+ assertStartsWith("ΘΑΛΑΣΣΙΝΟΣ", "σ", "UNICODE_CI", false);
+ assertStartsWith("ΘΑΛΑΣΣΙΝΟΣ", "ς", "UNICODE_CI", false);
+ // Surrogate pairs.
+ assertStartsWith("a🙃b🙃c", "x", "UTF8_BINARY", false);
+ assertStartsWith("a🙃b🙃c", "x", "UTF8_LCASE", false);
+ assertStartsWith("a🙃b🙃c", "x", "UNICODE", false);
+ assertStartsWith("a🙃b🙃c", "x", "UNICODE_CI", false);
+ assertStartsWith("a🙃b🙃c", "b", "UTF8_BINARY", false);
+ assertStartsWith("a🙃b🙃c", "b", "UTF8_LCASE", false);
+ assertStartsWith("a🙃b🙃c", "b", "UNICODE", false);
+ assertStartsWith("a🙃b🙃c", "b", "UNICODE_CI", false);
+ assertStartsWith("a🙃b🙃c", "a🙃b", "UTF8_BINARY", true);
+ assertStartsWith("a🙃b🙃c", "a🙃b", "UTF8_LCASE", true);
+ assertStartsWith("a🙃b🙃c", "a🙃b", "UNICODE", true);
+ assertStartsWith("a🙃b🙃c", "a🙃b", "UNICODE_CI", true);
+ assertStartsWith("a🙃b🙃c", "b🙃c", "UTF8_BINARY", false);
+ assertStartsWith("a🙃b🙃c", "b🙃c", "UTF8_LCASE", false);
+ assertStartsWith("a🙃b🙃c", "b🙃c", "UNICODE", false);
+ assertStartsWith("a🙃b🙃c", "b🙃c", "UNICODE_CI", false);
+ assertStartsWith("a🙃b🙃c", "a🙃b🙃c", "UTF8_BINARY", true);
+ assertStartsWith("a🙃b🙃c", "a🙃b🙃c", "UTF8_LCASE", true);
+ assertStartsWith("a🙃b🙃c", "a🙃b🙃c", "UNICODE", true);
+ assertStartsWith("a🙃b🙃c", "a🙃b🙃c", "UNICODE_CI", true);
+ assertStartsWith("😀😆😃😄", "😄😆", "UTF8_BINARY", false);
+ assertStartsWith("😀😆😃😄", "😄😆", "UTF8_LCASE", false);
+ assertStartsWith("😀😆😃😄", "😄😆", "UNICODE", false);
+ assertStartsWith("😀😆😃😄", "😄😆", "UNICODE_CI", false);
+ assertStartsWith("😀😆😃😄", "😆😃", "UTF8_BINARY", false);
+ assertStartsWith("😀😆😃😄", "😆😃", "UTF8_LCASE", false);
+ assertStartsWith("😀😆😃😄", "😆😃", "UNICODE", false);
+ assertStartsWith("😀😆😃😄", "😆😃", "UNICODE_CI", false);
+ assertStartsWith("😀😆😃😄", "😀😆", "UTF8_BINARY", true);
+ assertStartsWith("😀😆😃😄", "😀😆", "UTF8_LCASE", true);
+ assertStartsWith("😀😆😃😄", "😀😆", "UNICODE", true);
+ assertStartsWith("😀😆😃😄", "😀😆", "UNICODE_CI", true);
+ assertStartsWith("😀😆😃😄", "😃😄", "UTF8_BINARY", false);
+ assertStartsWith("😀😆😃😄", "😃😄", "UTF8_LCASE", false);
+ assertStartsWith("😀😆😃😄", "😃😄", "UNICODE", false);
+ assertStartsWith("😀😆😃😄", "😃😄", "UNICODE_CI", false);
+ assertStartsWith("😀😆😃😄", "😀😆😃😄", "UTF8_BINARY", true);
+ assertStartsWith("😀😆😃😄", "😀😆😃😄", "UTF8_LCASE", true);
+ assertStartsWith("😀😆😃😄", "😀😆😃😄", "UNICODE", true);
+ assertStartsWith("😀😆😃😄", "😀😆😃😄", "UNICODE_CI", true);
+ assertStartsWith("𐐅", "𐐅", "UTF8_BINARY", true);
+ assertStartsWith("𐐅", "𐐅", "UTF8_LCASE", true);
+ assertStartsWith("𐐅", "𐐅", "UNICODE", true);
+ assertStartsWith("𐐅", "𐐅", "UNICODE_CI", true);
+ assertStartsWith("𐐅", "𐐭", "UTF8_BINARY", false);
+ assertStartsWith("𐐅", "𐐭", "UTF8_LCASE", true);
+ assertStartsWith("𐐅", "𐐭", "UNICODE", false);
+ assertStartsWith("𐐅", "𐐭", "UNICODE_CI", true);
+ assertStartsWith("𝔸", "𝔸", "UTF8_BINARY", true);
+ assertStartsWith("𝔸", "𝔸", "UTF8_LCASE", true);
+ assertStartsWith("𝔸", "𝔸", "UNICODE", true);
+ assertStartsWith("𝔸", "𝔸", "UNICODE_CI", true);
}
- private void assertEndsWith(String pattern, String suffix, String collationName, boolean expected)
- throws SparkException {
+ /**
+ * Verify the behaviour of the `EndsWith` collation support class.
+ */
+
+ private void assertEndsWith(String pattern, String suffix, String collationName,
+ boolean expected) throws SparkException {
UTF8String l = UTF8String.fromString(pattern);
UTF8String r = UTF8String.fromString(suffix);
int collationId = CollationFactory.collationNameToId(collationName);
@@ -427,20 +770,42 @@ private void assertEndsWith(String pattern, String suffix, String collationName,
@Test
public void testEndsWith() throws SparkException {
- // Edge cases
- assertEndsWith("", "", "UTF8_BINARY", true);
- assertEndsWith("c", "", "UTF8_BINARY", true);
- assertEndsWith("", "c", "UTF8_BINARY", false);
- assertEndsWith("", "", "UNICODE", true);
- assertEndsWith("c", "", "UNICODE", true);
- assertEndsWith("", "c", "UNICODE", false);
- assertEndsWith("", "", "UTF8_LCASE", true);
- assertEndsWith("c", "", "UTF8_LCASE", true);
- assertEndsWith("", "c", "UTF8_LCASE", false);
- assertEndsWith("", "", "UNICODE_CI", true);
- assertEndsWith("c", "", "UNICODE_CI", true);
- assertEndsWith("", "c", "UNICODE_CI", false);
- // Basic tests
+ for (String collationName: testSupportedCollations) {
+ // Empty strings.
+ assertEndsWith("", "", collationName, true);
+ assertEndsWith("a", "", collationName, true);
+ assertEndsWith("", "x", collationName, false);
+ // Basic tests.
+ assertEndsWith("a", "a", collationName, true);
+ assertEndsWith("_a_", "a_", collationName, true);
+ assertEndsWith("_a_", "a", collationName, false);
+ assertEndsWith("%a%", "a%", collationName, true);
+ assertEndsWith("%a%", "a", collationName, false);
+ assertEndsWith("*a*", "a*", collationName, true);
+ assertEndsWith("*a*", "a", collationName, false);
+ assertEndsWith("?a?", "a?", collationName, true);
+ assertEndsWith("?a?", "a", collationName, false);
+ assertEndsWith("/a/", "a/", collationName, true);
+ assertEndsWith("/a/", "a", collationName, false);
+ assertEndsWith("abcde", "xyz", collationName, false);
+ assertEndsWith("abcde", "bcd", collationName, false);
+ assertEndsWith("abcde", "abc", collationName, false);
+ assertEndsWith("abcde", "cde", collationName, true);
+ assertEndsWith("abcde", "abcde", collationName, true);
+ assertEndsWith("你好", "x", collationName, false);
+ assertEndsWith("你好", "你", collationName, false);
+ assertEndsWith("你好", "好", collationName, true);
+ assertEndsWith("你好", "你好", collationName, true);
+ assertEndsWith("Γειά", "x", collationName, false);
+ assertEndsWith("Γειά", "ειά", collationName, true);
+ assertEndsWith("Γειά", "Γει", collationName, false);
+ assertEndsWith("Γειά", "Γειά", collationName, true);
+ assertEndsWith("Здраво", "x", collationName, false);
+ assertEndsWith("Здраво", "драво", collationName, true);
+ assertEndsWith("Здраво", "Здрав", collationName, false);
+ assertEndsWith("Здраво", "Здраво", collationName, true);
+ }
+ // Advanced tests.
assertEndsWith("abcde", "cde", "UTF8_BINARY", true);
assertEndsWith("abcde", "bde", "UTF8_BINARY", false);
assertEndsWith("abcde", "fgh", "UTF8_BINARY", false);
@@ -454,25 +819,6 @@ public void testEndsWith() throws SparkException {
assertEndsWith("abcde", "CDe", "UNICODE_CI", true);
assertEndsWith("abcde", "bcd", "UNICODE_CI", false);
assertEndsWith("abcde", "123", "UNICODE_CI", false);
- // Case variation
- assertEndsWith("aBcDe", "cde", "UTF8_BINARY", false);
- assertEndsWith("aBcDe", "cDe", "UTF8_BINARY", true);
- assertEndsWith("aBcDe", "abcde", "UNICODE", false);
- assertEndsWith("aBcDe", "aBcDe", "UNICODE", true);
- assertEndsWith("aBcDe", "cde", "UTF8_LCASE", true);
- assertEndsWith("aBcDe", "CDE", "UTF8_LCASE", true);
- assertEndsWith("aBcDe", "abcde", "UNICODE_CI", true);
- assertEndsWith("aBcDe", "AbCdE", "UNICODE_CI", true);
- // Accent variation
- assertEndsWith("aBcDe", "ćde", "UTF8_BINARY", false);
- assertEndsWith("aBcDe", "ćDe", "UTF8_BINARY", false);
- assertEndsWith("aBcDe", "abćde", "UNICODE", false);
- assertEndsWith("aBcDe", "aBćDe", "UNICODE", false);
- assertEndsWith("aBcDe", "ćde", "UTF8_LCASE", false);
- assertEndsWith("aBcDe", "ĆDE", "UTF8_LCASE", false);
- assertEndsWith("aBcDe", "abćde", "UNICODE_CI", false);
- assertEndsWith("aBcDe", "AbĆdE", "UNICODE_CI", false);
- // Variable byte length characters
assertEndsWith("ab世De", "世De", "UTF8_BINARY", true);
assertEndsWith("ab世De", "世dE", "UTF8_BINARY", false);
assertEndsWith("äbćδe", "ćδe", "UTF8_BINARY", true);
@@ -489,53 +835,196 @@ public void testEndsWith() throws SparkException {
assertEndsWith("ab世De", "AB世dE", "UNICODE_CI", true);
assertEndsWith("äbćδe", "ÄbćδE", "UNICODE_CI", true);
assertEndsWith("äbćδe", "ÄBcΔÉ", "UNICODE_CI", false);
- // Characters with the same binary lowercase representation
assertEndsWith("The Kelvin", "Kelvin", "UTF8_LCASE", true);
assertEndsWith("The Kelvin", "Kelvin", "UTF8_LCASE", true);
assertEndsWith("The KKelvin", "KKelvin", "UTF8_LCASE", true);
assertEndsWith("The 2 Kelvin", "2 Kelvin", "UTF8_LCASE", true);
assertEndsWith("The 2 Kelvin", "2 Kelvin", "UTF8_LCASE", true);
assertEndsWith("The KKelvin", "KKelvin,", "UTF8_LCASE", false);
- // Case-variable character length
- assertEndsWith("i̇", "\u0307", "UNICODE_CI", false);
- assertEndsWith("i̇", "İ", "UNICODE_CI", true);
+ assertEndsWith("Ћевапчићи", "цици", "sr_Cyrl_CI_AI", false);
+ assertEndsWith("Ћевапчићи", "чИЋи", "sr_Cyrl_CI_AI", true);
+ assertEndsWith("Ćevapčići", "cici", "SR_CI", false);
+ assertEndsWith("Ćevapčići", "cici", "SR_CI_AI", true);
+ assertEndsWith("Ćevapčići", "čići", "SR", true);
+ // Case variation.
+ assertEndsWith("aBcDe", "cde", "UTF8_BINARY", false);
+ assertEndsWith("aBcDe", "cDe", "UTF8_BINARY", true);
+ assertEndsWith("aBcDe", "abcde", "UNICODE", false);
+ assertEndsWith("aBcDe", "aBcDe", "UNICODE", true);
+ assertEndsWith("aBcDe", "cde", "UTF8_LCASE", true);
+ assertEndsWith("aBcDe", "CDE", "UTF8_LCASE", true);
+ assertEndsWith("aBcDe", "abcde", "UNICODE_CI", true);
+ assertEndsWith("aBcDe", "AbCdE", "UNICODE_CI", true);
+ // Accent variation.
+ assertEndsWith("aBcDe", "ćde", "UTF8_BINARY", false);
+ assertEndsWith("aBcDe", "ćDe", "UTF8_BINARY", false);
+ assertEndsWith("aBcDe", "abćde", "UNICODE", false);
+ assertEndsWith("aBcDe", "aBćDe", "UNICODE", false);
+ assertEndsWith("aBcDe", "ćde", "UTF8_LCASE", false);
+ assertEndsWith("aBcDe", "ĆDE", "UTF8_LCASE", false);
+ assertEndsWith("aBcDe", "abćde", "UNICODE_CI", false);
+ assertEndsWith("aBcDe", "AbĆdE", "UNICODE_CI", false);
+ // One-to-many case mapping (e.g. Turkish dotted I).
+ assertEndsWith("i\u0307", "\u0307", "UNICODE_CI", false);
+ assertEndsWith("i\u0307", "İ", "UNICODE_CI", true);
assertEndsWith("İ", "i", "UNICODE_CI", false);
assertEndsWith("İİİ", "i̇i̇", "UNICODE_CI", true);
assertEndsWith("İİİ", "ii̇", "UNICODE_CI", false);
assertEndsWith("İi̇İ", "İi̇", "UNICODE_CI", true);
assertEndsWith("i̇İi̇i̇", "\u0307İi̇İ", "UNICODE_CI", false);
- assertEndsWith("the i̇o", "io", "UNICODE_CI", false);
- assertEndsWith("the i̇o", "Io", "UNICODE_CI", false);
- assertEndsWith("the i̇o", "i̇o", "UNICODE_CI", true);
- assertEndsWith("the i̇o", "İo", "UNICODE_CI", true);
+ assertEndsWith("the i\u0307o", "io", "UNICODE_CI", false);
+ assertEndsWith("the i\u0307o", "Io", "UNICODE_CI", false);
+ assertEndsWith("the i\u0307o", "i\u0307o", "UNICODE_CI", true);
+ assertEndsWith("the i\u0307o", "İo", "UNICODE_CI", true);
assertEndsWith("the İo", "io", "UNICODE_CI", false);
assertEndsWith("the İo", "Io", "UNICODE_CI", false);
- assertEndsWith("the İo", "i̇o", "UNICODE_CI", true);
+ assertEndsWith("the İo", "i\u0307o", "UNICODE_CI", true);
assertEndsWith("the İo", "İo", "UNICODE_CI", true);
- assertEndsWith("i̇", "\u0307", "UTF8_LCASE", true); // != UNICODE_CI
- assertEndsWith("i̇", "İ", "UTF8_LCASE", true);
+ assertEndsWith("i\u0307", "\u0307", "UTF8_LCASE", true); // != UNICODE_CI
+ assertEndsWith("i\u0307", "İ", "UTF8_LCASE", true);
assertEndsWith("İ", "\u0307", "UTF8_LCASE", false);
assertEndsWith("İİİ", "i̇i̇", "UTF8_LCASE", true);
assertEndsWith("İİİ", "ii̇", "UTF8_LCASE", false);
assertEndsWith("İi̇İ", "İi̇", "UTF8_LCASE", true);
assertEndsWith("i̇İi̇i̇", "\u0307İi̇İ", "UTF8_LCASE", true); // != UNICODE_CI
assertEndsWith("i̇İi̇i̇", "\u0307İİ", "UTF8_LCASE", false);
- assertEndsWith("the i̇o", "io", "UTF8_LCASE", false);
- assertEndsWith("the i̇o", "Io", "UTF8_LCASE", false);
- assertEndsWith("the i̇o", "i̇o", "UTF8_LCASE", true);
- assertEndsWith("the i̇o", "İo", "UTF8_LCASE", true);
+ assertEndsWith("the i\u0307o", "io", "UTF8_LCASE", false);
+ assertEndsWith("the i\u0307o", "Io", "UTF8_LCASE", false);
+ assertEndsWith("the i\u0307o", "i\u0307o", "UTF8_LCASE", true);
+ assertEndsWith("the i\u0307o", "İo", "UTF8_LCASE", true);
assertEndsWith("the İo", "io", "UTF8_LCASE", false);
assertEndsWith("the İo", "Io", "UTF8_LCASE", false);
- assertEndsWith("the İo", "i̇o", "UTF8_LCASE", true);
+ assertEndsWith("the İo", "i\u0307o", "UTF8_LCASE", true);
assertEndsWith("the İo", "İo", "UTF8_LCASE", true);
+ assertEndsWith("İo", "İo", "UTF8_LCASE", true);
+ assertEndsWith("İo", "i̇o", "UTF8_LCASE", true);
+ // Conditional case mapping (e.g. Greek sigmas).
+ assertEndsWith("σ", "σ", "UTF8_BINARY", true);
+ assertEndsWith("σ", "ς", "UTF8_BINARY", false);
+ assertEndsWith("σ", "Σ", "UTF8_BINARY", false);
+ assertEndsWith("ς", "σ", "UTF8_BINARY", false);
+ assertEndsWith("ς", "ς", "UTF8_BINARY", true);
+ assertEndsWith("ς", "Σ", "UTF8_BINARY", false);
+ assertEndsWith("Σ", "σ", "UTF8_BINARY", false);
+ assertEndsWith("Σ", "ς", "UTF8_BINARY", false);
+ assertEndsWith("Σ", "Σ", "UTF8_BINARY", true);
+ assertEndsWith("σ", "σ", "UTF8_LCASE", true);
+ assertEndsWith("σ", "ς", "UTF8_LCASE", true);
+ assertEndsWith("σ", "Σ", "UTF8_LCASE", true);
+ assertEndsWith("ς", "σ", "UTF8_LCASE", true);
+ assertEndsWith("ς", "ς", "UTF8_LCASE", true);
+ assertEndsWith("ς", "Σ", "UTF8_LCASE", true);
+ assertEndsWith("Σ", "σ", "UTF8_LCASE", true);
+ assertEndsWith("Σ", "ς", "UTF8_LCASE", true);
+ assertEndsWith("Σ", "Σ", "UTF8_LCASE", true);
+ assertEndsWith("σ", "σ", "UNICODE", true);
+ assertEndsWith("σ", "ς", "UNICODE", false);
+ assertEndsWith("σ", "Σ", "UNICODE", false);
+ assertEndsWith("ς", "σ", "UNICODE", false);
+ assertEndsWith("ς", "ς", "UNICODE", true);
+ assertEndsWith("ς", "Σ", "UNICODE", false);
+ assertEndsWith("Σ", "σ", "UNICODE", false);
+ assertEndsWith("Σ", "ς", "UNICODE", false);
+ assertEndsWith("Σ", "Σ", "UNICODE", true);
+ assertEndsWith("σ", "σ", "UNICODE_CI", true);
+ assertEndsWith("σ", "ς", "UNICODE_CI", true);
+ assertEndsWith("σ", "Σ", "UNICODE_CI", true);
+ assertEndsWith("ς", "σ", "UNICODE_CI", true);
+ assertEndsWith("ς", "ς", "UNICODE_CI", true);
+ assertEndsWith("ς", "Σ", "UNICODE_CI", true);
+ assertEndsWith("Σ", "σ", "UNICODE_CI", true);
+ assertEndsWith("Σ", "ς", "UNICODE_CI", true);
+ assertEndsWith("Σ", "Σ", "UNICODE_CI", true);
+ assertEndsWith("ΣΑΛΑΤΑ", "Σ", "UTF8_BINARY", false);
+ assertEndsWith("ΣΑΛΑΤΑ", "σ", "UTF8_BINARY", false);
+ assertEndsWith("ΣΑΛΑΤΑ", "ς", "UTF8_BINARY", false);
+ assertEndsWith("ΘΑΛΑΣΣΙΝΟΣ", "Σ", "UTF8_BINARY", true);
+ assertEndsWith("ΘΑΛΑΣΣΙΝΟΣ", "σ", "UTF8_BINARY", false);
+ assertEndsWith("ΘΑΛΑΣΣΙΝΟΣ", "ς", "UTF8_BINARY", false);
+ assertEndsWith("ΣΑΛΑΤΑ", "Σ", "UTF8_LCASE", false);
+ assertEndsWith("ΣΑΛΑΤΑ", "σ", "UTF8_LCASE", false);
+ assertEndsWith("ΣΑΛΑΤΑ", "ς", "UTF8_LCASE", false);
+ assertEndsWith("ΘΑΛΑΣΣΙΝΟΣ", "Σ", "UTF8_LCASE", true);
+ assertEndsWith("ΘΑΛΑΣΣΙΝΟΣ", "σ", "UTF8_LCASE", true);
+ assertEndsWith("ΘΑΛΑΣΣΙΝΟΣ", "ς", "UTF8_LCASE", true);
+ assertEndsWith("ΣΑΛΑΤΑ", "Σ", "UNICODE", false);
+ assertEndsWith("ΣΑΛΑΤΑ", "σ", "UNICODE", false);
+ assertEndsWith("ΣΑΛΑΤΑ", "ς", "UNICODE", false);
+ assertEndsWith("ΘΑΛΑΣΣΙΝΟΣ", "Σ", "UNICODE", true);
+ assertEndsWith("ΘΑΛΑΣΣΙΝΟΣ", "σ", "UNICODE", false);
+ assertEndsWith("ΘΑΛΑΣΣΙΝΟΣ", "ς", "UNICODE", false);
+ assertEndsWith("ΣΑΛΑΤΑ", "Σ", "UNICODE_CI", false);
+ assertEndsWith("ΣΑΛΑΤΑ", "σ", "UNICODE_CI", false);
+ assertEndsWith("ΣΑΛΑΤΑ", "ς", "UNICODE_CI", false);
+ assertEndsWith("ΘΑΛΑΣΣΙΝΟΣ", "Σ", "UNICODE_CI", true);
+ assertEndsWith("ΘΑΛΑΣΣΙΝΟΣ", "σ", "UNICODE_CI", true);
+ assertEndsWith("ΘΑΛΑΣΣΙΝΟΣ", "ς", "UNICODE_CI", true);
+ // Surrogate pairs.
+ assertEndsWith("a🙃b🙃c", "x", "UTF8_BINARY", false);
+ assertEndsWith("a🙃b🙃c", "x", "UTF8_LCASE", false);
+ assertEndsWith("a🙃b🙃c", "x", "UNICODE", false);
+ assertEndsWith("a🙃b🙃c", "x", "UNICODE_CI", false);
+ assertEndsWith("a🙃b🙃c", "b", "UTF8_BINARY", false);
+ assertEndsWith("a🙃b🙃c", "b", "UTF8_LCASE", false);
+ assertEndsWith("a🙃b🙃c", "b", "UNICODE", false);
+ assertEndsWith("a🙃b🙃c", "b", "UNICODE_CI", false);
+ assertEndsWith("a🙃b🙃c", "a🙃b", "UTF8_BINARY", false);
+ assertEndsWith("a🙃b🙃c", "a🙃b", "UTF8_LCASE", false);
+ assertEndsWith("a🙃b🙃c", "a🙃b", "UNICODE", false);
+ assertEndsWith("a🙃b🙃c", "a🙃b", "UNICODE_CI", false);
+ assertEndsWith("a🙃b🙃c", "b🙃c", "UTF8_BINARY", true);
+ assertEndsWith("a🙃b🙃c", "b🙃c", "UTF8_LCASE", true);
+ assertEndsWith("a🙃b🙃c", "b🙃c", "UNICODE", true);
+ assertEndsWith("a🙃b🙃c", "b🙃c", "UNICODE_CI", true);
+ assertEndsWith("a🙃b🙃c", "a🙃b🙃c", "UTF8_BINARY", true);
+ assertEndsWith("a🙃b🙃c", "a🙃b🙃c", "UTF8_LCASE", true);
+ assertEndsWith("a🙃b🙃c", "a🙃b🙃c", "UNICODE", true);
+ assertEndsWith("a🙃b🙃c", "a🙃b🙃c", "UNICODE_CI", true);
+ assertEndsWith("😀😆😃😄", "😄😆", "UTF8_BINARY", false);
+ assertEndsWith("😀😆😃😄", "😄😆", "UTF8_LCASE", false);
+ assertEndsWith("😀😆😃😄", "😄😆", "UNICODE", false);
+ assertEndsWith("😀😆😃😄", "😄😆", "UNICODE_CI", false);
+ assertEndsWith("😀😆😃😄", "😆😃", "UTF8_BINARY", false);
+ assertEndsWith("😀😆😃😄", "😆😃", "UTF8_LCASE", false);
+ assertEndsWith("😀😆😃😄", "😆😃", "UNICODE", false);
+ assertEndsWith("😀😆😃😄", "😆😃", "UNICODE_CI", false);
+ assertEndsWith("😀😆😃😄", "😀😆", "UTF8_BINARY", false);
+ assertEndsWith("😀😆😃😄", "😀😆", "UTF8_LCASE", false);
+ assertEndsWith("😀😆😃😄", "😀😆", "UNICODE", false);
+ assertEndsWith("😀😆😃😄", "😀😆", "UNICODE_CI", false);
+ assertEndsWith("😀😆😃😄", "😃😄", "UTF8_BINARY", true);
+ assertEndsWith("😀😆😃😄", "😃😄", "UTF8_LCASE", true);
+ assertEndsWith("😀😆😃😄", "😃😄", "UNICODE", true);
+ assertEndsWith("😀😆😃😄", "😃😄", "UNICODE_CI", true);
+ assertEndsWith("😀😆😃😄", "😀😆😃😄", "UTF8_BINARY", true);
+ assertEndsWith("😀😆😃😄", "😀😆😃😄", "UTF8_LCASE", true);
+ assertEndsWith("😀😆😃😄", "😀😆😃😄", "UNICODE", true);
+ assertEndsWith("😀😆😃😄", "😀😆😃😄", "UNICODE_CI", true);
+ assertEndsWith("𐐅", "𐐅", "UTF8_BINARY", true);
+ assertEndsWith("𐐅", "𐐅", "UTF8_LCASE", true);
+ assertEndsWith("𐐅", "𐐅", "UNICODE", true);
+ assertEndsWith("𐐅", "𐐅", "UNICODE_CI", true);
+ assertEndsWith("𐐅", "𐐭", "UTF8_BINARY", false);
+ assertEndsWith("𐐅", "𐐭", "UTF8_LCASE", true);
+ assertEndsWith("𐐅", "𐐭", "UNICODE", false);
+ assertEndsWith("𐐅", "𐐭", "UNICODE_CI", true);
+ assertEndsWith("𝔸", "𝔸", "UTF8_BINARY", true);
+ assertEndsWith("𝔸", "𝔸", "UTF8_LCASE", true);
+ assertEndsWith("𝔸", "𝔸", "UNICODE", true);
+ assertEndsWith("𝔸", "𝔸", "UNICODE_CI", true);
}
+ /**
+ * Verify the behaviour of the `StringSplitSQL` collation support class.
+ */
+
private void assertStringSplitSQL(String str, String delimiter, String collationName,
UTF8String[] expected) throws SparkException {
UTF8String s = UTF8String.fromString(str);
UTF8String d = UTF8String.fromString(delimiter);
int collationId = CollationFactory.collationNameToId(collationName);
- assertArrayEquals(expected, CollationSupport.StringSplitSQL.exec(s, d, collationId));
+ UTF8String[] result = CollationSupport.StringSplitSQL.exec(s, d, collationId);
+ assertArrayEquals(expected, result);
}
@Test
@@ -553,7 +1042,21 @@ public void testStringSplitSQL() throws SparkException {
var array_A_B = new UTF8String[] { UTF8String.fromString("A"), UTF8String.fromString("B") };
var array_a_e = new UTF8String[] { UTF8String.fromString("ä"), UTF8String.fromString("e") };
var array_Aa_bB = new UTF8String[] { UTF8String.fromString("Aa"), UTF8String.fromString("bB") };
- // Edge cases
+ var array_Turkish_uppercase_dotted_I = new UTF8String[] { UTF8String.fromString("İ") };
+ var array_Turkish_lowercase_dotted_i = new UTF8String[] { UTF8String.fromString("i\u0307") };
+ var array_i = new UTF8String[] { UTF8String.fromString("i"), UTF8String.fromString("") };
+ var array_dot = new UTF8String[] { UTF8String.fromString(""), UTF8String.fromString("\u0307") };
+ var array_AiB = new UTF8String[] { UTF8String.fromString("Ai\u0307B") };
+ var array_AIB = new UTF8String[] { UTF8String.fromString("AİB") };
+ var array_small_nonfinal_sigma = new UTF8String[] { UTF8String.fromString("σ") };
+ var array_small_final_sigma = new UTF8String[] { UTF8String.fromString("ς") };
+ var array_capital_sigma = new UTF8String[] { UTF8String.fromString("Σ") };
+ var array_a_b_c = new UTF8String[] { UTF8String.fromString("a"), UTF8String.fromString("b"),
+ UTF8String.fromString("c") };
+ var array_emojis = new UTF8String[] { UTF8String.fromString("😀"), UTF8String.fromString("😄") };
+ var array_AOB = new UTF8String[] { UTF8String.fromString("A𐐅B") };
+ var array_AoB = new UTF8String[] { UTF8String.fromString("A𐐭B") };
+ // Empty strings.
assertStringSplitSQL("", "", "UTF8_BINARY", empty_match);
assertStringSplitSQL("abc", "", "UTF8_BINARY", array_abc);
assertStringSplitSQL("", "abc", "UTF8_BINARY", empty_match);
@@ -566,7 +1069,7 @@ public void testStringSplitSQL() throws SparkException {
assertStringSplitSQL("", "", "UNICODE_CI", empty_match);
assertStringSplitSQL("abc", "", "UNICODE_CI", array_abc);
assertStringSplitSQL("", "abc", "UNICODE_CI", empty_match);
- // Basic tests
+ // Basic tests.
assertStringSplitSQL("1a2", "a", "UTF8_BINARY", array_1_2);
assertStringSplitSQL("1a2", "A", "UTF8_BINARY", array_1a2);
assertStringSplitSQL("1a2", "b", "UTF8_BINARY", array_1a2);
@@ -580,25 +1083,7 @@ public void testStringSplitSQL() throws SparkException {
assertStringSplitSQL("1a2", "A", "UNICODE_CI", array_1_2);
assertStringSplitSQL("1a2", "1A2", "UNICODE_CI", full_match);
assertStringSplitSQL("1a2", "123", "UNICODE_CI", array_1a2);
- // Case variation
- assertStringSplitSQL("AaXbB", "x", "UTF8_BINARY", array_AaXbB);
- assertStringSplitSQL("AaXbB", "X", "UTF8_BINARY", array_Aa_bB);
- assertStringSplitSQL("AaXbB", "axb", "UNICODE", array_AaXbB);
- assertStringSplitSQL("AaXbB", "aXb", "UNICODE", array_A_B);
- assertStringSplitSQL("AaXbB", "axb", "UTF8_LCASE", array_A_B);
- assertStringSplitSQL("AaXbB", "AXB", "UTF8_LCASE", array_A_B);
- assertStringSplitSQL("AaXbB", "axb", "UNICODE_CI", array_A_B);
- assertStringSplitSQL("AaXbB", "AxB", "UNICODE_CI", array_A_B);
- // Accent variation
- assertStringSplitSQL("aBcDe", "bćd", "UTF8_BINARY", array_aBcDe);
- assertStringSplitSQL("aBcDe", "BćD", "UTF8_BINARY", array_aBcDe);
- assertStringSplitSQL("aBcDe", "abćde", "UNICODE", array_aBcDe);
- assertStringSplitSQL("aBcDe", "aBćDe", "UNICODE", array_aBcDe);
- assertStringSplitSQL("aBcDe", "bćd", "UTF8_LCASE", array_aBcDe);
- assertStringSplitSQL("aBcDe", "BĆD", "UTF8_LCASE", array_aBcDe);
- assertStringSplitSQL("aBcDe", "abćde", "UNICODE_CI", array_aBcDe);
- assertStringSplitSQL("aBcDe", "AbĆdE", "UNICODE_CI", array_aBcDe);
- // Variable byte length characters
+ // Advanced tests.
assertStringSplitSQL("äb世De", "b世D", "UTF8_BINARY", array_a_e);
assertStringSplitSQL("äb世De", "B世d", "UTF8_BINARY", array_special);
assertStringSplitSQL("äbćδe", "bćδ", "UTF8_BINARY", array_a_e);
@@ -615,10 +1100,123 @@ public void testStringSplitSQL() throws SparkException {
assertStringSplitSQL("äb世De", "AB世dE", "UNICODE_CI", array_special);
assertStringSplitSQL("äbćδe", "ÄbćδE", "UNICODE_CI", full_match);
assertStringSplitSQL("äbćδe", "ÄBcΔÉ", "UNICODE_CI", array_abcde);
+ // Case variation.
+ assertStringSplitSQL("AaXbB", "x", "UTF8_BINARY", array_AaXbB);
+ assertStringSplitSQL("AaXbB", "X", "UTF8_BINARY", array_Aa_bB);
+ assertStringSplitSQL("AaXbB", "axb", "UNICODE", array_AaXbB);
+ assertStringSplitSQL("AaXbB", "aXb", "UNICODE", array_A_B);
+ assertStringSplitSQL("AaXbB", "axb", "UTF8_LCASE", array_A_B);
+ assertStringSplitSQL("AaXbB", "AXB", "UTF8_LCASE", array_A_B);
+ assertStringSplitSQL("AaXbB", "axb", "UNICODE_CI", array_A_B);
+ assertStringSplitSQL("AaXbB", "AxB", "UNICODE_CI", array_A_B);
+ // Accent variation.
+ assertStringSplitSQL("aBcDe", "bćd", "UTF8_BINARY", array_aBcDe);
+ assertStringSplitSQL("aBcDe", "BćD", "UTF8_BINARY", array_aBcDe);
+ assertStringSplitSQL("aBcDe", "abćde", "UNICODE", array_aBcDe);
+ assertStringSplitSQL("aBcDe", "aBćDe", "UNICODE", array_aBcDe);
+ assertStringSplitSQL("aBcDe", "bćd", "UTF8_LCASE", array_aBcDe);
+ assertStringSplitSQL("aBcDe", "BĆD", "UTF8_LCASE", array_aBcDe);
+ assertStringSplitSQL("aBcDe", "abćde", "UNICODE_CI", array_aBcDe);
+ assertStringSplitSQL("aBcDe", "AbĆdE", "UNICODE_CI", array_aBcDe);
+ // One-to-many case mapping (e.g. Turkish dotted I).
+ assertStringSplitSQL("İ", "i", "UTF8_BINARY", array_Turkish_uppercase_dotted_I);
+ assertStringSplitSQL("İ", "i", "UTF8_LCASE", array_Turkish_uppercase_dotted_I);
+ assertStringSplitSQL("İ", "i", "UNICODE", array_Turkish_uppercase_dotted_I);
+ assertStringSplitSQL("İ", "i", "UNICODE_CI", array_Turkish_uppercase_dotted_I);
+ assertStringSplitSQL("İ", "\u0307", "UTF8_BINARY", array_Turkish_uppercase_dotted_I);
+ assertStringSplitSQL("İ", "\u0307", "UTF8_LCASE", array_Turkish_uppercase_dotted_I);
+ assertStringSplitSQL("İ", "\u0307", "UNICODE", array_Turkish_uppercase_dotted_I);
+ assertStringSplitSQL("İ", "\u0307", "UNICODE_CI", array_Turkish_uppercase_dotted_I);
+ assertStringSplitSQL("i\u0307", "i", "UTF8_BINARY", array_dot);
+ assertStringSplitSQL("i\u0307", "i", "UTF8_LCASE", array_dot);
+ assertStringSplitSQL("i\u0307", "i", "UNICODE", array_Turkish_lowercase_dotted_i);
+ assertStringSplitSQL("i\u0307", "i", "UNICODE_CI", array_Turkish_lowercase_dotted_i);
+ assertStringSplitSQL("i\u0307", "\u0307", "UTF8_BINARY", array_i);
+ assertStringSplitSQL("i\u0307", "\u0307", "UTF8_LCASE", array_i);
+ assertStringSplitSQL("i\u0307", "\u0307", "UNICODE", array_Turkish_lowercase_dotted_i);
+ assertStringSplitSQL("i\u0307", "\u0307", "UNICODE_CI", array_Turkish_lowercase_dotted_i);
+ assertStringSplitSQL("AİB", "İ", "UTF8_BINARY", array_A_B);
+ assertStringSplitSQL("AİB", "İ", "UTF8_LCASE", array_A_B);
+ assertStringSplitSQL("AİB", "İ", "UNICODE", array_A_B);
+ assertStringSplitSQL("AİB", "İ", "UNICODE_CI", array_A_B);
+ assertStringSplitSQL("AİB", "i\u0307", "UTF8_BINARY", array_AIB);
+ assertStringSplitSQL("AİB", "i\u0307", "UTF8_LCASE", array_A_B);
+ assertStringSplitSQL("AİB", "i\u0307", "UNICODE", array_AIB);
+ assertStringSplitSQL("AİB", "i\u0307", "UNICODE_CI", array_A_B);
+ assertStringSplitSQL("Ai\u0307B", "İ", "UTF8_BINARY", array_AiB);
+ assertStringSplitSQL("Ai\u0307B", "İ", "UTF8_LCASE", array_A_B);
+ assertStringSplitSQL("Ai\u0307B", "İ", "UNICODE", array_AiB);
+ assertStringSplitSQL("Ai\u0307B", "İ", "UNICODE_CI", array_A_B);
+ assertStringSplitSQL("Ai\u0307B", "i\u0307", "UTF8_BINARY", array_A_B);
+ assertStringSplitSQL("Ai\u0307B", "i\u0307", "UTF8_LCASE", array_A_B);
+ assertStringSplitSQL("Ai\u0307B", "i\u0307", "UNICODE", array_A_B);
+ assertStringSplitSQL("Ai\u0307B", "i\u0307", "UNICODE_CI", array_A_B);
+ // Conditional case mapping (e.g. Greek sigmas).
+ assertStringSplitSQL("σ", "σ", "UTF8_BINARY", full_match);
+ assertStringSplitSQL("σ", "σ", "UTF8_LCASE", full_match);
+ assertStringSplitSQL("σ", "σ", "UNICODE", full_match);
+ assertStringSplitSQL("σ", "σ", "UNICODE_CI", full_match);
+ assertStringSplitSQL("σ", "ς", "UTF8_BINARY", array_small_nonfinal_sigma);
+ assertStringSplitSQL("σ", "ς", "UTF8_LCASE", full_match);
+ assertStringSplitSQL("σ", "ς", "UNICODE", array_small_nonfinal_sigma);
+ assertStringSplitSQL("σ", "ς", "UNICODE_CI", full_match);
+ assertStringSplitSQL("σ", "Σ", "UTF8_BINARY", array_small_nonfinal_sigma);
+ assertStringSplitSQL("σ", "Σ", "UTF8_LCASE", full_match);
+ assertStringSplitSQL("σ", "Σ", "UNICODE", array_small_nonfinal_sigma);
+ assertStringSplitSQL("σ", "Σ", "UNICODE_CI", full_match);
+ assertStringSplitSQL("ς", "σ", "UTF8_BINARY", array_small_final_sigma);
+ assertStringSplitSQL("ς", "σ", "UTF8_LCASE", full_match);
+ assertStringSplitSQL("ς", "σ", "UNICODE", array_small_final_sigma);
+ assertStringSplitSQL("ς", "σ", "UNICODE_CI", full_match);
+ assertStringSplitSQL("ς", "ς", "UTF8_BINARY", full_match);
+ assertStringSplitSQL("ς", "ς", "UTF8_LCASE", full_match);
+ assertStringSplitSQL("ς", "ς", "UNICODE", full_match);
+ assertStringSplitSQL("ς", "ς", "UNICODE_CI", full_match);
+ assertStringSplitSQL("ς", "Σ", "UTF8_BINARY", array_small_final_sigma);
+ assertStringSplitSQL("ς", "Σ", "UTF8_LCASE", full_match);
+ assertStringSplitSQL("ς", "Σ", "UNICODE", array_small_final_sigma);
+ assertStringSplitSQL("ς", "Σ", "UNICODE_CI", full_match);
+ assertStringSplitSQL("Σ", "σ", "UTF8_BINARY", array_capital_sigma);
+ assertStringSplitSQL("Σ", "σ", "UTF8_LCASE", full_match);
+ assertStringSplitSQL("Σ", "σ", "UNICODE", array_capital_sigma);
+ assertStringSplitSQL("Σ", "σ", "UNICODE_CI", full_match);
+ assertStringSplitSQL("Σ", "ς", "UTF8_BINARY", array_capital_sigma);
+ assertStringSplitSQL("Σ", "ς", "UTF8_LCASE", full_match);
+ assertStringSplitSQL("Σ", "ς", "UNICODE", array_capital_sigma);
+ assertStringSplitSQL("Σ", "ς", "UNICODE_CI", full_match);
+ assertStringSplitSQL("Σ", "Σ", "UTF8_BINARY", full_match);
+ assertStringSplitSQL("Σ", "Σ", "UTF8_LCASE", full_match);
+ assertStringSplitSQL("Σ", "Σ", "UNICODE", full_match);
+ assertStringSplitSQL("Σ", "Σ", "UNICODE_CI", full_match);
+ // Surrogate pairs.
+ assertStringSplitSQL("a🙃b🙃c", "🙃", "UTF8_BINARY", array_a_b_c);
+ assertStringSplitSQL("a🙃b🙃c", "🙃", "UTF8_LCASE", array_a_b_c);
+ assertStringSplitSQL("a🙃b🙃c", "🙃", "UNICODE", array_a_b_c);
+ assertStringSplitSQL("a🙃b🙃c", "🙃", "UNICODE_CI", array_a_b_c);
+ assertStringSplitSQL("😀😆😃😄", "😆😃", "UTF8_BINARY", array_emojis);
+ assertStringSplitSQL("😀😆😃😄", "😆😃", "UTF8_LCASE", array_emojis);
+ assertStringSplitSQL("😀😆😃😄", "😆😃", "UNICODE", array_emojis);
+ assertStringSplitSQL("😀😆😃😄", "😆😃", "UNICODE_CI", array_emojis);
+ assertStringSplitSQL("A𐐅B", "𐐅", "UTF8_BINARY", array_A_B);
+ assertStringSplitSQL("A𐐅B", "𐐅", "UTF8_LCASE", array_A_B);
+ assertStringSplitSQL("A𐐅B", "𐐅", "UNICODE", array_A_B);
+ assertStringSplitSQL("A𐐅B", "𐐅", "UNICODE_CI", array_A_B);
+ assertStringSplitSQL("A𐐅B", "𐐭", "UTF8_BINARY", array_AOB);
+ assertStringSplitSQL("A𐐅B", "𐐭", "UTF8_LCASE", array_A_B);
+ assertStringSplitSQL("A𐐅B", "𐐭", "UNICODE", array_AOB);
+ assertStringSplitSQL("A𐐅B", "𐐭", "UNICODE_CI", array_A_B);
+ assertStringSplitSQL("A𐐭B", "𐐅", "UTF8_BINARY", array_AoB);
+ assertStringSplitSQL("A𐐭B", "𐐅", "UTF8_LCASE", array_A_B);
+ assertStringSplitSQL("A𐐭B", "𐐅", "UNICODE", array_AoB);
+ assertStringSplitSQL("A𐐭B", "𐐅", "UNICODE_CI", array_A_B);
}
+ /**
+ * Verify the behaviour of the `Upper` collation support class.
+ */
+
private void assertUpper(String target, String collationName, String expected)
- throws SparkException {
+ throws SparkException {
UTF8String target_utf8 = UTF8String.fromString(target);
UTF8String expected_utf8 = UTF8String.fromString(expected);
int collationId = CollationFactory.collationNameToId(collationName);
@@ -631,52 +1229,57 @@ private void assertUpper(String target, String collationName, String expected)
@Test
public void testUpper() throws SparkException {
- // Edge cases
- assertUpper("", "UTF8_BINARY", "");
- assertUpper("", "UTF8_LCASE", "");
- assertUpper("", "UNICODE", "");
- assertUpper("", "UNICODE_CI", "");
- // Basic tests
- assertUpper("abcde", "UTF8_BINARY", "ABCDE");
- assertUpper("abcde", "UTF8_LCASE", "ABCDE");
- assertUpper("abcde", "UNICODE", "ABCDE");
- assertUpper("abcde", "UNICODE_CI", "ABCDE");
- // Uppercase present
- assertUpper("AbCdE", "UTF8_BINARY", "ABCDE");
- assertUpper("aBcDe", "UTF8_BINARY", "ABCDE");
- assertUpper("AbCdE", "UTF8_LCASE", "ABCDE");
- assertUpper("aBcDe", "UTF8_LCASE", "ABCDE");
- assertUpper("AbCdE", "UNICODE", "ABCDE");
- assertUpper("aBcDe", "UNICODE", "ABCDE");
- assertUpper("AbCdE", "UNICODE_CI", "ABCDE");
- assertUpper("aBcDe", "UNICODE_CI", "ABCDE");
- // Accent letters
- assertUpper("aBćDe","UTF8_BINARY", "ABĆDE");
- assertUpper("aBćDe","UTF8_LCASE", "ABĆDE");
- assertUpper("aBćDe","UNICODE", "ABĆDE");
- assertUpper("aBćDe","UNICODE_CI", "ABĆDE");
- // Variable byte length characters
- assertUpper("ab世De", "UTF8_BINARY", "AB世DE");
- assertUpper("äbćδe", "UTF8_BINARY", "ÄBĆΔE");
- assertUpper("ab世De", "UTF8_LCASE", "AB世DE");
- assertUpper("äbćδe", "UTF8_LCASE", "ÄBĆΔE");
- assertUpper("ab世De", "UNICODE", "AB世DE");
- assertUpper("äbćδe", "UNICODE", "ÄBĆΔE");
- assertUpper("ab世De", "UNICODE_CI", "AB世DE");
- assertUpper("äbćδe", "UNICODE_CI", "ÄBĆΔE");
- // Case-variable character length
- assertUpper("i\u0307o", "UTF8_BINARY","I\u0307O");
- assertUpper("i\u0307o", "UTF8_LCASE","I\u0307O");
- assertUpper("i\u0307o", "UNICODE","I\u0307O");
- assertUpper("i\u0307o", "UNICODE_CI","I\u0307O");
- assertUpper("ß fi ffi ff st ῗ", "UTF8_BINARY","SS FI FFI FF ST \u0399\u0308\u0342");
- assertUpper("ß fi ffi ff st ῗ", "UTF8_LCASE","SS FI FFI FF ST \u0399\u0308\u0342");
- assertUpper("ß fi ffi ff st ῗ", "UNICODE","SS FI FFI FF ST \u0399\u0308\u0342");
- assertUpper("ß fi ffi ff st ῗ", "UNICODE","SS FI FFI FF ST \u0399\u0308\u0342");
+ for (String collationName: testSupportedCollations) {
+ // Empty strings.
+ assertUpper("", collationName, "");
+ // Basic tests.
+ assertUpper("abcde", collationName, "ABCDE");
+ assertUpper("AbCdE", collationName, "ABCDE");
+ assertUpper("aBcDe", collationName, "ABCDE");
+ assertUpper("ABCDE", collationName, "ABCDE");
+ // Advanced tests.
+ assertUpper("aBćDe", collationName, "ABĆDE");
+ assertUpper("ab世De", collationName, "AB世DE");
+ assertUpper("äbćδe", collationName, "ÄBĆΔE");
+ assertUpper("AbĆdE", collationName, "ABĆDE");
+ assertUpper("aB世De", collationName, "AB世DE");
+ assertUpper("ÄBĆΔE", collationName, "ÄBĆΔE");
+ // One-to-many case mapping (e.g. Turkish dotted I).
+ assertUpper("İ", collationName, "İ");
+ assertUpper("i\u0307", collationName,"I\u0307");
+ assertUpper("İonic", collationName, "İONIC");
+ assertUpper("i\u0307onic", collationName,"I\u0307ONIC");
+ assertUpper("FIDELİO", collationName, "FIDELİO");
+ // Conditional case mapping (e.g. Greek sigmas).
+ assertUpper("σ", collationName, "Σ");
+ assertUpper("σ", collationName, "Σ");
+ assertUpper("ς", collationName, "Σ");
+ assertUpper("Σ", collationName, "Σ");
+ assertUpper("ΣΑΛΑΤΑ", collationName, "ΣΑΛΑΤΑ");
+ assertUpper("σαλατα", collationName, "ΣΑΛΑΤΑ");
+ assertUpper("ςαλατα", collationName, "ΣΑΛΑΤΑ");
+ assertUpper("ΘΑΛΑΣΣΙΝΟΣ", collationName, "ΘΑΛΑΣΣΙΝΟΣ");
+ assertUpper("θαλασσινοσ", collationName, "ΘΑΛΑΣΣΙΝΟΣ");
+ assertUpper("θαλασσινος", collationName, "ΘΑΛΑΣΣΙΝΟΣ");
+ // Surrogate pairs.
+ assertUpper("a🙃B🙃c", collationName, "A🙃B🙃C");
+ assertUpper("😄 😆", collationName, "😄 😆");
+ assertUpper("😀😆😃😄", collationName, "😀😆😃😄");
+ assertUpper("𝔸", collationName, "𝔸");
+ assertUpper("𐐅", collationName, "𐐅");
+ assertUpper("𐐭", collationName, "𐐅");
+ assertUpper("𐐭𝔸", collationName, "𐐅𝔸");
+ // Ligatures.
+ assertUpper("ß fi ffi ff st ῗ", collationName,"SS FI FFI FF ST \u0399\u0308\u0342");
+ }
}
+ /**
+ * Verify the behaviour of the `Lower` collation support class.
+ */
+
private void assertLower(String target, String collationName, String expected)
- throws SparkException {
+ throws SparkException {
UTF8String target_utf8 = UTF8String.fromString(target);
UTF8String expected_utf8 = UTF8String.fromString(expected);
int collationId = CollationFactory.collationNameToId(collationName);
@@ -689,48 +1292,56 @@ private void assertLower(String target, String collationName, String expected)
@Test
public void testLower() throws SparkException {
- // Edge cases
- assertLower("", "UTF8_BINARY", "");
- assertLower("", "UTF8_LCASE", "");
- assertLower("", "UNICODE", "");
- assertLower("", "UNICODE_CI", "");
- // Basic tests
- assertLower("ABCDE", "UTF8_BINARY", "abcde");
- assertLower("ABCDE", "UTF8_LCASE", "abcde");
- assertLower("ABCDE", "UNICODE", "abcde");
- assertLower("ABCDE", "UNICODE_CI", "abcde");
- // Uppercase present
- assertLower("AbCdE", "UTF8_BINARY", "abcde");
- assertLower("aBcDe", "UTF8_BINARY", "abcde");
- assertLower("AbCdE", "UTF8_LCASE", "abcde");
- assertLower("aBcDe", "UTF8_LCASE", "abcde");
- assertLower("AbCdE", "UNICODE", "abcde");
- assertLower("aBcDe", "UNICODE", "abcde");
- assertLower("AbCdE", "UNICODE_CI", "abcde");
- assertLower("aBcDe", "UNICODE_CI", "abcde");
- // Accent letters
- assertLower("AbĆdE","UTF8_BINARY", "abćde");
- assertLower("AbĆdE","UTF8_LCASE", "abćde");
- assertLower("AbĆdE","UNICODE", "abćde");
- assertLower("AbĆdE","UNICODE_CI", "abćde");
- // Variable byte length characters
- assertLower("aB世De", "UTF8_BINARY", "ab世de");
- assertLower("ÄBĆΔE", "UTF8_BINARY", "äbćδe");
- assertLower("aB世De", "UTF8_LCASE", "ab世de");
- assertLower("ÄBĆΔE", "UTF8_LCASE", "äbćδe");
- assertLower("aB世De", "UNICODE", "ab世de");
- assertLower("ÄBĆΔE", "UNICODE", "äbćδe");
- assertLower("aB世De", "UNICODE_CI", "ab世de");
- assertLower("ÄBĆΔE", "UNICODE_CI", "äbćδe");
- // Case-variable character length
- assertLower("İo", "UTF8_BINARY","i\u0307o");
- assertLower("İo", "UTF8_LCASE","i\u0307o");
- assertLower("İo", "UNICODE","i\u0307o");
- assertLower("İo", "UNICODE_CI","i\u0307o");
+ for (String collationName: testSupportedCollations) {
+ // Empty strings.
+ assertLower("", collationName, "");
+ // Basic tests.
+ assertLower("abcde", collationName, "abcde");
+ assertLower("AbCdE", collationName, "abcde");
+ assertLower("aBcDe", collationName, "abcde");
+ assertLower("ABCDE", collationName, "abcde");
+ // Advanced tests.
+      assertLower("aBćDe", collationName, "abćde");
+      assertLower("ab世De", collationName, "ab世de");
+      assertLower("äbćδe", collationName, "äbćδe");
+ assertLower("AbĆdE", collationName, "abćde");
+ assertLower("aB世De", collationName, "ab世de");
+ assertLower("ÄBĆΔE", collationName, "äbćδe");
+ // One-to-many case mapping (e.g. Turkish dotted I).
+ assertLower("İ", collationName, "i\u0307");
+ assertLower("I\u0307", collationName,"i\u0307");
+ assertLower("İonic", collationName, "i\u0307onic");
+ assertLower("i\u0307onic", collationName,"i\u0307onic");
+ assertLower("FIDELİO", collationName, "fideli\u0307o");
+ // Conditional case mapping (e.g. Greek sigmas).
+ assertLower("σ", collationName, "σ");
+ assertLower("ς", collationName, "ς");
+ assertLower("Σ", collationName, "σ");
+ assertLower("ΣΑΛΑΤΑ", collationName, "σαλατα");
+ assertLower("σαλατα", collationName, "σαλατα");
+ assertLower("ςαλατα", collationName, "ςαλατα");
+ assertLower("ΘΑΛΑΣΣΙΝΟΣ", collationName, "θαλασσινος");
+ assertLower("θαλασσινοσ", collationName, "θαλασσινοσ");
+ assertLower("θαλασσινος", collationName, "θαλασσινος");
+ // Surrogate pairs.
+ assertLower("a🙃B🙃c", collationName, "a🙃b🙃c");
+ assertLower("😄 😆", collationName, "😄 😆");
+ assertLower("😀😆😃😄", collationName, "😀😆😃😄");
+ assertLower("𝔸", collationName, "𝔸");
+ assertLower("𐐅", collationName, "𐐭");
+ assertLower("𐐭", collationName, "𐐭");
+ assertLower("𐐭𝔸", collationName, "𐐭𝔸");
+ // Ligatures.
+ assertLower("ß fi ffi ff st ῗ", collationName,"ß fi ffi ff st ῗ");
+ }
}
+ /**
+ * Verify the behaviour of the `InitCap` collation support class.
+ */
+
private void assertInitCap(String target, String collationName, String expected)
- throws SparkException {
+ throws SparkException {
UTF8String target_utf8 = UTF8String.fromString(target);
UTF8String expected_utf8 = UTF8String.fromString(expected);
int collationId = CollationFactory.collationNameToId(collationName);
@@ -741,43 +1352,65 @@ private void assertInitCap(String target, String collationName, String expected)
// Note: results should be the same in these tests for both ICU and JVM-based implementations.
}
+ private void assertInitCap(
+ String target,
+ String collationName,
+ String expectedICU,
+ String expectedNonICU) throws SparkException {
+ UTF8String target_utf8 = UTF8String.fromString(target);
+ UTF8String expectedICU_utf8 = UTF8String.fromString(expectedICU);
+ UTF8String expectedNonICU_utf8 = UTF8String.fromString(expectedNonICU);
+ int collationId = CollationFactory.collationNameToId(collationName);
+    // Testing the new ICU-based implementation of the InitCap function.
+ assertEquals(expectedICU_utf8, CollationSupport.InitCap.exec(target_utf8, collationId, true));
+    // Testing the old JVM-based implementation of the InitCap function.
+ assertEquals(expectedNonICU_utf8, CollationSupport.InitCap.exec(target_utf8, collationId,
+ false));
+ // Note: results should be the same in these tests for both ICU and JVM-based implementations.
+ }
+
@Test
public void testInitCap() throws SparkException {
- // Edge cases
- assertInitCap("", "UTF8_BINARY", "");
- assertInitCap("", "UTF8_LCASE", "");
- assertInitCap("", "UNICODE", "");
- assertInitCap("", "UNICODE_CI", "");
- // Basic tests
- assertInitCap("ABCDE", "UTF8_BINARY", "Abcde");
- assertInitCap("ABCDE", "UTF8_LCASE", "Abcde");
- assertInitCap("ABCDE", "UNICODE", "Abcde");
- assertInitCap("ABCDE", "UNICODE_CI", "Abcde");
- // Uppercase present
- assertInitCap("AbCdE", "UTF8_BINARY", "Abcde");
- assertInitCap("aBcDe", "UTF8_BINARY", "Abcde");
- assertInitCap("AbCdE", "UTF8_LCASE", "Abcde");
- assertInitCap("aBcDe", "UTF8_LCASE", "Abcde");
- assertInitCap("AbCdE", "UNICODE", "Abcde");
- assertInitCap("aBcDe", "UNICODE", "Abcde");
- assertInitCap("AbCdE", "UNICODE_CI", "Abcde");
- assertInitCap("aBcDe", "UNICODE_CI", "Abcde");
- // Accent letters
- assertInitCap("AbĆdE", "UTF8_BINARY", "Abćde");
- assertInitCap("AbĆdE", "UTF8_LCASE", "Abćde");
- assertInitCap("AbĆdE", "UNICODE", "Abćde");
- assertInitCap("AbĆdE", "UNICODE_CI", "Abćde");
- // Variable byte length characters
- assertInitCap("aB 世 De", "UTF8_BINARY", "Ab 世 De");
+ for (String collationName: testSupportedCollations) {
+ // Empty strings.
+ assertInitCap("", collationName, "");
+ // Basic tests.
+ assertInitCap("abcde", collationName, "Abcde");
+ assertInitCap("AbCdE", collationName, "Abcde");
+ assertInitCap("aBcDe", collationName, "Abcde");
+ assertInitCap("ABCDE", collationName, "Abcde");
+ // Conditional case mapping (e.g. Greek sigmas).
+ assertInitCap("σ", collationName, "Σ");
+ assertInitCap("ς", collationName, "Σ");
+ assertInitCap("Σ", collationName, "Σ");
+ assertInitCap("ΣΑΛΑΤΑ", collationName, "Σαλατα");
+ assertInitCap("σαλατα", collationName, "Σαλατα");
+ assertInitCap("ςαλατα", collationName, "Σαλατα");
+ assertInitCap("ΘΑΛΑΣΣΙΝΟΣ", collationName, "Θαλασσινος");
+ assertInitCap("θαλασσινοσ", collationName, "Θαλασσινοσ");
+ assertInitCap("θαλασσινος", collationName, "Θαλασσινος");
+ }
+ // Advanced tests.
+ assertInitCap("aBćDe", "UTF8_BINARY", "Abćde");
+ assertInitCap("aBćDe", "UTF8_LCASE", "Abćde");
+ assertInitCap("aBćDe", "UNICODE", "Abćde");
+ assertInitCap("aBćDe", "UNICODE_CI", "Abćde");
+ assertInitCap("ab世De", "UTF8_BINARY", "Ab世de");
+ assertInitCap("ab世De", "UTF8_LCASE", "Ab世De");
+ assertInitCap("ab世De", "UNICODE", "Ab世De");
+ assertInitCap("ab世De", "UNICODE_CI", "Ab世De");
+ assertInitCap("äbćδe", "UTF8_BINARY", "Äbćδe");
+ assertInitCap("äbćδe", "UTF8_LCASE", "Äbćδe");
+ assertInitCap("äbćδe", "UNICODE", "Äbćδe");
+ assertInitCap("äbćδe", "UNICODE_CI", "Äbćδe");
assertInitCap("ÄBĆΔE", "UTF8_BINARY", "Äbćδe");
- assertInitCap("aB 世 De", "UTF8_LCASE", "Ab 世 De");
assertInitCap("ÄBĆΔE", "UTF8_LCASE", "Äbćδe");
- assertInitCap("aB 世 De", "UNICODE", "Ab 世 De");
assertInitCap("ÄBĆΔE", "UNICODE", "Äbćδe");
- assertInitCap("aB 世 de", "UNICODE_CI", "Ab 世 De");
assertInitCap("ÄBĆΔE", "UNICODE_CI", "Äbćδe");
+ assertInitCap("êéfgh", "AF_CI_AI", "Êéfgh");
+ assertInitCap("öoAÄ", "DE_CI_AI", "Öoaä");
// Case-variable character length
- assertInitCap("İo", "UTF8_BINARY", "I\u0307o");
+ assertInitCap("İo", "UTF8_BINARY", "İo", "I\u0307o");
assertInitCap("İo", "UTF8_LCASE", "İo");
assertInitCap("İo", "UNICODE", "İo");
assertInitCap("İo", "UNICODE_CI", "İo");
@@ -786,6 +1419,67 @@ public void testInitCap() throws SparkException {
assertInitCap("i\u0307o", "UNICODE", "I\u0307o");
assertInitCap("i\u0307o", "UNICODE_CI", "I\u0307o");
// Different possible word boundaries
+ assertInitCap("aB 世 de", "UTF8_BINARY", "Ab 世 De");
+ assertInitCap("aB 世 de", "UTF8_LCASE", "Ab 世 De");
+ assertInitCap("aB 世 de", "UNICODE", "Ab 世 De");
+ assertInitCap("aB 世 de", "UNICODE_CI", "Ab 世 De");
+ // One-to-many case mapping (e.g. Turkish dotted I).
+ assertInitCap("İ", "UTF8_BINARY", "İ", "I\u0307");
+ assertInitCap("İ", "UTF8_LCASE", "İ");
+ assertInitCap("İ", "UNICODE", "İ");
+ assertInitCap("İ", "UNICODE_CI", "İ");
+ assertInitCap("I\u0307", "UTF8_BINARY","I\u0307");
+ assertInitCap("I\u0307", "UTF8_LCASE","I\u0307");
+ assertInitCap("I\u0307", "UNICODE","I\u0307");
+ assertInitCap("I\u0307", "UNICODE_CI","I\u0307");
+ assertInitCap("İonic", "UTF8_BINARY", "İonic", "I\u0307onic");
+ assertInitCap("İonic", "UTF8_LCASE", "İonic");
+ assertInitCap("İonic", "UNICODE", "İonic");
+ assertInitCap("İonic", "UNICODE_CI", "İonic");
+ assertInitCap("i\u0307onic", "UTF8_BINARY","I\u0307onic");
+ assertInitCap("i\u0307onic", "UTF8_LCASE","I\u0307onic");
+ assertInitCap("i\u0307onic", "UNICODE","I\u0307onic");
+ assertInitCap("i\u0307onic", "UNICODE_CI","I\u0307onic");
+ assertInitCap("FIDELİO", "UTF8_BINARY", "Fideli\u0307o");
+ assertInitCap("FIDELİO", "UTF8_LCASE", "Fideli\u0307o");
+ assertInitCap("FIDELİO", "UNICODE", "Fideli\u0307o");
+ assertInitCap("FIDELİO", "UNICODE_CI", "Fideli\u0307o");
+ // Surrogate pairs.
+ assertInitCap("a🙃B🙃c", "UTF8_BINARY", "A🙃b🙃c");
+ assertInitCap("a🙃B🙃c", "UTF8_LCASE", "A🙃B🙃C");
+ assertInitCap("a🙃B🙃c", "UNICODE", "A🙃B🙃C");
+ assertInitCap("a🙃B🙃c", "UNICODE_CI", "A🙃B🙃C");
+ assertInitCap("😄 😆", "UTF8_BINARY", "😄 😆");
+ assertInitCap("😄 😆", "UTF8_LCASE", "😄 😆");
+ assertInitCap("😄 😆", "UNICODE", "😄 😆");
+ assertInitCap("😄 😆", "UNICODE_CI", "😄 😆");
+ assertInitCap("😀😆😃😄", "UTF8_BINARY", "😀😆😃😄");
+ assertInitCap("😀😆😃😄", "UTF8_LCASE", "😀😆😃😄");
+ assertInitCap("😀😆😃😄", "UNICODE", "😀😆😃😄");
+ assertInitCap("😀😆😃😄", "UNICODE_CI", "😀😆😃😄");
+ assertInitCap("𝔸", "UTF8_BINARY", "𝔸");
+ assertInitCap("𝔸", "UTF8_LCASE", "𝔸");
+ assertInitCap("𝔸", "UNICODE", "𝔸");
+ assertInitCap("𝔸", "UNICODE_CI", "𝔸");
+ assertInitCap("𐐅", "UTF8_BINARY", "\uD801\uDC05", "𐐭");
+ assertInitCap("𐐅", "UTF8_LCASE", "𐐅");
+ assertInitCap("𐐅", "UNICODE", "𐐅");
+ assertInitCap("𐐅", "UNICODE_CI", "𐐅");
+ assertInitCap("𐐭", "UTF8_BINARY", "\uD801\uDC05", "𐐭");
+ assertInitCap("𐐭", "UTF8_LCASE", "𐐅");
+ assertInitCap("𐐭", "UNICODE", "𐐅");
+ assertInitCap("𐐭", "UNICODE_CI", "𐐅");
+ assertInitCap("𐐭𝔸", "UTF8_BINARY", "\uD801\uDC05\uD835\uDD38", "𐐭𝔸");
+ assertInitCap("𐐭𝔸", "UTF8_LCASE", "𐐅𝔸");
+ assertInitCap("𐐭𝔸", "UNICODE", "𐐅𝔸");
+ assertInitCap("𐐭𝔸", "UNICODE_CI", "𐐅𝔸");
+ // Ligatures.
+ assertInitCap("ß fi ffi ff st ῗ", "UTF8_BINARY", "Ss Fi Ffi Ff St Ϊ͂", "ß fi ffi ff st ῗ");
+ assertInitCap("ß fi ffi ff st ῗ", "UTF8_LCASE", "Ss Fi Ffi Ff St \u0399\u0308\u0342");
+ assertInitCap("ß fi ffi ff st ῗ", "UNICODE", "Ss Fi Ffi Ff St \u0399\u0308\u0342");
+    assertInitCap("ß fi ffi ff st ῗ", "UNICODE_CI", "Ss Fi Ffi Ff St \u0399\u0308\u0342");
+ assertInitCap("œ ǽ", "UTF8_BINARY", "Œ Ǽ", "Œ Ǽ");
+ // Different possible word boundaries.
assertInitCap("a b c", "UTF8_BINARY", "A B C");
assertInitCap("a b c", "UNICODE", "A B C");
assertInitCap("a b c", "UTF8_LCASE", "A B C");
@@ -802,7 +1496,7 @@ public void testInitCap() throws SparkException {
assertInitCap("a?b世c", "UNICODE", "A?B世C");
assertInitCap("a?b世c", "UTF8_LCASE", "A?B世C");
assertInitCap("a?b世c", "UNICODE_CI", "A?B世C");
- // Titlecase characters that are different from uppercase characters
+ // Titlecase characters that are different from uppercase characters.
assertInitCap("dzDZDz", "UTF8_BINARY", "Dzdzdz");
assertInitCap("dzDZDz", "UNICODE", "Dzdzdz");
assertInitCap("dzDZDz", "UTF8_LCASE", "Dzdzdz");
@@ -812,17 +1506,50 @@ public void testInitCap() throws SparkException {
assertInitCap("džaba Ljubav NJegova", "UTF8_LCASE", "Džaba Ljubav Njegova");
assertInitCap("džaba Ljubav NJegova", "UNICODE_CI", "Džaba Ljubav Njegova");
assertInitCap("ß fi ffi ff st ΣΗΜΕΡΙΝΟΣ ΑΣΗΜΕΝΙΟΣ İOTA", "UTF8_BINARY",
- "ß fi ffi ff st Σημερινος Ασημενιος I\u0307ota");
+ "Ss Fi Ffi Ff St Σημερινος Ασημενιος İota","ß fi ffi ff st Σημερινος Ασημενιος I\u0307ota");
assertInitCap("ß fi ffi ff st ΣΗΜΕΡΙΝΟΣ ΑΣΗΜΕΝΙΟΣ İOTA", "UTF8_LCASE",
"Ss Fi Ffi Ff St Σημερινος Ασημενιος İota");
assertInitCap("ß fi ffi ff st ΣΗΜΕΡΙΝΟΣ ΑΣΗΜΕΝΙΟΣ İOTA", "UNICODE",
"Ss Fi Ffi Ff St Σημερινος Ασημενιος İota");
- assertInitCap("ß fi ffi ff st ΣΗΜΕΡΙΝΟΣ ΑΣΗΜΕΝΙΟΣ İOTA", "UNICODE_CI",
- "Ss Fi Ffi Ff St Σημερινος Ασημενιος İota");
+ assertInitCap("ß fi ffi ff st ΣΗΜΕΡςΙΝΟΣ ΑΣΗΜΕΝΙΟΣ İOTA", "UNICODE_CI",
+ "Ss Fi Ffi Ff St Σημερςινος Ασημενιος İota");
+ // Characters that map to multiple characters when titlecased and lowercased.
+ assertInitCap("ß fi ffi ff st İOTA", "UTF8_BINARY", "Ss Fi Ffi Ff St İota", "ß fi ffi ff st İota");
+ assertInitCap("ß fi ffi ff st OİOTA", "UTF8_BINARY",
+ "Ss Fi Ffi Ff St Oi\u0307ota", "ß fi ffi ff st Oi̇ota");
+ // Lowercasing Greek letter sigma ('Σ') when case-ignorable character present.
+ assertInitCap("`Σ", "UTF8_BINARY", "`σ", "`σ");
+ assertInitCap("1`Σ`` AΣ", "UTF8_BINARY", "1`σ`` Aς", "1`σ`` Aς");
+ assertInitCap("a1`Σ``", "UTF8_BINARY", "A1`σ``", "A1`σ``");
+ assertInitCap("a`Σ``", "UTF8_BINARY", "A`ς``", "A`σ``");
+ assertInitCap("a`Σ``1", "UTF8_BINARY", "A`ς``1", "A`σ``1");
+ assertInitCap("a`Σ``A", "UTF8_BINARY", "A`σ``a", "A`σ``a");
+ assertInitCap("ΘΑ�Σ�ΟΣ�", "UTF8_BINARY", "Θα�σ�ος�", "Θα�σ�ος�");
+ assertInitCap("ΘΑᵩΣ�ΟᵩΣᵩ�", "UTF8_BINARY", "Θαᵩς�οᵩςᵩ�", "Θαᵩς�οᵩςᵩ�");
+ assertInitCap("ΘΑ�ᵩΣ�ΟᵩΣᵩ�", "UTF8_BINARY", "Θα�ᵩσ�οᵩςᵩ�", "Θα�ᵩσ�οᵩςᵩ�");
+ assertInitCap("ΘΑ�ᵩΣᵩ�ΟᵩΣᵩ�", "UTF8_BINARY", "Θα�ᵩσᵩ�οᵩςᵩ�", "Θα�ᵩσᵩ�οᵩςᵩ�");
+ assertInitCap("ΘΑ�Σ�Ο�Σ�", "UTF8_BINARY", "Θα�σ�ο�σ�", "Θα�σ�ο�σ�");
+ // Disallowed bytes and invalid sequences.
+ assertInitCap(UTF8String.fromBytes(new byte[] { (byte)0xC0, (byte)0xC1, (byte)0xF5}).toString(),
+ "UTF8_BINARY", "���", "���");
+ assertInitCap(UTF8String.fromBytes(
+ new byte[]{(byte)0xC0, (byte)0xC1, (byte)0xF5, 0x20, 0x61, 0x41, (byte)0xC0}).toString(),
+ "UTF8_BINARY",
+ "��� Aa�", "��� Aa�");
+ assertInitCap(UTF8String.fromBytes(new byte[]{(byte)0xC2,(byte)0xC2}).toString(),
+ "UTF8_BINARY", "��", "��");
+ assertInitCap(UTF8String.fromBytes(
+ new byte[]{0x61, 0x41, (byte)0xC2, (byte)0xC2, 0x41}).toString(),
+ "UTF8_BINARY",
+ "Aa��a", "Aa��a");
}
- private void assertStringInstr(String string, String substring, String collationName,
- Integer expected) throws SparkException {
+ /**
+ * Verify the behaviour of the `StringInstr` collation support class.
+ */
+
+ private void assertStringInstr(String string, String substring,
+ String collationName, int expected) throws SparkException {
UTF8String str = UTF8String.fromString(string);
UTF8String substr = UTF8String.fromString(substring);
int collationId = CollationFactory.collationNameToId(collationName);
@@ -831,143 +1558,402 @@ private void assertStringInstr(String string, String substring, String collation
@Test
public void testStringInstr() throws SparkException {
- assertStringInstr("aaads", "Aa", "UTF8_BINARY", 0);
- assertStringInstr("aaaDs", "de", "UTF8_BINARY", 0);
+ // Empty strings.
+ assertStringInstr("", "", "UTF8_BINARY", 1);
+ assertStringInstr("", "", "UTF8_LCASE", 1);
+ assertStringInstr("", "", "UNICODE_CI", 1);
+ assertStringInstr("", "", "UNICODE", 1);
+ assertStringInstr("a", "", "UTF8_BINARY", 1);
+ assertStringInstr("a", "", "UTF8_LCASE", 1);
+ assertStringInstr("a", "", "UNICODE", 1);
+ assertStringInstr("a", "", "UNICODE_CI", 1);
+ assertStringInstr("", "x", "UTF8_BINARY", 0);
+ assertStringInstr("", "x", "UTF8_LCASE", 0);
+ assertStringInstr("", "x", "UNICODE", 0);
+ assertStringInstr("", "x", "UNICODE_CI", 0);
+ // Basic tests.
+ assertStringInstr("aaads", "aa", "UTF8_BINARY", 1);
+ assertStringInstr("aaads", "aa", "UTF8_LCASE", 1);
+ assertStringInstr("aaads", "aa", "UNICODE", 1);
+ assertStringInstr("aaads", "aa", "UNICODE_CI", 1);
assertStringInstr("aaads", "ds", "UTF8_BINARY", 4);
- assertStringInstr("xxxx", "", "UTF8_BINARY", 1);
- assertStringInstr("", "xxxx", "UTF8_BINARY", 0);
- assertStringInstr("test大千世界X大千世界", "大千", "UTF8_BINARY", 5);
- assertStringInstr("test大千世界X大千世界", "界X", "UTF8_BINARY", 8);
+ assertStringInstr("aaads", "ds", "UTF8_LCASE", 4);
+ assertStringInstr("aaads", "ds", "UNICODE", 4);
+ assertStringInstr("aaads", "ds", "UNICODE_CI", 4);
+ assertStringInstr("aaads", "Aa", "UTF8_BINARY", 0);
assertStringInstr("aaads", "Aa", "UTF8_LCASE", 1);
+ assertStringInstr("aaads", "Aa", "UNICODE", 0);
+ assertStringInstr("aaads", "Aa", "UNICODE_CI", 1);
+ assertStringInstr("aaaDs", "de", "UTF8_BINARY", 0);
assertStringInstr("aaaDs", "de", "UTF8_LCASE", 0);
+ assertStringInstr("aaaDs", "de", "UNICODE", 0);
+ assertStringInstr("aaaDs", "de", "UNICODE_CI", 0);
+ assertStringInstr("aaaDs", "ds", "UTF8_BINARY", 0);
assertStringInstr("aaaDs", "ds", "UTF8_LCASE", 4);
- assertStringInstr("xxxx", "", "UTF8_LCASE", 1);
- assertStringInstr("", "xxxx", "UTF8_LCASE", 0);
+ assertStringInstr("aaaDs", "ds", "UNICODE", 0);
+ assertStringInstr("aaaDs", "ds", "UNICODE_CI", 4);
+ assertStringInstr("aaadS", "Ds", "UTF8_BINARY", 0);
+ assertStringInstr("aaadS", "Ds", "UTF8_LCASE", 4);
+ assertStringInstr("aaadS", "Ds", "UNICODE", 0);
+ assertStringInstr("aaadS", "Ds", "UNICODE_CI", 4);
+ assertStringInstr("aaaČŠčšcs", "cs", "SR", 8);
+ assertStringInstr("aaaČŠčšcs", "cs", "SR_CI_AI", 4);
+ // Advanced tests.
+ assertStringInstr("test大千世界X大千世界", "大千", "UTF8_BINARY", 5);
assertStringInstr("test大千世界X大千世界", "大千", "UTF8_LCASE", 5);
+ assertStringInstr("test大千世界X大千世界", "大千", "UNICODE", 5);
+ assertStringInstr("test大千世界X大千世界", "大千", "UNICODE_CI", 5);
+ assertStringInstr("test大千世界X大千世界", "界X", "UTF8_BINARY", 8);
+ assertStringInstr("test大千世界X大千世界", "界X", "UTF8_LCASE", 8);
+ assertStringInstr("test大千世界X大千世界", "界X", "UNICODE", 8);
+ assertStringInstr("test大千世界X大千世界", "界X", "UNICODE_CI", 8);
+ assertStringInstr("test大千世界X大千世界", "界x", "UTF8_BINARY", 0);
assertStringInstr("test大千世界X大千世界", "界x", "UTF8_LCASE", 8);
- assertStringInstr("aaads", "Aa", "UNICODE", 0);
- assertStringInstr("aaads", "aa", "UNICODE", 1);
- assertStringInstr("aaads", "de", "UNICODE", 0);
- assertStringInstr("xxxx", "", "UNICODE", 1);
- assertStringInstr("", "xxxx", "UNICODE", 0);
assertStringInstr("test大千世界X大千世界", "界x", "UNICODE", 0);
- assertStringInstr("test大千世界X大千世界", "界X", "UNICODE", 8);
- assertStringInstr("xxxx", "", "UNICODE_CI", 1);
- assertStringInstr("", "xxxx", "UNICODE_CI", 0);
- assertStringInstr("aaads", "AD", "UNICODE_CI", 3);
- assertStringInstr("aaads", "dS", "UNICODE_CI", 4);
- assertStringInstr("test大千世界X大千世界", "界y", "UNICODE_CI", 0);
assertStringInstr("test大千世界X大千世界", "界x", "UNICODE_CI", 8);
- assertStringInstr("i̇", "i", "UNICODE_CI", 0);
- assertStringInstr("i̇", "\u0307", "UNICODE_CI", 0);
- assertStringInstr("i̇", "İ", "UNICODE_CI", 1);
+ assertStringInstr("test大千世界X大千世界", "界y", "UTF8_BINARY", 0);
+ assertStringInstr("test大千世界X大千世界", "界y", "UTF8_LCASE", 0);
+ assertStringInstr("test大千世界X大千世界", "界y", "UNICODE", 0);
+ assertStringInstr("test大千世界X大千世界", "界y", "UNICODE_CI", 0);
+ // One-to-many case mapping (e.g. Turkish dotted I).
+ assertStringInstr("i\u0307", "i", "UNICODE_CI", 0);
+ assertStringInstr("i\u0307", "\u0307", "UNICODE_CI", 0);
+ assertStringInstr("i\u0307", "İ", "UNICODE_CI", 1);
assertStringInstr("İ", "i", "UNICODE_CI", 0);
- assertStringInstr("İoi̇o12", "i̇o", "UNICODE_CI", 1);
+ assertStringInstr("İoi̇o12", "i\u0307o", "UNICODE_CI", 1);
assertStringInstr("i̇oİo12", "İo", "UNICODE_CI", 1);
- assertStringInstr("abİoi̇o", "i̇o", "UNICODE_CI", 3);
+ assertStringInstr("abİoi̇o", "i\u0307o", "UNICODE_CI", 3);
assertStringInstr("abi̇oİo", "İo", "UNICODE_CI", 3);
assertStringInstr("ai̇oxXİo", "Xx", "UNICODE_CI", 5);
assertStringInstr("aİoi̇oxx", "XX", "UNICODE_CI", 7);
- assertStringInstr("i̇", "i", "UTF8_LCASE", 1); // != UNICODE_CI
- assertStringInstr("i̇", "\u0307", "UTF8_LCASE", 2); // != UNICODE_CI
- assertStringInstr("i̇", "İ", "UTF8_LCASE", 1);
+ assertStringInstr("i\u0307", "i", "UTF8_LCASE", 1); // != UNICODE_CI
+ assertStringInstr("i\u0307", "\u0307", "UTF8_LCASE", 2); // != UNICODE_CI
+ assertStringInstr("i\u0307", "İ", "UTF8_LCASE", 1);
assertStringInstr("İ", "i", "UTF8_LCASE", 0);
- assertStringInstr("İoi̇o12", "i̇o", "UTF8_LCASE", 1);
+ assertStringInstr("İoi̇o12", "i\u0307o", "UTF8_LCASE", 1);
assertStringInstr("i̇oİo12", "İo", "UTF8_LCASE", 1);
- assertStringInstr("abİoi̇o", "i̇o", "UTF8_LCASE", 3);
+ assertStringInstr("abİoi̇o", "i\u0307o", "UTF8_LCASE", 3);
assertStringInstr("abi̇oİo", "İo", "UTF8_LCASE", 3);
assertStringInstr("abI\u0307oi̇o", "İo", "UTF8_LCASE", 3);
assertStringInstr("ai̇oxXİo", "Xx", "UTF8_LCASE", 5);
assertStringInstr("abİoi̇o", "\u0307o", "UTF8_LCASE", 6);
assertStringInstr("aİoi̇oxx", "XX", "UTF8_LCASE", 7);
+ // Conditional case mapping (e.g. Greek sigmas).
+ assertStringInstr("σ", "σ", "UTF8_BINARY", 1);
+ assertStringInstr("σ", "ς", "UTF8_BINARY", 0);
+ assertStringInstr("σ", "Σ", "UTF8_BINARY", 0);
+ assertStringInstr("ς", "σ", "UTF8_BINARY", 0);
+ assertStringInstr("ς", "ς", "UTF8_BINARY", 1);
+ assertStringInstr("ς", "Σ", "UTF8_BINARY", 0);
+ assertStringInstr("Σ", "σ", "UTF8_BINARY", 0);
+ assertStringInstr("Σ", "ς", "UTF8_BINARY", 0);
+ assertStringInstr("Σ", "Σ", "UTF8_BINARY", 1);
+ assertStringInstr("σ", "σ", "UTF8_LCASE", 1);
+ assertStringInstr("σ", "ς", "UTF8_LCASE", 1);
+ assertStringInstr("σ", "Σ", "UTF8_LCASE", 1);
+ assertStringInstr("ς", "σ", "UTF8_LCASE", 1);
+ assertStringInstr("ς", "ς", "UTF8_LCASE", 1);
+ assertStringInstr("ς", "Σ", "UTF8_LCASE", 1);
+ assertStringInstr("Σ", "σ", "UTF8_LCASE", 1);
+ assertStringInstr("Σ", "ς", "UTF8_LCASE", 1);
+ assertStringInstr("Σ", "Σ", "UTF8_LCASE", 1);
+ assertStringInstr("σ", "σ", "UNICODE", 1);
+ assertStringInstr("σ", "ς", "UNICODE", 0);
+ assertStringInstr("σ", "Σ", "UNICODE", 0);
+ assertStringInstr("ς", "σ", "UNICODE", 0);
+ assertStringInstr("ς", "ς", "UNICODE", 1);
+ assertStringInstr("ς", "Σ", "UNICODE", 0);
+ assertStringInstr("Σ", "σ", "UNICODE", 0);
+ assertStringInstr("Σ", "ς", "UNICODE", 0);
+ assertStringInstr("Σ", "Σ", "UNICODE", 1);
+ assertStringInstr("σ", "σ", "UNICODE_CI", 1);
+ assertStringInstr("σ", "ς", "UNICODE_CI", 1);
+ assertStringInstr("σ", "Σ", "UNICODE_CI", 1);
+ assertStringInstr("ς", "σ", "UNICODE_CI", 1);
+ assertStringInstr("ς", "ς", "UNICODE_CI", 1);
+ assertStringInstr("ς", "Σ", "UNICODE_CI", 1);
+ assertStringInstr("Σ", "σ", "UNICODE_CI", 1);
+ assertStringInstr("Σ", "ς", "UNICODE_CI", 1);
+ assertStringInstr("Σ", "Σ", "UNICODE_CI", 1);
+ // Surrogate pairs.
+ assertStringInstr("a🙃b", "a", "UTF8_BINARY", 1);
+ assertStringInstr("a🙃b", "a", "UTF8_LCASE", 1);
+ assertStringInstr("a🙃b", "a", "UNICODE", 1);
+ assertStringInstr("a🙃b", "a", "UNICODE_CI", 1);
+ assertStringInstr("a🙃b", "🙃", "UTF8_BINARY", 2);
+ assertStringInstr("a🙃b", "🙃", "UTF8_LCASE", 2);
+ assertStringInstr("a🙃b", "🙃", "UNICODE", 2);
+ assertStringInstr("a🙃b", "🙃", "UNICODE_CI", 2);
+ assertStringInstr("a🙃b", "b", "UTF8_BINARY", 3);
+ assertStringInstr("a🙃b", "b", "UTF8_LCASE", 3);
+ assertStringInstr("a🙃b", "b", "UNICODE", 3);
+ assertStringInstr("a🙃b", "b", "UNICODE_CI", 3);
+ assertStringInstr("a🙃🙃b", "🙃", "UTF8_BINARY", 2);
+ assertStringInstr("a🙃🙃b", "🙃", "UTF8_LCASE", 2);
+ assertStringInstr("a🙃🙃b", "🙃", "UNICODE", 2);
+ assertStringInstr("a🙃🙃b", "🙃", "UNICODE_CI", 2);
+ assertStringInstr("a🙃🙃b", "b", "UTF8_BINARY", 4);
+ assertStringInstr("a🙃🙃b", "b", "UTF8_LCASE", 4);
+ assertStringInstr("a🙃🙃b", "b", "UNICODE", 4);
+ assertStringInstr("a🙃🙃b", "b", "UNICODE_CI", 4);
+ assertStringInstr("a🙃x🙃b", "b", "UTF8_BINARY", 5);
+ assertStringInstr("a🙃x🙃b", "b", "UTF8_LCASE", 5);
+ assertStringInstr("a🙃x🙃b", "b", "UNICODE", 5);
+ assertStringInstr("a🙃x🙃b", "b", "UNICODE_CI", 5);
}
+ /**
+ * Verify the behaviour of the `FindInSet` collation support class.
+ */
+
private void assertFindInSet(String word, UTF8String set, String collationName,
- Integer expected) throws SparkException {
+ int expected) throws SparkException {
UTF8String w = UTF8String.fromString(word);
int collationId = CollationFactory.collationNameToId(collationName);
- assertEquals(expected, CollationSupport.FindInSet.exec(w, set, collationId));
+ int result = CollationSupport.FindInSet.exec(w, set, collationId);
+ assertEquals(expected, result);
}
@Test
public void testFindInSet() throws SparkException {
- assertFindInSet("AB", UTF8String.fromString("abc,b,ab,c,def"), "UTF8_BINARY", 0);
- assertFindInSet("abc", UTF8String.fromString("abc,b,ab,c,def"), "UTF8_BINARY", 1);
- assertFindInSet("def", UTF8String.fromString("abc,b,ab,c,def"), "UTF8_BINARY", 5);
- assertFindInSet("d,ef", UTF8String.fromString("abc,b,ab,c,def"), "UTF8_BINARY", 0);
+ // Empty strings.
+ assertFindInSet("", UTF8String.fromString(""), "UTF8_BINARY", 1);
+ assertFindInSet("", UTF8String.fromString(""), "UTF8_LCASE", 1);
+ assertFindInSet("", UTF8String.fromString(""), "UNICODE", 1);
+ assertFindInSet("", UTF8String.fromString(""), "UNICODE_CI", 1);
assertFindInSet("", UTF8String.fromString("abc,b,ab,c,def"), "UTF8_BINARY", 0);
- assertFindInSet("", UTF8String.fromString(",abc,b,ab,c,def"), "UTF8_BINARY", 1);
- assertFindInSet("", UTF8String.fromString("abc,b,ab,c,def,"), "UTF8_BINARY", 6);
- assertFindInSet("", UTF8String.fromString("abc"), "UTF8_BINARY", 0);
- assertFindInSet("a", UTF8String.fromString("abc,b,ab,c,def"), "UTF8_LCASE", 0);
- assertFindInSet("c", UTF8String.fromString("abc,b,ab,c,def"), "UTF8_LCASE", 4);
- assertFindInSet("AB", UTF8String.fromString("abc,b,ab,c,def"), "UTF8_LCASE", 3);
- assertFindInSet("AbC", UTF8String.fromString("abc,b,ab,c,def"), "UTF8_LCASE", 1);
- assertFindInSet("abcd", UTF8String.fromString("abc,b,ab,c,def"), "UTF8_LCASE", 0);
- assertFindInSet("d,ef", UTF8String.fromString("abc,b,ab,c,def"), "UTF8_LCASE", 0);
- assertFindInSet("XX", UTF8String.fromString("xx"), "UTF8_LCASE", 1);
assertFindInSet("", UTF8String.fromString("abc,b,ab,c,def"), "UTF8_LCASE", 0);
+ assertFindInSet("", UTF8String.fromString("abc,b,ab,c,def"), "UNICODE", 0);
+ assertFindInSet("", UTF8String.fromString("abc,b,ab,c,def"), "UNICODE_CI", 0);
+ assertFindInSet("", UTF8String.fromString(",abc,b,ab,c,def"), "UTF8_BINARY", 1);
assertFindInSet("", UTF8String.fromString(",abc,b,ab,c,def"), "UTF8_LCASE", 1);
- assertFindInSet("", UTF8String.fromString("abc,b,ab,c,def,"), "UTF8_LCASE", 6);
- assertFindInSet("", UTF8String.fromString("abc"), "UTF8_LCASE", 0);
- assertFindInSet("界x", UTF8String.fromString("test,大千,世,界X,大,千,世界"), "UTF8_LCASE", 4);
- assertFindInSet("a", UTF8String.fromString("abc,b,ab,c,def"), "UNICODE", 0);
- assertFindInSet("ab", UTF8String.fromString("abc,b,ab,c,def"), "UNICODE", 3);
- assertFindInSet("Ab", UTF8String.fromString("abc,b,ab,c,def"), "UNICODE", 0);
- assertFindInSet("d,ef", UTF8String.fromString("abc,b,ab,c,def"), "UNICODE", 0);
assertFindInSet("", UTF8String.fromString(",abc,b,ab,c,def"), "UNICODE", 1);
+ assertFindInSet("", UTF8String.fromString(",abc,b,ab,c,def"), "UNICODE_CI", 1);
+ assertFindInSet("", UTF8String.fromString("abc,b,ab,c,def,"), "UTF8_BINARY", 6);
+ assertFindInSet("", UTF8String.fromString("abc,b,ab,c,def,"), "UTF8_LCASE", 6);
assertFindInSet("", UTF8String.fromString("abc,b,ab,c,def,"), "UNICODE", 6);
+ assertFindInSet("", UTF8String.fromString("abc,b,ab,c,def,"), "UNICODE_CI", 6);
+ assertFindInSet("", UTF8String.fromString("abc"), "UTF8_BINARY", 0);
+ assertFindInSet("", UTF8String.fromString("abc"), "UTF8_LCASE", 0);
assertFindInSet("", UTF8String.fromString("abc"), "UNICODE", 0);
+ assertFindInSet("", UTF8String.fromString("abc"), "UNICODE_CI", 0);
+ // Basic tests.
+ assertFindInSet("xx", UTF8String.fromString("xx"), "UTF8_BINARY", 1);
+ assertFindInSet("xx", UTF8String.fromString("xx"), "UTF8_LCASE", 1);
assertFindInSet("xx", UTF8String.fromString("xx"), "UNICODE", 1);
- assertFindInSet("界x", UTF8String.fromString("test,大千,世,界X,大,千,世界"), "UNICODE", 0);
- assertFindInSet("大", UTF8String.fromString("test,大千,世,界X,大,千,世界"), "UNICODE", 5);
+ assertFindInSet("xx", UTF8String.fromString("xx"), "UNICODE_CI", 1);
+ assertFindInSet("a", UTF8String.fromString("abc,b,ab,c,def"), "UTF8_BINARY", 0);
+ assertFindInSet("a", UTF8String.fromString("abc,b,ab,c,def"), "UTF8_LCASE", 0);
+ assertFindInSet("a", UTF8String.fromString("abc,b,ab,c,def"), "UNICODE", 0);
assertFindInSet("a", UTF8String.fromString("abc,b,ab,c,def"), "UNICODE_CI", 0);
+ assertFindInSet("abc", UTF8String.fromString("abc,b,ab,c,def"), "UTF8_BINARY", 1);
+ assertFindInSet("abc", UTF8String.fromString("abc,b,ab,c,def"), "UTF8_LCASE", 1);
+ assertFindInSet("abc", UTF8String.fromString("abc,b,ab,c,def"), "UNICODE", 1);
+ assertFindInSet("abc", UTF8String.fromString("abc,b,ab,c,def"), "UNICODE_CI", 1);
+ assertFindInSet("abcd", UTF8String.fromString("abc,b,ab,c,def"), "UTF8_BINARY", 0);
+ assertFindInSet("abcd", UTF8String.fromString("abc,b,ab,c,def"), "UTF8_LCASE", 0);
+ assertFindInSet("abcd", UTF8String.fromString("abc,b,ab,c,def"), "UNICODE", 0);
+ assertFindInSet("abcd", UTF8String.fromString("abc,b,ab,c,def"), "UNICODE_CI", 0);
+ assertFindInSet("def", UTF8String.fromString("abc,b,ab,c,def"), "UTF8_BINARY", 5);
+ assertFindInSet("def", UTF8String.fromString("abc,b,ab,c,def"), "UTF8_LCASE", 5);
+ assertFindInSet("def", UTF8String.fromString("abc,b,ab,c,def"), "UNICODE", 5);
+ assertFindInSet("def", UTF8String.fromString("abc,b,ab,c,def"), "UNICODE_CI", 5);
+ assertFindInSet("xyz", UTF8String.fromString("abc,b,ab,c,def"), "UTF8_BINARY", 0);
+ assertFindInSet("xyz", UTF8String.fromString("abc,b,ab,c,def"), "UTF8_LCASE", 0);
+ assertFindInSet("xyz", UTF8String.fromString("abc,b,ab,c,def"), "UNICODE", 0);
+ assertFindInSet("xyz", UTF8String.fromString("abc,b,ab,c,def"), "UNICODE_CI", 0);
+ assertFindInSet("Ab", UTF8String.fromString("abc,b,ab,c,def"), "UTF8_BINARY", 0);
+ assertFindInSet("Ab", UTF8String.fromString("abc,b,ab,c,def"), "UTF8_LCASE", 3);
+ assertFindInSet("Ab", UTF8String.fromString("abc,b,ab,c,def"), "UNICODE", 0);
+ assertFindInSet("Ab", UTF8String.fromString("abc,b,ab,c,def"), "UNICODE_CI", 3);
+ assertFindInSet("d,ef", UTF8String.fromString("abc,b,ab,c,def"), "UTF8_BINARY", 0);
+ assertFindInSet("d,ef", UTF8String.fromString("abc,b,ab,c,def"), "UTF8_LCASE", 0);
+ assertFindInSet("d,ef", UTF8String.fromString("abc,b,ab,c,def"), "UNICODE", 0);
+ assertFindInSet("d,ef", UTF8String.fromString("abc,b,ab,c,def"), "UNICODE_CI", 0);
+ assertFindInSet("C", UTF8String.fromString("abc,b,ab,c,def"), "UTF8_BINARY", 0);
+ assertFindInSet("C", UTF8String.fromString("abc,b,ab,c,def"), "UTF8_LCASE", 4);
+ assertFindInSet("C", UTF8String.fromString("abc,b,ab,c,def"), "UNICODE", 0);
assertFindInSet("C", UTF8String.fromString("abc,b,ab,c,def"), "UNICODE_CI", 4);
- assertFindInSet("DeF", UTF8String.fromString("abc,b,ab,c,dEf"), "UNICODE_CI", 5);
- assertFindInSet("DEFG", UTF8String.fromString("abc,b,ab,c,def"), "UNICODE_CI", 0);
- assertFindInSet("", UTF8String.fromString(",abc,b,ab,c,def"), "UNICODE_CI", 1);
- assertFindInSet("", UTF8String.fromString("abc,b,ab,c,def,"), "UNICODE_CI", 6);
- assertFindInSet("", UTF8String.fromString("abc"), "UNICODE_CI", 0);
- assertFindInSet("XX", UTF8String.fromString("xx"), "UNICODE_CI", 1);
+ // Advanced tests.
+ assertFindInSet("大", UTF8String.fromString("test,大千,世,界X,大,千,世界"), "UTF8_BINARY", 5);
+ assertFindInSet("大", UTF8String.fromString("test,大千,世,界X,大,千,世界"), "UTF8_LCASE", 5);
+ assertFindInSet("大", UTF8String.fromString("test,大千,世,界X,大,千,世界"), "UNICODE", 5);
+ assertFindInSet("大", UTF8String.fromString("test,大千,世,界X,大,千,世界"), "UNICODE_CI", 5);
+ assertFindInSet("界x", UTF8String.fromString("test,大千,世,界X,大,千,世界"), "UTF8_BINARY", 0);
+ assertFindInSet("界x", UTF8String.fromString("test,大千,世,界X,大,千,世界"), "UTF8_LCASE", 4);
+ assertFindInSet("界x", UTF8String.fromString("test,大千,世,界X,大,千,世界"), "UNICODE", 0);
assertFindInSet("界x", UTF8String.fromString("test,大千,世,界X,大,千,世界"), "UNICODE_CI", 4);
+ assertFindInSet("界x", UTF8String.fromString("test,大千,界Xx,世,界X,大,千,世界"), "UTF8_BINARY", 0);
+ assertFindInSet("界x", UTF8String.fromString("test,大千,界Xx,世,界X,大,千,世界"), "UTF8_LCASE", 5);
+ assertFindInSet("界x", UTF8String.fromString("test,大千,界Xx,世,界X,大,千,世界"), "UNICODE", 0);
assertFindInSet("界x", UTF8String.fromString("test,大千,界Xx,世,界X,大,千,世界"), "UNICODE_CI", 5);
- assertFindInSet("大", UTF8String.fromString("test,大千,世,界X,大,千,世界"), "UNICODE_CI", 5);
- assertFindInSet("i̇", UTF8String.fromString("İ"), "UNICODE_CI", 1);
- assertFindInSet("i", UTF8String.fromString("İ"), "UNICODE_CI", 0);
- assertFindInSet("i̇", UTF8String.fromString("i̇"), "UNICODE_CI", 1);
- assertFindInSet("i", UTF8String.fromString("i̇"), "UNICODE_CI", 0);
- assertFindInSet("i̇", UTF8String.fromString("İ,"), "UNICODE_CI", 1);
- assertFindInSet("i", UTF8String.fromString("İ,"), "UNICODE_CI", 0);
- assertFindInSet("i̇", UTF8String.fromString("i̇,"), "UNICODE_CI", 1);
- assertFindInSet("i", UTF8String.fromString("i̇,"), "UNICODE_CI", 0);
- assertFindInSet("i̇", UTF8String.fromString("ab,İ"), "UNICODE_CI", 2);
- assertFindInSet("i", UTF8String.fromString("ab,İ"), "UNICODE_CI", 0);
- assertFindInSet("i̇", UTF8String.fromString("ab,i̇"), "UNICODE_CI", 2);
- assertFindInSet("i", UTF8String.fromString("ab,i̇"), "UNICODE_CI", 0);
- assertFindInSet("i̇", UTF8String.fromString("ab,İ,12"), "UNICODE_CI", 2);
- assertFindInSet("i", UTF8String.fromString("ab,İ,12"), "UNICODE_CI", 0);
- assertFindInSet("i̇", UTF8String.fromString("ab,i̇,12"), "UNICODE_CI", 2);
- assertFindInSet("i", UTF8String.fromString("ab,i̇,12"), "UNICODE_CI", 0);
- assertFindInSet("i̇o", UTF8String.fromString("ab,İo,12"), "UNICODE_CI", 2);
- assertFindInSet("İo", UTF8String.fromString("ab,i̇o,12"), "UNICODE_CI", 2);
- assertFindInSet("i̇", UTF8String.fromString("İ"), "UTF8_LCASE", 1);
+ // One-to-many case mapping (e.g. Turkish dotted I).
+ assertFindInSet("i\u0307", UTF8String.fromString("İ"), "UTF8_BINARY", 0);
+ assertFindInSet("i\u0307", UTF8String.fromString("İ"), "UTF8_LCASE", 1);
+ assertFindInSet("i\u0307", UTF8String.fromString("İ"), "UNICODE", 0);
+ assertFindInSet("i\u0307", UTF8String.fromString("İ"), "UNICODE_CI", 1);
+ assertFindInSet("i", UTF8String.fromString("İ"), "UTF8_BINARY", 0);
assertFindInSet("i", UTF8String.fromString("İ"), "UTF8_LCASE", 0);
- assertFindInSet("i̇", UTF8String.fromString("i̇"), "UTF8_LCASE", 1);
- assertFindInSet("i", UTF8String.fromString("i̇"), "UTF8_LCASE", 0);
- assertFindInSet("i̇", UTF8String.fromString("İ,"), "UTF8_LCASE", 1);
+ assertFindInSet("i", UTF8String.fromString("İ"), "UNICODE", 0);
+ assertFindInSet("i", UTF8String.fromString("İ"), "UNICODE_CI", 0);
+ assertFindInSet("i\u0307", UTF8String.fromString("i\u0307"), "UTF8_BINARY", 1);
+ assertFindInSet("i\u0307", UTF8String.fromString("i\u0307"), "UTF8_LCASE", 1);
+ assertFindInSet("i\u0307", UTF8String.fromString("i\u0307"), "UNICODE", 1);
+ assertFindInSet("i\u0307", UTF8String.fromString("i\u0307"), "UNICODE_CI", 1);
+ assertFindInSet("i", UTF8String.fromString("i\u0307"), "UTF8_BINARY", 0);
+ assertFindInSet("i", UTF8String.fromString("i\u0307"), "UTF8_LCASE", 0);
+ assertFindInSet("i", UTF8String.fromString("i\u0307"), "UNICODE", 0);
+ assertFindInSet("i", UTF8String.fromString("i\u0307"), "UNICODE_CI", 0);
+ assertFindInSet("i\u0307", UTF8String.fromString("İ,"), "UTF8_BINARY", 0);
+ assertFindInSet("i\u0307", UTF8String.fromString("İ,"), "UTF8_LCASE", 1);
+ assertFindInSet("i\u0307", UTF8String.fromString("İ,"), "UNICODE", 0);
+ assertFindInSet("i\u0307", UTF8String.fromString("İ,"), "UNICODE_CI", 1);
+ assertFindInSet("i", UTF8String.fromString("İ,"), "UTF8_BINARY", 0);
assertFindInSet("i", UTF8String.fromString("İ,"), "UTF8_LCASE", 0);
- assertFindInSet("i̇", UTF8String.fromString("i̇,"), "UTF8_LCASE", 1);
- assertFindInSet("i", UTF8String.fromString("i̇,"), "UTF8_LCASE", 0);
- assertFindInSet("i̇", UTF8String.fromString("ab,İ"), "UTF8_LCASE", 2);
+ assertFindInSet("i", UTF8String.fromString("İ,"), "UNICODE", 0);
+ assertFindInSet("i", UTF8String.fromString("İ,"), "UNICODE_CI", 0);
+ assertFindInSet("i\u0307", UTF8String.fromString("i\u0307,"), "UTF8_BINARY", 1);
+ assertFindInSet("i\u0307", UTF8String.fromString("i\u0307,"), "UTF8_LCASE", 1);
+ assertFindInSet("i\u0307", UTF8String.fromString("i\u0307,"), "UNICODE", 1);
+ assertFindInSet("i\u0307", UTF8String.fromString("i\u0307,"), "UNICODE_CI", 1);
+ assertFindInSet("i", UTF8String.fromString("i\u0307,"), "UTF8_BINARY", 0);
+ assertFindInSet("i", UTF8String.fromString("i\u0307,"), "UTF8_LCASE", 0);
+ assertFindInSet("i", UTF8String.fromString("i\u0307,"), "UNICODE", 0);
+ assertFindInSet("i", UTF8String.fromString("i\u0307,"), "UNICODE_CI", 0);
+ assertFindInSet("i\u0307", UTF8String.fromString("ab,İ"), "UTF8_BINARY", 0);
+ assertFindInSet("i\u0307", UTF8String.fromString("ab,İ"), "UTF8_LCASE", 2);
+ assertFindInSet("i\u0307", UTF8String.fromString("ab,İ"), "UNICODE", 0);
+ assertFindInSet("i\u0307", UTF8String.fromString("ab,İ"), "UNICODE_CI", 2);
+ assertFindInSet("i", UTF8String.fromString("ab,İ"), "UTF8_BINARY", 0);
assertFindInSet("i", UTF8String.fromString("ab,İ"), "UTF8_LCASE", 0);
- assertFindInSet("i̇", UTF8String.fromString("ab,i̇"), "UTF8_LCASE", 2);
- assertFindInSet("i", UTF8String.fromString("ab,i̇"), "UTF8_LCASE", 0);
- assertFindInSet("i̇", UTF8String.fromString("ab,İ,12"), "UTF8_LCASE", 2);
+ assertFindInSet("i", UTF8String.fromString("ab,İ"), "UNICODE", 0);
+ assertFindInSet("i", UTF8String.fromString("ab,İ"), "UNICODE_CI", 0);
+ assertFindInSet("i\u0307", UTF8String.fromString("ab,i\u0307"), "UTF8_BINARY", 2);
+ assertFindInSet("i\u0307", UTF8String.fromString("ab,i\u0307"), "UTF8_LCASE", 2);
+ assertFindInSet("i\u0307", UTF8String.fromString("ab,i\u0307"), "UNICODE", 2);
+ assertFindInSet("i\u0307", UTF8String.fromString("ab,i\u0307"), "UNICODE_CI", 2);
+ assertFindInSet("i", UTF8String.fromString("ab,i\u0307"), "UTF8_BINARY", 0);
+ assertFindInSet("i", UTF8String.fromString("ab,i\u0307"), "UTF8_LCASE", 0);
+ assertFindInSet("i", UTF8String.fromString("ab,i\u0307"), "UNICODE", 0);
+ assertFindInSet("i", UTF8String.fromString("ab,i\u0307"), "UNICODE_CI", 0);
+ assertFindInSet("İ", UTF8String.fromString("ab,i\u0307"), "UTF8_BINARY", 0);
+ assertFindInSet("İ", UTF8String.fromString("ab,i\u0307"), "UTF8_LCASE", 2);
+ assertFindInSet("İ", UTF8String.fromString("ab,i\u0307"), "UNICODE", 0);
+ assertFindInSet("İ", UTF8String.fromString("ab,i\u0307"), "UNICODE_CI", 2);
+ assertFindInSet("i\u0307", UTF8String.fromString("ab,İ,12"), "UTF8_BINARY", 0);
+ assertFindInSet("i\u0307", UTF8String.fromString("ab,İ,12"), "UTF8_LCASE", 2);
+ assertFindInSet("i\u0307", UTF8String.fromString("ab,İ,12"), "UNICODE", 0);
+ assertFindInSet("i\u0307", UTF8String.fromString("ab,İ,12"), "UNICODE_CI", 2);
+ assertFindInSet("i", UTF8String.fromString("ab,İ,12"), "UTF8_BINARY", 0);
assertFindInSet("i", UTF8String.fromString("ab,İ,12"), "UTF8_LCASE", 0);
- assertFindInSet("i̇", UTF8String.fromString("ab,i̇,12"), "UTF8_LCASE", 2);
- assertFindInSet("i", UTF8String.fromString("ab,i̇,12"), "UTF8_LCASE", 0);
- assertFindInSet("i̇o", UTF8String.fromString("ab,İo,12"), "UTF8_LCASE", 2);
- assertFindInSet("İo", UTF8String.fromString("ab,i̇o,12"), "UTF8_LCASE", 2);
+ assertFindInSet("i", UTF8String.fromString("ab,İ,12"), "UNICODE", 0);
+ assertFindInSet("i", UTF8String.fromString("ab,İ,12"), "UNICODE_CI", 0);
+ assertFindInSet("i\u0307", UTF8String.fromString("ab,i\u0307,12"), "UTF8_BINARY", 2);
+ assertFindInSet("i\u0307", UTF8String.fromString("ab,i\u0307,12"), "UTF8_LCASE", 2);
+ assertFindInSet("i\u0307", UTF8String.fromString("ab,i\u0307,12"), "UNICODE", 2);
+ assertFindInSet("i\u0307", UTF8String.fromString("ab,i\u0307,12"), "UNICODE_CI", 2);
+ assertFindInSet("i", UTF8String.fromString("ab,i\u0307,12"), "UTF8_BINARY", 0);
+ assertFindInSet("i", UTF8String.fromString("ab,i\u0307,12"), "UTF8_LCASE", 0);
+ assertFindInSet("i", UTF8String.fromString("ab,i\u0307,12"), "UNICODE", 0);
+ assertFindInSet("i", UTF8String.fromString("ab,i\u0307,12"), "UNICODE_CI", 0);
+ assertFindInSet("i\u0307o", UTF8String.fromString("ab,İo,12"), "UTF8_BINARY", 0);
+ assertFindInSet("i\u0307o", UTF8String.fromString("ab,İo,12"), "UTF8_LCASE", 2);
+ assertFindInSet("i\u0307o", UTF8String.fromString("ab,İo,12"), "UNICODE", 0);
+ assertFindInSet("i\u0307o", UTF8String.fromString("ab,İo,12"), "UNICODE_CI", 2);
+ assertFindInSet("İo", UTF8String.fromString("ab,i\u0307o,12"), "UTF8_BINARY", 0);
+ assertFindInSet("İo", UTF8String.fromString("ab,i\u0307o,12"), "UTF8_LCASE", 2);
+ assertFindInSet("İo", UTF8String.fromString("ab,i\u0307o,12"), "UNICODE", 0);
+ assertFindInSet("İo", UTF8String.fromString("ab,i\u0307o,12"), "UNICODE_CI", 2);
+ // Conditional case mapping (e.g. Greek sigmas).
+ assertFindInSet("σ", UTF8String.fromString("σ"), "UTF8_BINARY", 1);
+ assertFindInSet("σ", UTF8String.fromString("ς"), "UTF8_BINARY", 0);
+ assertFindInSet("σ", UTF8String.fromString("Σ"), "UTF8_BINARY", 0);
+ assertFindInSet("ς", UTF8String.fromString("σ"), "UTF8_BINARY", 0);
+ assertFindInSet("ς", UTF8String.fromString("ς"), "UTF8_BINARY", 1);
+ assertFindInSet("ς", UTF8String.fromString("Σ"), "UTF8_BINARY", 0);
+ assertFindInSet("Σ", UTF8String.fromString("σ"), "UTF8_BINARY", 0);
+ assertFindInSet("Σ", UTF8String.fromString("ς"), "UTF8_BINARY", 0);
+ assertFindInSet("Σ", UTF8String.fromString("Σ"), "UTF8_BINARY", 1);
+ assertFindInSet("σ", UTF8String.fromString("σ"), "UTF8_LCASE", 1);
+ assertFindInSet("σ", UTF8String.fromString("ς"), "UTF8_LCASE", 1);
+ assertFindInSet("σ", UTF8String.fromString("Σ"), "UTF8_LCASE", 1);
+ assertFindInSet("ς", UTF8String.fromString("σ"), "UTF8_LCASE", 1);
+ assertFindInSet("ς", UTF8String.fromString("ς"), "UTF8_LCASE", 1);
+ assertFindInSet("ς", UTF8String.fromString("Σ"), "UTF8_LCASE", 1);
+ assertFindInSet("Σ", UTF8String.fromString("σ"), "UTF8_LCASE", 1);
+ assertFindInSet("Σ", UTF8String.fromString("ς"), "UTF8_LCASE", 1);
+ assertFindInSet("Σ", UTF8String.fromString("Σ"), "UTF8_LCASE", 1);
+ assertFindInSet("σ", UTF8String.fromString("σ"), "UNICODE", 1);
+ assertFindInSet("σ", UTF8String.fromString("ς"), "UNICODE", 0);
+ assertFindInSet("σ", UTF8String.fromString("Σ"), "UNICODE", 0);
+ assertFindInSet("ς", UTF8String.fromString("σ"), "UNICODE", 0);
+ assertFindInSet("ς", UTF8String.fromString("ς"), "UNICODE", 1);
+ assertFindInSet("ς", UTF8String.fromString("Σ"), "UNICODE", 0);
+ assertFindInSet("Σ", UTF8String.fromString("σ"), "UNICODE", 0);
+ assertFindInSet("Σ", UTF8String.fromString("ς"), "UNICODE", 0);
+ assertFindInSet("Σ", UTF8String.fromString("Σ"), "UNICODE", 1);
+ assertFindInSet("σ", UTF8String.fromString("σ"), "UNICODE_CI", 1);
+ assertFindInSet("σ", UTF8String.fromString("ς"), "UNICODE_CI", 1);
+ assertFindInSet("σ", UTF8String.fromString("Σ"), "UNICODE_CI", 1);
+ assertFindInSet("ς", UTF8String.fromString("σ"), "UNICODE_CI", 1);
+ assertFindInSet("ς", UTF8String.fromString("ς"), "UNICODE_CI", 1);
+ assertFindInSet("ς", UTF8String.fromString("Σ"), "UNICODE_CI", 1);
+ assertFindInSet("Σ", UTF8String.fromString("σ"), "UNICODE_CI", 1);
+ assertFindInSet("Σ", UTF8String.fromString("ς"), "UNICODE_CI", 1);
+ assertFindInSet("Σ", UTF8String.fromString("Σ"), "UNICODE_CI", 1);
+ // Surrogate pairs.
+ assertFindInSet("a", UTF8String.fromString("a🙃,b,🙃c"), "UTF8_BINARY", 0);
+ assertFindInSet("a", UTF8String.fromString("a🙃,b,🙃c"), "UTF8_LCASE", 0);
+ assertFindInSet("a", UTF8String.fromString("a🙃,b,🙃c"), "UNICODE", 0);
+ assertFindInSet("a", UTF8String.fromString("a🙃,b,🙃c"), "UNICODE_CI", 0);
+ assertFindInSet("a🙃", UTF8String.fromString("a🙃,b,🙃c"), "UTF8_BINARY", 1);
+ assertFindInSet("a🙃", UTF8String.fromString("a🙃,b,🙃c"), "UTF8_LCASE", 1);
+ assertFindInSet("a🙃", UTF8String.fromString("a🙃,b,🙃c"), "UNICODE", 1);
+ assertFindInSet("a🙃", UTF8String.fromString("a🙃,b,🙃c"), "UNICODE_CI", 1);
+ assertFindInSet("b", UTF8String.fromString("a🙃,b,🙃c"), "UTF8_BINARY", 2);
+ assertFindInSet("b", UTF8String.fromString("a🙃,b,🙃c"), "UTF8_LCASE", 2);
+ assertFindInSet("b", UTF8String.fromString("a🙃,b,🙃c"), "UNICODE", 2);
+ assertFindInSet("b", UTF8String.fromString("a🙃,b,🙃c"), "UNICODE_CI", 2);
+ assertFindInSet("🙃c", UTF8String.fromString("a🙃,b,🙃c"), "UTF8_BINARY", 3);
+ assertFindInSet("🙃c", UTF8String.fromString("a🙃,b,🙃c"), "UTF8_LCASE", 3);
+ assertFindInSet("🙃c", UTF8String.fromString("a🙃,b,🙃c"), "UNICODE", 3);
+ assertFindInSet("🙃c", UTF8String.fromString("a🙃,b,🙃c"), "UNICODE_CI", 3);
+ assertFindInSet("😄😆", UTF8String.fromString("😀😆,😃😄"), "UTF8_BINARY", 0);
+ assertFindInSet("😄😆", UTF8String.fromString("😀😆,😃😄"), "UTF8_LCASE", 0);
+ assertFindInSet("😄😆", UTF8String.fromString("😀😆,😃😄"), "UNICODE", 0);
+ assertFindInSet("😄😆", UTF8String.fromString("😀😆,😃😄"), "UNICODE_CI", 0);
+ assertFindInSet("😀😆", UTF8String.fromString("😀😆,😃😄"), "UTF8_BINARY", 1);
+ assertFindInSet("😀😆", UTF8String.fromString("😀😆,😃😄"), "UTF8_LCASE", 1);
+ assertFindInSet("😀😆", UTF8String.fromString("😀😆,😃😄"), "UNICODE", 1);
+ assertFindInSet("😀😆", UTF8String.fromString("😀😆,😃😄"), "UNICODE_CI", 1);
+ assertFindInSet("😃😄", UTF8String.fromString("😀😆,😃😄"), "UTF8_BINARY", 2);
+ assertFindInSet("😃😄", UTF8String.fromString("😀😆,😃😄"), "UTF8_LCASE", 2);
+ assertFindInSet("😃😄", UTF8String.fromString("😀😆,😃😄"), "UNICODE", 2);
+ assertFindInSet("😃😄", UTF8String.fromString("😀😆,😃😄"), "UNICODE_CI", 2);
+ assertFindInSet("x", UTF8String.fromString("a,𐐅,𝔸"), "UTF8_BINARY", 0);
+ assertFindInSet("x", UTF8String.fromString("a,𐐅,𝔸"), "UTF8_LCASE", 0);
+ assertFindInSet("x", UTF8String.fromString("a,𐐅,𝔸"), "UNICODE", 0);
+ assertFindInSet("x", UTF8String.fromString("a,𐐅,𝔸"), "UNICODE_CI", 0);
+ assertFindInSet("a", UTF8String.fromString("a,𐐅,𝔸"), "UTF8_BINARY", 1);
+ assertFindInSet("a", UTF8String.fromString("a,𐐅,𝔸"), "UTF8_LCASE", 1);
+ assertFindInSet("a", UTF8String.fromString("a,𐐅,𝔸"), "UNICODE", 1);
+ assertFindInSet("a", UTF8String.fromString("a,𐐅,𝔸"), "UNICODE_CI", 1);
+ assertFindInSet("A", UTF8String.fromString("a,𐐅,𝔸"), "UTF8_BINARY", 0);
+ assertFindInSet("A", UTF8String.fromString("a,𐐅,𝔸"), "UTF8_LCASE", 1);
+ assertFindInSet("A", UTF8String.fromString("a,𐐅,𝔸"), "UNICODE", 0);
+ assertFindInSet("A", UTF8String.fromString("a,𐐅,𝔸"), "UNICODE_CI", 1);
+ assertFindInSet("𝔸", UTF8String.fromString("a,𐐅,𝔸"), "UTF8_BINARY", 3);
+ assertFindInSet("𝔸", UTF8String.fromString("a,𐐅,𝔸"), "UTF8_LCASE", 3);
+ assertFindInSet("𝔸", UTF8String.fromString("a,𐐅,𝔸"), "UNICODE", 3);
+ assertFindInSet("𝔸", UTF8String.fromString("a,𐐅,𝔸"), "UNICODE_CI", 1);
+ assertFindInSet("𐐅", UTF8String.fromString("a,𐐅,𝔸"), "UTF8_BINARY", 2);
+ assertFindInSet("𐐅", UTF8String.fromString("a,𐐅,𝔸"), "UTF8_LCASE", 2);
+ assertFindInSet("𐐅", UTF8String.fromString("a,𐐅,𝔸"), "UNICODE", 2);
+ assertFindInSet("𐐅", UTF8String.fromString("a,𐐅,𝔸"), "UNICODE_CI", 2);
+ assertFindInSet("𐐭", UTF8String.fromString("a,𐐅,𝔸"), "UTF8_BINARY", 0);
+ assertFindInSet("𐐭", UTF8String.fromString("a,𐐅,𝔸"), "UTF8_LCASE", 2);
+ assertFindInSet("𐐭", UTF8String.fromString("a,𐐅,𝔸"), "UNICODE", 0);
+ assertFindInSet("𐐭", UTF8String.fromString("a,𐐅,𝔸"), "UNICODE_CI", 2);
// Invalid UTF8 strings
assertFindInSet("C", UTF8String.fromBytes(
new byte[] { 0x41, (byte) 0xC2, 0x2C, 0x42, 0x2C, 0x43, 0x2C, 0x43, 0x2C, 0x56 }),
@@ -983,147 +1969,581 @@ public void testFindInSet() throws SparkException {
"UNICODE_CI", 2);
}
- private void assertReplace(String source, String search, String replace, String collationName,
- String expected) throws SparkException {
- UTF8String src = UTF8String.fromString(source);
+ /**
+ * Verify the behaviour of the `StringReplace` collation support class.
+ */
+
+ private void assertStringReplace(String source, String search, String replace,
+ String collationName, String expected) throws SparkException {
+ UTF8String src = UTF8String.fromString(source);
UTF8String sear = UTF8String.fromString(search);
UTF8String repl = UTF8String.fromString(replace);
int collationId = CollationFactory.collationNameToId(collationName);
- assertEquals(expected, CollationSupport.StringReplace
- .exec(src, sear, repl, collationId).toString());
+ UTF8String result = CollationSupport.StringReplace.exec(src, sear, repl, collationId);
+ assertEquals(UTF8String.fromString(expected), result);
}
@Test
- public void testReplace() throws SparkException {
- assertReplace("r世eplace", "pl", "123", "UTF8_BINARY", "r世e123ace");
- assertReplace("replace", "pl", "", "UTF8_BINARY", "reace");
- assertReplace("repl世ace", "Pl", "", "UTF8_BINARY", "repl世ace");
- assertReplace("replace", "", "123", "UTF8_BINARY", "replace");
- assertReplace("abcabc", "b", "12", "UTF8_BINARY", "a12ca12c");
- assertReplace("abcdabcd", "bc", "", "UTF8_BINARY", "adad");
- assertReplace("r世eplace", "pl", "xx", "UTF8_LCASE", "r世exxace");
- assertReplace("repl世ace", "PL", "AB", "UTF8_LCASE", "reAB世ace");
- assertReplace("Replace", "", "123", "UTF8_LCASE", "Replace");
- assertReplace("re世place", "世", "x", "UTF8_LCASE", "rexplace");
- assertReplace("abcaBc", "B", "12", "UTF8_LCASE", "a12ca12c");
- assertReplace("AbcdabCd", "Bc", "", "UTF8_LCASE", "Adad");
- assertReplace("re世place", "plx", "123", "UNICODE", "re世place");
- assertReplace("世Replace", "re", "", "UNICODE", "世Replace");
- assertReplace("replace世", "", "123", "UNICODE", "replace世");
- assertReplace("aBc世abc", "b", "12", "UNICODE", "aBc世a12c");
- assertReplace("abcdabcd", "bc", "", "UNICODE", "adad");
- assertReplace("replace", "plx", "123", "UNICODE_CI", "replace");
- assertReplace("Replace", "re", "", "UNICODE_CI", "place");
- assertReplace("replace", "", "123", "UNICODE_CI", "replace");
- assertReplace("aBc世abc", "b", "12", "UNICODE_CI", "a12c世a12c");
- assertReplace("a世Bcdabcd", "bC", "", "UNICODE_CI", "a世dad");
- assertReplace("abi̇12", "i", "X", "UNICODE_CI", "abi̇12");
- assertReplace("abi̇12", "\u0307", "X", "UNICODE_CI", "abi̇12");
- assertReplace("abi̇12", "İ", "X", "UNICODE_CI", "abX12");
- assertReplace("abİ12", "i", "X", "UNICODE_CI", "abİ12");
- assertReplace("İi̇İi̇İi̇", "i̇", "x", "UNICODE_CI", "xxxxxx");
- assertReplace("İi̇İi̇İi̇", "i", "x", "UNICODE_CI", "İi̇İi̇İi̇");
- assertReplace("abİo12i̇o", "i̇o", "xx", "UNICODE_CI", "abxx12xx");
- assertReplace("abi̇o12i̇o", "İo", "yy", "UNICODE_CI", "abyy12yy");
- assertReplace("abi̇12", "i", "X", "UTF8_LCASE", "abX\u030712"); // != UNICODE_CI
- assertReplace("abi̇12", "\u0307", "X", "UTF8_LCASE", "abiX12"); // != UNICODE_CI
- assertReplace("abi̇12", "İ", "X", "UTF8_LCASE", "abX12");
- assertReplace("abİ12", "i", "X", "UTF8_LCASE", "abİ12");
- assertReplace("İi̇İi̇İi̇", "i̇", "x", "UTF8_LCASE", "xxxxxx");
- assertReplace("İi̇İi̇İi̇", "i", "x", "UTF8_LCASE",
+ public void testStringReplace() throws SparkException {
+ // Empty strings.
+ assertStringReplace("", "", "", "UTF8_BINARY", "");
+ assertStringReplace("", "", "", "UTF8_LCASE", "");
+ assertStringReplace("", "", "", "UNICODE", "");
+ assertStringReplace("", "", "", "UNICODE_CI", "");
+ assertStringReplace("abc", "", "", "UTF8_BINARY", "abc");
+ assertStringReplace("abc", "", "", "UTF8_LCASE", "abc");
+ assertStringReplace("abc", "", "", "UNICODE", "abc");
+ assertStringReplace("abc", "", "", "UNICODE_CI", "abc");
+ assertStringReplace("", "x", "", "UTF8_BINARY", "");
+ assertStringReplace("", "x", "", "UTF8_LCASE", "");
+ assertStringReplace("", "x", "", "UNICODE", "");
+ assertStringReplace("", "x", "", "UNICODE_CI", "");
+ assertStringReplace("", "", "x", "UTF8_BINARY", "");
+ assertStringReplace("", "", "x", "UTF8_LCASE", "");
+ assertStringReplace("", "", "x", "UNICODE", "");
+ assertStringReplace("", "", "x", "UNICODE_CI", "");
+ assertStringReplace("", "b", "x", "UTF8_BINARY", "");
+ assertStringReplace("", "b", "x", "UTF8_LCASE", "");
+ assertStringReplace("", "b", "x", "UNICODE", "");
+ assertStringReplace("", "b", "x", "UNICODE_CI", "");
+ assertStringReplace("abc", "b", "", "UTF8_BINARY", "ac");
+ assertStringReplace("abc", "b", "", "UTF8_LCASE", "ac");
+ assertStringReplace("abc", "b", "", "UNICODE", "ac");
+ assertStringReplace("abc", "b", "", "UNICODE_CI", "ac");
+ assertStringReplace("abc", "", "x", "UTF8_BINARY", "abc");
+ assertStringReplace("abc", "", "x", "UTF8_LCASE", "abc");
+ assertStringReplace("abc", "", "x", "UNICODE", "abc");
+ assertStringReplace("abc", "", "x", "UNICODE_CI", "abc");
+ // Basic tests.
+ assertStringReplace("replace", "pl", "", "UTF8_BINARY", "reace");
+ assertStringReplace("replace", "pl", "", "UTF8_LCASE", "reace");
+ assertStringReplace("replace", "pl", "", "UNICODE", "reace");
+ assertStringReplace("replace", "pl", "", "UNICODE_CI", "reace");
+ assertStringReplace("replace", "", "123", "UTF8_BINARY", "replace");
+ assertStringReplace("replace", "", "123", "UTF8_LCASE", "replace");
+ assertStringReplace("replace", "", "123", "UNICODE", "replace");
+ assertStringReplace("replace", "", "123", "UNICODE_CI", "replace");
+ assertStringReplace("abcabc", "b", "12", "UTF8_BINARY", "a12ca12c");
+ assertStringReplace("abcabc", "b", "12", "UTF8_LCASE", "a12ca12c");
+ assertStringReplace("abcabc", "b", "12", "UNICODE", "a12ca12c");
+ assertStringReplace("abcabc", "b", "12", "UNICODE_CI", "a12ca12c");
+ assertStringReplace("replace", "plx", "123", "UTF8_BINARY", "replace");
+ assertStringReplace("replace", "plx", "123", "UTF8_LCASE", "replace");
+ assertStringReplace("replace", "plx", "123", "UNICODE", "replace");
+ assertStringReplace("replace", "plx", "123", "UNICODE_CI", "replace");
+ assertStringReplace("Replace", "re", "", "UTF8_BINARY", "Replace");
+ assertStringReplace("Replace", "re", "", "UTF8_LCASE", "place");
+ assertStringReplace("Replace", "re", "", "UNICODE", "Replace");
+ assertStringReplace("Replace", "re", "", "UNICODE_CI", "place");
+ assertStringReplace("abcdabcd", "Bc", "", "UTF8_BINARY", "abcdabcd");
+ assertStringReplace("abcdabcd", "Bc", "", "UTF8_LCASE", "adad");
+ assertStringReplace("abcdabcd", "Bc", "", "UNICODE", "abcdabcd");
+ assertStringReplace("abcdabcd", "Bc", "", "UNICODE_CI", "adad");
+ assertStringReplace("AbcdabCd", "Bc", "", "UTF8_BINARY", "AbcdabCd");
+ assertStringReplace("AbcdabCd", "Bc", "", "UTF8_LCASE", "Adad");
+ assertStringReplace("AbcdabCd", "Bc", "", "UNICODE", "AbcdabCd");
+ assertStringReplace("AbcdabCd", "Bc", "", "UNICODE_CI", "Adad");
+ // Advanced tests.
+ assertStringReplace("abcdabcd", "bc", "", "UTF8_BINARY", "adad");
+ assertStringReplace("r世eplace", "pl", "123", "UTF8_BINARY", "r世e123ace");
+ assertStringReplace("世Replace", "re", "", "UTF8_BINARY", "世Replace");
+ assertStringReplace("r世eplace", "pl", "xx", "UTF8_LCASE", "r世exxace");
+ assertStringReplace("repl世ace", "PL", "AB", "UTF8_LCASE", "reAB世ace");
+ assertStringReplace("re世place", "世", "x", "UTF8_LCASE", "rexplace");
+ assertStringReplace("re世place", "plx", "123", "UNICODE", "re世place");
+ assertStringReplace("replace世", "", "123", "UNICODE", "replace世");
+ assertStringReplace("aBc世abc", "b", "12", "UNICODE", "aBc世a12c");
+ assertStringReplace("aBc世abc", "b", "12", "UNICODE_CI", "a12c世a12c");
+ assertStringReplace("a世Bcdabcd", "bC", "", "UNICODE_CI", "a世dad");
+ assertStringReplace("repl世ace", "Pl", "", "UNICODE_CI", "re世ace");
+ assertStringReplace("abcčšdabĆŠscd", "cs", "", "SR_CI_AI", "abcdabscd");
+ // One-to-many case mapping (e.g. Turkish dotted I).
+ assertStringReplace("abi̇12", "i", "X", "UNICODE_CI", "abi̇12");
+ assertStringReplace("abi̇12", "\u0307", "X", "UNICODE_CI", "abi̇12");
+ assertStringReplace("abi̇12", "İ", "X", "UNICODE_CI", "abX12");
+ assertStringReplace("abİ12", "i", "X", "UNICODE_CI", "abİ12");
+ assertStringReplace("İi̇İi̇İi̇", "i\u0307", "x", "UNICODE_CI", "xxxxxx");
+ assertStringReplace("İi̇İi̇İi̇", "i", "x", "UNICODE_CI", "İi̇İi̇İi̇");
+ assertStringReplace("abİo12i̇o", "i\u0307o", "xx", "UNICODE_CI", "abxx12xx");
+ assertStringReplace("abi̇o12i̇o", "İo", "yy", "UNICODE_CI", "abyy12yy");
+ assertStringReplace("abi̇12", "i", "X", "UTF8_LCASE", "abX\u030712"); // != UNICODE_CI
+ assertStringReplace("abi̇12", "\u0307", "X", "UTF8_LCASE", "abiX12"); // != UNICODE_CI
+ assertStringReplace("abi̇12", "İ", "X", "UTF8_LCASE", "abX12");
+ assertStringReplace("abİ12", "i", "X", "UTF8_LCASE", "abİ12");
+ assertStringReplace("İi̇İi̇İi̇", "i\u0307", "x", "UTF8_LCASE", "xxxxxx");
+ assertStringReplace("İi̇İi̇İi̇", "i", "x", "UTF8_LCASE",
"İx\u0307İx\u0307İx\u0307"); // != UNICODE_CI
- assertReplace("abİo12i̇o", "i̇o", "xx", "UTF8_LCASE", "abxx12xx");
- assertReplace("abi̇o12i̇o", "İo", "yy", "UTF8_LCASE", "abyy12yy");
+ assertStringReplace("abİo12i̇o", "i\u0307o", "xx", "UTF8_LCASE", "abxx12xx");
+ assertStringReplace("abi̇o12i̇o", "İo", "yy", "UTF8_LCASE", "abyy12yy");
+ // Conditional case mapping (e.g. Greek sigmas).
+ assertStringReplace("σ", "σ", "x", "UTF8_BINARY", "x");
+ assertStringReplace("σ", "ς", "x", "UTF8_BINARY", "σ");
+ assertStringReplace("σ", "Σ", "x", "UTF8_BINARY", "σ");
+ assertStringReplace("ς", "σ", "x", "UTF8_BINARY", "ς");
+ assertStringReplace("ς", "ς", "x", "UTF8_BINARY", "x");
+ assertStringReplace("ς", "Σ", "x", "UTF8_BINARY", "ς");
+ assertStringReplace("Σ", "σ", "x", "UTF8_BINARY", "Σ");
+ assertStringReplace("Σ", "ς", "x", "UTF8_BINARY", "Σ");
+ assertStringReplace("Σ", "Σ", "x", "UTF8_BINARY", "x");
+ assertStringReplace("σ", "σ", "x", "UTF8_LCASE", "x");
+ assertStringReplace("σ", "ς", "x", "UTF8_LCASE", "x");
+ assertStringReplace("σ", "Σ", "x", "UTF8_LCASE", "x");
+ assertStringReplace("ς", "σ", "x", "UTF8_LCASE", "x");
+ assertStringReplace("ς", "ς", "x", "UTF8_LCASE", "x");
+ assertStringReplace("ς", "Σ", "x", "UTF8_LCASE", "x");
+ assertStringReplace("Σ", "σ", "x", "UTF8_LCASE", "x");
+ assertStringReplace("Σ", "ς", "x", "UTF8_LCASE", "x");
+ assertStringReplace("Σ", "Σ", "x", "UTF8_LCASE", "x");
+ assertStringReplace("σ", "σ", "x", "UNICODE", "x");
+ assertStringReplace("σ", "ς", "x", "UNICODE", "σ");
+ assertStringReplace("σ", "Σ", "x", "UNICODE", "σ");
+ assertStringReplace("ς", "σ", "x", "UNICODE", "ς");
+ assertStringReplace("ς", "ς", "x", "UNICODE", "x");
+ assertStringReplace("ς", "Σ", "x", "UNICODE", "ς");
+ assertStringReplace("Σ", "σ", "x", "UNICODE", "Σ");
+ assertStringReplace("Σ", "ς", "x", "UNICODE", "Σ");
+ assertStringReplace("Σ", "Σ", "x", "UNICODE", "x");
+ assertStringReplace("σ", "σ", "x", "UNICODE_CI", "x");
+ assertStringReplace("σ", "ς", "x", "UNICODE_CI", "x");
+ assertStringReplace("σ", "Σ", "x", "UNICODE_CI", "x");
+ assertStringReplace("ς", "σ", "x", "UNICODE_CI", "x");
+ assertStringReplace("ς", "ς", "x", "UNICODE_CI", "x");
+ assertStringReplace("ς", "Σ", "x", "UNICODE_CI", "x");
+ assertStringReplace("Σ", "σ", "x", "UNICODE_CI", "x");
+ assertStringReplace("Σ", "ς", "x", "UNICODE_CI", "x");
+ assertStringReplace("Σ", "Σ", "x", "UNICODE_CI", "x");
+ // Surrogate pairs.
+ assertStringReplace("a🙃b", "a", "x", "UTF8_BINARY", "x🙃b");
+ assertStringReplace("a🙃b", "b", "x", "UTF8_BINARY", "a🙃x");
+ assertStringReplace("a🙃b", "🙃", "x", "UTF8_BINARY", "axb");
+ assertStringReplace("a🙃b", "b", "c", "UTF8_LCASE", "a🙃c");
+ assertStringReplace("a🙃b", "b", "x", "UTF8_LCASE", "a🙃x");
+ assertStringReplace("a🙃b", "🙃", "x", "UTF8_LCASE", "axb");
+ assertStringReplace("a🙃b", "b", "c", "UNICODE", "a🙃c");
+ assertStringReplace("a🙃b", "b", "x", "UNICODE", "a🙃x");
+ assertStringReplace("a🙃b", "🙃", "x", "UNICODE", "axb");
+ assertStringReplace("a🙃b", "b", "c", "UNICODE_CI", "a🙃c");
+ assertStringReplace("a🙃b", "b", "x", "UNICODE_CI", "a🙃x");
+ assertStringReplace("a🙃b", "🙃", "x", "UNICODE_CI", "axb");
}
- private void assertLocate(String substring, String string, Integer start, String collationName,
- Integer expected) throws SparkException {
+ /**
+ * Verify the behaviour of the `StringLocate` collation support class.
+ */
+
+ private void assertStringLocate(String substring, String string, int start,
+ String collationName, int expected) throws SparkException {
+ // Note: When using start < 1, be careful to understand the behavior of the `indexOf`
+ // method and the implications of using `indexOf` in the `StringLocate` case class.
UTF8String substr = UTF8String.fromString(substring);
UTF8String str = UTF8String.fromString(string);
int collationId = CollationFactory.collationNameToId(collationName);
- assertEquals(expected, CollationSupport.StringLocate.exec(str, substr,
- start - 1, collationId) + 1);
+ int result = CollationSupport.StringLocate.exec(str, substr, start - 1, collationId) + 1;
+ assertEquals(expected, result);
}
@Test
- public void testLocate() throws SparkException {
- // If you add tests with start < 1 be careful to understand the behavior of the indexOf method
- // and usage of indexOf in the StringLocate class.
- assertLocate("aa", "aaads", 1, "UTF8_BINARY", 1);
- assertLocate("aa", "aaads", 2, "UTF8_BINARY", 2);
- assertLocate("aa", "aaads", 3, "UTF8_BINARY", 0);
- assertLocate("Aa", "aaads", 1, "UTF8_BINARY", 0);
- assertLocate("Aa", "aAads", 1, "UTF8_BINARY", 2);
- assertLocate("界x", "test大千世界X大千世界", 1, "UTF8_BINARY", 0);
- assertLocate("界X", "test大千世界X大千世界", 1, "UTF8_BINARY", 8);
- assertLocate("界", "test大千世界X大千世界", 13, "UTF8_BINARY", 13);
- assertLocate("AA", "aaads", 1, "UTF8_LCASE", 1);
- assertLocate("aa", "aAads", 2, "UTF8_LCASE", 2);
- assertLocate("aa", "aaAds", 3, "UTF8_LCASE", 0);
- assertLocate("abC", "abcabc", 1, "UTF8_LCASE", 1);
- assertLocate("abC", "abCabc", 2, "UTF8_LCASE", 4);
- assertLocate("abc", "abcabc", 4, "UTF8_LCASE", 4);
- assertLocate("界x", "test大千世界X大千世界", 1, "UTF8_LCASE", 8);
- assertLocate("界X", "test大千世界Xtest大千世界", 1, "UTF8_LCASE", 8);
- assertLocate("界", "test大千世界X大千世界", 13, "UTF8_LCASE", 13);
- assertLocate("大千", "test大千世界大千世界", 1, "UTF8_LCASE", 5);
- assertLocate("大千", "test大千世界大千世界", 9, "UTF8_LCASE", 9);
- assertLocate("大千", "大千世界大千世界", 1, "UTF8_LCASE", 1);
- assertLocate("aa", "Aaads", 1, "UNICODE", 2);
- assertLocate("AA", "aaads", 1, "UNICODE", 0);
- assertLocate("aa", "aAads", 2, "UNICODE", 0);
- assertLocate("aa", "aaAds", 3, "UNICODE", 0);
- assertLocate("abC", "abcabc", 1, "UNICODE", 0);
- assertLocate("abC", "abCabc", 2, "UNICODE", 0);
- assertLocate("abC", "abCabC", 2, "UNICODE", 4);
- assertLocate("abc", "abcabc", 1, "UNICODE", 1);
- assertLocate("abc", "abcabc", 3, "UNICODE", 4);
- assertLocate("界x", "test大千世界X大千世界", 1, "UNICODE", 0);
- assertLocate("界X", "test大千世界X大千世界", 1, "UNICODE", 8);
- assertLocate("界", "test大千世界X大千世界", 13, "UNICODE", 13);
- assertLocate("AA", "aaads", 1, "UNICODE_CI", 1);
- assertLocate("aa", "aAads", 2, "UNICODE_CI", 2);
- assertLocate("aa", "aaAds", 3, "UNICODE_CI", 0);
- assertLocate("abC", "abcabc", 1, "UNICODE_CI", 1);
- assertLocate("abC", "abCabc", 2, "UNICODE_CI", 4);
- assertLocate("abc", "abcabc", 4, "UNICODE_CI", 4);
- assertLocate("界x", "test大千世界X大千世界", 1, "UNICODE_CI", 8);
- assertLocate("界", "test大千世界X大千世界", 13, "UNICODE_CI", 13);
- assertLocate("大千", "test大千世界大千世界", 1, "UNICODE_CI", 5);
- assertLocate("大千", "test大千世界大千世界", 9, "UNICODE_CI", 9);
- assertLocate("大千", "大千世界大千世界", 1, "UNICODE_CI", 1);
- // Case-variable character length
- assertLocate("\u0307", "i̇", 1, "UTF8_BINARY", 2);
- assertLocate("\u0307", "İ", 1, "UTF8_LCASE", 0); // != UTF8_BINARY
- assertLocate("i", "i̇", 1, "UNICODE_CI", 0);
- assertLocate("\u0307", "i̇", 1, "UNICODE_CI", 0);
- assertLocate("i̇", "i", 1, "UNICODE_CI", 0);
- assertLocate("İ", "i̇", 1, "UNICODE_CI", 1);
- assertLocate("İ", "i", 1, "UNICODE_CI", 0);
- assertLocate("i", "i̇", 1, "UTF8_LCASE", 1); // != UNICODE_CI
- assertLocate("\u0307", "i̇", 1, "UTF8_LCASE", 2); // != UNICODE_CI
- assertLocate("i̇", "i", 1, "UTF8_LCASE", 0);
- assertLocate("İ", "i̇", 1, "UTF8_LCASE", 1);
- assertLocate("İ", "i", 1, "UTF8_LCASE", 0);
- assertLocate("i̇o", "İo世界大千世界", 1, "UNICODE_CI", 1);
- assertLocate("i̇o", "大千İo世界大千世界", 1, "UNICODE_CI", 3);
- assertLocate("i̇o", "世界İo大千世界大千İo", 4, "UNICODE_CI", 11);
- assertLocate("İo", "i̇o世界大千世界", 1, "UNICODE_CI", 1);
- assertLocate("İo", "大千i̇o世界大千世界", 1, "UNICODE_CI", 3);
- assertLocate("İo", "世界i̇o大千世界大千i̇o", 4, "UNICODE_CI", 12);
+ public void testStringLocate() throws SparkException {
+ // Empty strings.
+ assertStringLocate("", "", -1, "UTF8_BINARY", 1);
+ assertStringLocate("", "", -1, "UTF8_LCASE", 1);
+ assertStringLocate("", "", -1, "UNICODE", 1);
+ assertStringLocate("", "", -1, "UNICODE_CI", 1);
+ assertStringLocate("", "", 0, "UTF8_BINARY", 1);
+ assertStringLocate("", "", 0, "UTF8_LCASE", 1);
+ assertStringLocate("", "", 0, "UNICODE", 1);
+ assertStringLocate("", "", 0, "UNICODE_CI", 1);
+ assertStringLocate("", "", 1, "UTF8_BINARY", 1);
+ assertStringLocate("", "", 1, "UTF8_LCASE", 1);
+ assertStringLocate("", "", 1, "UNICODE", 1);
+ assertStringLocate("", "", 1, "UNICODE_CI", 1);
+ assertStringLocate("a", "", -1, "UTF8_BINARY", 0);
+ assertStringLocate("a", "", -1, "UTF8_LCASE", 0);
+ assertStringLocate("a", "", -1, "UNICODE", 0);
+ assertStringLocate("a", "", -1, "UNICODE_CI", 0);
+ assertStringLocate("a", "", 0, "UTF8_BINARY", 0);
+ assertStringLocate("a", "", 0, "UTF8_LCASE", 0);
+ assertStringLocate("a", "", 0, "UNICODE", 0);
+ assertStringLocate("a", "", 0, "UNICODE_CI", 0);
+ assertStringLocate("a", "", 1, "UTF8_BINARY", 0);
+ assertStringLocate("a", "", 1, "UTF8_LCASE", 0);
+ assertStringLocate("a", "", 1, "UNICODE", 0);
+ assertStringLocate("a", "", 1, "UNICODE_CI", 0);
+ assertStringLocate("", "x", -1, "UTF8_BINARY", 1);
+ assertStringLocate("", "x", -1, "UTF8_LCASE", 1);
+ assertStringLocate("", "x", -1, "UNICODE", 1);
+ assertStringLocate("", "x", -1, "UNICODE_CI", 1);
+ assertStringLocate("", "x", 0, "UTF8_BINARY", 1);
+ assertStringLocate("", "x", 0, "UTF8_LCASE", 1);
+ assertStringLocate("", "x", 0, "UNICODE", 1);
+ assertStringLocate("", "x", 0, "UNICODE_CI", 1);
+ assertStringLocate("", "x", 1, "UTF8_BINARY", 1);
+ assertStringLocate("", "x", 1, "UTF8_LCASE", 1);
+ assertStringLocate("", "x", 1, "UNICODE", 1);
+ assertStringLocate("", "x", 1, "UNICODE_CI", 1);
+ // Basic tests.
+ assertStringLocate("aa", "aaads", 1, "UTF8_BINARY", 1);
+ assertStringLocate("aa", "aaads", 1, "UTF8_LCASE", 1);
+ assertStringLocate("aa", "aaads", 1, "UNICODE", 1);
+ assertStringLocate("aa", "aaads", 1, "UNICODE_CI", 1);
+ assertStringLocate("aa", "aaads", 2, "UTF8_BINARY", 2);
+ assertStringLocate("aa", "aaads", 2, "UTF8_LCASE", 2);
+ assertStringLocate("aa", "aaads", 2, "UNICODE", 2);
+ assertStringLocate("aa", "aaads", 2, "UNICODE_CI", 2);
+ assertStringLocate("aa", "aaads", 3, "UTF8_BINARY", 0);
+ assertStringLocate("aa", "aaads", 3, "UTF8_LCASE", 0);
+ assertStringLocate("aa", "aaads", 3, "UNICODE", 0);
+ assertStringLocate("aa", "aaads", 3, "UNICODE_CI", 0);
+ assertStringLocate("Aa", "aaads", 1, "UTF8_BINARY", 0);
+ assertStringLocate("Aa", "aaads", 1, "UTF8_LCASE", 1);
+ assertStringLocate("Aa", "aaads", 1, "UNICODE", 0);
+ assertStringLocate("Aa", "aaads", 1, "UNICODE_CI", 1);
+ assertStringLocate("Aa", "aaads", 2, "UTF8_BINARY", 0);
+ assertStringLocate("Aa", "aaads", 2, "UTF8_LCASE", 2);
+ assertStringLocate("Aa", "aaads", 2, "UNICODE", 0);
+ assertStringLocate("Aa", "aaads", 2, "UNICODE_CI", 2);
+ assertStringLocate("Aa", "aaads", 3, "UTF8_BINARY", 0);
+ assertStringLocate("Aa", "aaads", 3, "UTF8_LCASE", 0);
+ assertStringLocate("Aa", "aaads", 3, "UNICODE", 0);
+ assertStringLocate("Aa", "aaads", 3, "UNICODE_CI", 0);
+ assertStringLocate("Aa", "aAads", 1, "UTF8_BINARY", 2);
+ assertStringLocate("Aa", "aAads", 1, "UTF8_LCASE", 1);
+ assertStringLocate("Aa", "aAads", 1, "UNICODE", 2);
+ assertStringLocate("Aa", "aAads", 1, "UNICODE_CI", 1);
+ assertStringLocate("AA", "aaads", 1, "UTF8_BINARY", 0);
+ assertStringLocate("AA", "aaads", 1, "UTF8_LCASE", 1);
+ assertStringLocate("AA", "aaads", 1, "UNICODE", 0);
+ assertStringLocate("AA", "aaads", 1, "UNICODE_CI", 1);
+ assertStringLocate("aa", "aAads", 2, "UTF8_BINARY", 0);
+ assertStringLocate("aa", "aAads", 2, "UTF8_LCASE", 2);
+ assertStringLocate("aa", "aAads", 2, "UNICODE", 0);
+ assertStringLocate("aa", "aAads", 2, "UNICODE_CI", 2);
+ assertStringLocate("aa", "aaAds", 3, "UTF8_BINARY", 0);
+ assertStringLocate("aa", "aaAds", 3, "UTF8_LCASE", 0);
+ assertStringLocate("aa", "aaAds", 3, "UNICODE", 0);
+ assertStringLocate("aa", "aaAds", 3, "UNICODE_CI", 0);
+ assertStringLocate("abC", "abcabc", 1, "UTF8_BINARY", 0);
+ assertStringLocate("abC", "abcabc", 1, "UTF8_LCASE", 1);
+ assertStringLocate("abC", "abcabc", 1, "UNICODE", 0);
+ assertStringLocate("abC", "abcabc", 1, "UNICODE_CI", 1);
+ assertStringLocate("abC", "abCabc", 2, "UTF8_BINARY", 0);
+ assertStringLocate("abC", "abCabc", 2, "UTF8_LCASE", 4);
+ assertStringLocate("abC", "abCabc", 2, "UNICODE", 0);
+ assertStringLocate("abC", "abCabc", 2, "UNICODE_CI", 4);
+ assertStringLocate("abc", "abcabc", 1, "UTF8_BINARY", 1);
+ assertStringLocate("abc", "abcabc", 1, "UTF8_LCASE", 1);
+ assertStringLocate("abc", "abcabc", 1, "UNICODE", 1);
+ assertStringLocate("abc", "abcabc", 1, "UNICODE_CI", 1);
+ assertStringLocate("abc", "abcabc", 2, "UTF8_BINARY", 4);
+ assertStringLocate("abc", "abcabc", 2, "UTF8_LCASE", 4);
+ assertStringLocate("abc", "abcabc", 2, "UNICODE", 4);
+ assertStringLocate("abc", "abcabc", 2, "UNICODE_CI", 4);
+ assertStringLocate("abc", "abcabc", 3, "UTF8_BINARY", 4);
+ assertStringLocate("abc", "abcabc", 3, "UTF8_LCASE", 4);
+ assertStringLocate("abc", "abcabc", 3, "UNICODE", 4);
+ assertStringLocate("abc", "abcabc", 3, "UNICODE_CI", 4);
+ assertStringLocate("abc", "abcabc", 4, "UTF8_BINARY", 4);
+ assertStringLocate("abc", "abcabc", 4, "UTF8_LCASE", 4);
+ assertStringLocate("abc", "abcabc", 4, "UNICODE", 4);
+ assertStringLocate("abc", "abcabc", 4, "UNICODE_CI", 4);
+ assertStringLocate("aa", "Aaads", 1, "UTF8_BINARY", 2);
+ assertStringLocate("aa", "Aaads", 1, "UTF8_LCASE", 1);
+ assertStringLocate("aa", "Aaads", 1, "UNICODE", 2);
+ assertStringLocate("aa", "Aaads", 1, "UNICODE_CI", 1);
+ assertStringLocate("ćČ", "CćČČćCČĆČcČcććČč", 3, "SR", 14);
+ assertStringLocate("ćČ", "CćČČćCČĆČcČcććČč", 3, "SR_CI_AI", 3);
+ // Advanced tests.
+ assertStringLocate("界x", "test大千世界X大千世界", 1, "UTF8_BINARY", 0);
+ assertStringLocate("界X", "test大千世界X大千世界", 1, "UTF8_BINARY", 8);
+ assertStringLocate("界", "test大千世界X大千世界", 13, "UTF8_BINARY", 13);
+ assertStringLocate("界x", "test大千世界X大千世界", 1, "UTF8_LCASE", 8);
+ assertStringLocate("界X", "test大千世界Xtest大千世界", 1, "UTF8_LCASE", 8);
+ assertStringLocate("界", "test大千世界X大千世界", 13, "UTF8_LCASE", 13);
+ assertStringLocate("大千", "test大千世界大千世界", 1, "UTF8_LCASE", 5);
+ assertStringLocate("大千", "test大千世界大千世界", 9, "UTF8_LCASE", 9);
+ assertStringLocate("大千", "大千世界大千世界", 1, "UTF8_LCASE", 1);
+ assertStringLocate("界x", "test大千世界X大千世界", 1, "UNICODE", 0);
+ assertStringLocate("界X", "test大千世界X大千世界", 1, "UNICODE", 8);
+ assertStringLocate("界", "test大千世界X大千世界", 13, "UNICODE", 13);
+ assertStringLocate("界x", "test大千世界X大千世界", 1, "UNICODE_CI", 8);
+ assertStringLocate("界", "test大千世界X大千世界", 13, "UNICODE_CI", 13);
+ assertStringLocate("大千", "test大千世界大千世界", 1, "UNICODE_CI", 5);
+ assertStringLocate("大千", "test大千世界大千世界", 9, "UNICODE_CI", 9);
+ assertStringLocate("大千", "大千世界大千世界", 1, "UNICODE_CI", 1);
+ // One-to-many case mapping (e.g. Turkish dotted I).
+ assertStringLocate("\u0307", "i\u0307", 1, "UTF8_BINARY", 2);
+ assertStringLocate("\u0307", "İ", 1, "UTF8_LCASE", 0); // != UTF8_BINARY
+ assertStringLocate("i", "i\u0307", 1, "UNICODE_CI", 0);
+ assertStringLocate("\u0307", "i\u0307", 1, "UNICODE_CI", 0);
+ assertStringLocate("i\u0307", "i", 1, "UNICODE_CI", 0);
+ assertStringLocate("İ", "i\u0307", 1, "UNICODE_CI", 1);
+ assertStringLocate("İ", "i", 1, "UNICODE_CI", 0);
+ assertStringLocate("i", "i\u0307", 1, "UTF8_LCASE", 1); // != UNICODE_CI
+ assertStringLocate("\u0307", "i\u0307", 1, "UTF8_LCASE", 2); // != UNICODE_CI
+ assertStringLocate("i\u0307", "i", 1, "UTF8_LCASE", 0);
+ assertStringLocate("İ", "i\u0307", 1, "UTF8_LCASE", 1);
+ assertStringLocate("İ", "i", 1, "UTF8_LCASE", 0);
+ assertStringLocate("i\u0307o", "İo世界大千世界", 1, "UNICODE_CI", 1);
+ assertStringLocate("i\u0307o", "大千İo世界大千世界", 1, "UNICODE_CI", 3);
+ assertStringLocate("i\u0307o", "世界İo大千世界大千İo", 4, "UNICODE_CI", 11);
+ assertStringLocate("İo", "i̇o世界大千世界", 1, "UNICODE_CI", 1);
+ assertStringLocate("İo", "大千i̇o世界大千世界", 1, "UNICODE_CI", 3);
+ assertStringLocate("İo", "世界i̇o大千世界大千i̇o", 4, "UNICODE_CI", 12);
+ // Conditional case mapping (e.g. Greek sigmas).
+ assertStringLocate("σ", "σ", 1, "UTF8_BINARY", 1);
+ assertStringLocate("σ", "ς", 1, "UTF8_BINARY", 0);
+ assertStringLocate("σ", "Σ", 1, "UTF8_BINARY", 0);
+ assertStringLocate("ς", "σ", 1, "UTF8_BINARY", 0);
+ assertStringLocate("ς", "ς", 1, "UTF8_BINARY", 1);
+ assertStringLocate("ς", "Σ", 1, "UTF8_BINARY", 0);
+ assertStringLocate("Σ", "σ", 1, "UTF8_BINARY", 0);
+ assertStringLocate("Σ", "ς", 1, "UTF8_BINARY", 0);
+ assertStringLocate("Σ", "Σ", 1, "UTF8_BINARY", 1);
+ assertStringLocate("σ", "σ", 1, "UTF8_LCASE", 1);
+ assertStringLocate("σ", "ς", 1, "UTF8_LCASE", 1);
+ assertStringLocate("σ", "Σ", 1, "UTF8_LCASE", 1);
+ assertStringLocate("ς", "σ", 1, "UTF8_LCASE", 1);
+ assertStringLocate("ς", "ς", 1, "UTF8_LCASE", 1);
+ assertStringLocate("ς", "Σ", 1, "UTF8_LCASE", 1);
+ assertStringLocate("Σ", "σ", 1, "UTF8_LCASE", 1);
+ assertStringLocate("Σ", "ς", 1, "UTF8_LCASE", 1);
+ assertStringLocate("Σ", "Σ", 1, "UTF8_LCASE", 1);
+ assertStringLocate("σ", "σ", 1, "UNICODE", 1);
+ assertStringLocate("σ", "ς", 1, "UNICODE", 0);
+ assertStringLocate("σ", "Σ", 1, "UNICODE", 0);
+ assertStringLocate("ς", "σ", 1, "UNICODE", 0);
+ assertStringLocate("ς", "ς", 1, "UNICODE", 1);
+ assertStringLocate("ς", "Σ", 1, "UNICODE", 0);
+ assertStringLocate("Σ", "σ", 1, "UNICODE", 0);
+ assertStringLocate("Σ", "ς", 1, "UNICODE", 0);
+ assertStringLocate("Σ", "Σ", 1, "UNICODE", 1);
+ assertStringLocate("σ", "σ", 1, "UNICODE_CI", 1);
+ assertStringLocate("σ", "ς", 1, "UNICODE_CI", 1);
+ assertStringLocate("σ", "Σ", 1, "UNICODE_CI", 1);
+ assertStringLocate("ς", "σ", 1, "UNICODE_CI", 1);
+ assertStringLocate("ς", "ς", 1, "UNICODE_CI", 1);
+ assertStringLocate("ς", "Σ", 1, "UNICODE_CI", 1);
+ assertStringLocate("Σ", "σ", 1, "UNICODE_CI", 1);
+ assertStringLocate("Σ", "ς", 1, "UNICODE_CI", 1);
+ assertStringLocate("Σ", "Σ", 1, "UNICODE_CI", 1);
+ // Surrogate pairs.
+ assertStringLocate("a", "a🙃b", 1, "UTF8_BINARY", 1);
+ assertStringLocate("a", "a🙃b", 1, "UTF8_LCASE", 1);
+ assertStringLocate("a", "a🙃b", 1, "UNICODE", 1);
+ assertStringLocate("a", "a🙃b", 1, "UNICODE_CI", 1);
+ assertStringLocate("a", "a🙃b", 2, "UTF8_BINARY", 0);
+ assertStringLocate("a", "a🙃b", 2, "UTF8_LCASE", 0);
+ assertStringLocate("a", "a🙃b", 2, "UNICODE", 0);
+ assertStringLocate("a", "a🙃b", 2, "UNICODE_CI", 0);
+ assertStringLocate("a", "a🙃b", 3, "UTF8_BINARY", 0);
+ assertStringLocate("a", "a🙃b", 3, "UTF8_LCASE", 0);
+ assertStringLocate("a", "a🙃b", 3, "UNICODE", 0);
+ assertStringLocate("a", "a🙃b", 3, "UNICODE_CI", 0);
+ assertStringLocate("🙃", "a🙃b", 1, "UTF8_BINARY", 2);
+ assertStringLocate("🙃", "a🙃b", 1, "UTF8_LCASE", 2);
+ assertStringLocate("🙃", "a🙃b", 1, "UNICODE", 2);
+ assertStringLocate("🙃", "a🙃b", 1, "UNICODE_CI", 2);
+ assertStringLocate("🙃", "a🙃b", 2, "UTF8_BINARY", 2);
+ assertStringLocate("🙃", "a🙃b", 2, "UTF8_LCASE", 2);
+ assertStringLocate("🙃", "a🙃b", 2, "UNICODE", 2);
+ assertStringLocate("🙃", "a🙃b", 2, "UNICODE_CI", 2);
+ assertStringLocate("🙃", "a🙃b", 3, "UTF8_BINARY", 0);
+ assertStringLocate("🙃", "a🙃b", 3, "UTF8_LCASE", 0);
+ assertStringLocate("🙃", "a🙃b", 3, "UNICODE", 0);
+ assertStringLocate("🙃", "a🙃b", 3, "UNICODE_CI", 0);
+ assertStringLocate("b", "a🙃b", 1, "UTF8_BINARY", 3);
+ assertStringLocate("b", "a🙃b", 1, "UTF8_LCASE", 3);
+ assertStringLocate("b", "a🙃b", 1, "UNICODE", 3);
+ assertStringLocate("b", "a🙃b", 1, "UNICODE_CI", 3);
+ assertStringLocate("b", "a🙃b", 2, "UTF8_BINARY", 3);
+ assertStringLocate("b", "a🙃b", 2, "UTF8_LCASE", 3);
+ assertStringLocate("b", "a🙃b", 2, "UNICODE", 3);
+ assertStringLocate("b", "a🙃b", 2, "UNICODE_CI", 3);
+ assertStringLocate("b", "a🙃b", 3, "UTF8_BINARY", 3);
+ assertStringLocate("b", "a🙃b", 3, "UTF8_LCASE", 3);
+ assertStringLocate("b", "a🙃b", 3, "UNICODE", 3);
+ assertStringLocate("b", "a🙃b", 3, "UNICODE_CI", 3);
+ assertStringLocate("🙃", "a🙃🙃b", 1, "UTF8_BINARY", 2);
+ assertStringLocate("🙃", "a🙃🙃b", 1, "UTF8_LCASE", 2);
+ assertStringLocate("🙃", "a🙃🙃b", 1, "UNICODE", 2);
+ assertStringLocate("🙃", "a🙃🙃b", 1, "UNICODE_CI", 2);
+ assertStringLocate("🙃", "a🙃🙃b", 2, "UTF8_BINARY", 2);
+ assertStringLocate("🙃", "a🙃🙃b", 2, "UTF8_LCASE", 2);
+ assertStringLocate("🙃", "a🙃🙃b", 2, "UNICODE", 2);
+ assertStringLocate("🙃", "a🙃🙃b", 2, "UNICODE_CI", 2);
+ assertStringLocate("🙃", "a🙃🙃b", 3, "UTF8_BINARY", 3);
+ assertStringLocate("🙃", "a🙃🙃b", 3, "UTF8_LCASE", 3);
+ assertStringLocate("🙃", "a🙃🙃b", 3, "UNICODE", 3);
+ assertStringLocate("🙃", "a🙃🙃b", 3, "UNICODE_CI", 3);
+ assertStringLocate("🙃", "a🙃🙃b", 4, "UTF8_BINARY", 0);
+ assertStringLocate("🙃", "a🙃🙃b", 4, "UTF8_LCASE", 0);
+ assertStringLocate("🙃", "a🙃🙃b", 4, "UNICODE", 0);
+ assertStringLocate("🙃", "a🙃🙃b", 4, "UNICODE_CI", 0);
+ assertStringLocate("b", "a🙃🙃b", 1, "UTF8_BINARY", 4);
+ assertStringLocate("b", "a🙃🙃b", 1, "UTF8_LCASE", 4);
+ assertStringLocate("b", "a🙃🙃b", 1, "UNICODE", 4);
+ assertStringLocate("b", "a🙃🙃b", 1, "UNICODE_CI", 4);
+ assertStringLocate("b", "a🙃🙃b", 2, "UTF8_BINARY", 4);
+ assertStringLocate("b", "a🙃🙃b", 2, "UTF8_LCASE", 4);
+ assertStringLocate("b", "a🙃🙃b", 2, "UNICODE", 4);
+ assertStringLocate("b", "a🙃🙃b", 2, "UNICODE_CI", 4);
+ assertStringLocate("b", "a🙃🙃b", 3, "UTF8_BINARY", 4);
+ assertStringLocate("b", "a🙃🙃b", 3, "UTF8_LCASE", 4);
+ assertStringLocate("b", "a🙃🙃b", 3, "UNICODE", 4);
+ assertStringLocate("b", "a🙃🙃b", 3, "UNICODE_CI", 4);
+ assertStringLocate("b", "a🙃🙃b", 4, "UTF8_BINARY", 4);
+ assertStringLocate("b", "a🙃🙃b", 4, "UTF8_LCASE", 4);
+ assertStringLocate("b", "a🙃🙃b", 4, "UNICODE", 4);
+ assertStringLocate("b", "a🙃🙃b", 4, "UNICODE_CI", 4);
+ assertStringLocate("b", "a🙃x🙃b", 1, "UTF8_BINARY", 5);
+ assertStringLocate("b", "a🙃x🙃b", 1, "UTF8_LCASE", 5);
+ assertStringLocate("b", "a🙃x🙃b", 1, "UNICODE", 5);
+ assertStringLocate("b", "a🙃x🙃b", 1, "UNICODE_CI", 5);
+ assertStringLocate("b", "a🙃x🙃b", 2, "UTF8_BINARY", 5);
+ assertStringLocate("b", "a🙃x🙃b", 2, "UTF8_LCASE", 5);
+ assertStringLocate("b", "a🙃x🙃b", 2, "UNICODE", 5);
+ assertStringLocate("b", "a🙃x🙃b", 2, "UNICODE_CI", 5);
+ assertStringLocate("b", "a🙃x🙃b", 3, "UTF8_BINARY", 5);
+ assertStringLocate("b", "a🙃x🙃b", 3, "UTF8_LCASE", 5);
+ assertStringLocate("b", "a🙃x🙃b", 3, "UNICODE", 5);
+ assertStringLocate("b", "a🙃x🙃b", 3, "UNICODE_CI", 5);
+ assertStringLocate("b", "a🙃x🙃b", 4, "UTF8_BINARY", 5);
+ assertStringLocate("b", "a🙃x🙃b", 4, "UTF8_LCASE", 5);
+ assertStringLocate("b", "a🙃x🙃b", 4, "UNICODE", 5);
+ assertStringLocate("b", "a🙃x🙃b", 4, "UNICODE_CI", 5);
+ // Out of bounds test cases.
+ assertStringLocate("a", "asd", 4, "UTF8_BINARY", 0);
+ assertStringLocate("a", "asd", 4, "UTF8_LCASE", 0);
+ assertStringLocate("a", "asd", 4, "UNICODE", 0);
+ assertStringLocate("a", "asd", 4, "UNICODE_CI", 0);
+ assertStringLocate("a", "asd", 100, "UTF8_BINARY", 0);
+ assertStringLocate("a", "asd", 100, "UTF8_LCASE", 0);
+ assertStringLocate("a", "asd", 100, "UNICODE", 0);
+ assertStringLocate("a", "asd", 100, "UNICODE_CI", 0);
+ assertStringLocate("a", "🙃🙃", 4, "UTF8_BINARY", 0);
+ assertStringLocate("a", "🙃🙃", 4, "UTF8_LCASE", 0);
+ assertStringLocate("a", "🙃🙃", 4, "UNICODE", 0);
+ assertStringLocate("a", "🙃🙃", 4, "UNICODE_CI", 0);
+ assertStringLocate("", "asd", 100, "UTF8_BINARY", 1);
+ assertStringLocate("", "asd", 100, "UTF8_LCASE", 1);
+ assertStringLocate("", "asd", 100, "UNICODE", 1);
+ assertStringLocate("", "asd", 100, "UNICODE_CI", 1);
+ assertStringLocate("asd", "", 100, "UTF8_BINARY", 0);
+ assertStringLocate("asd", "", 100, "UTF8_LCASE", 0);
+ assertStringLocate("asd", "", 100, "UNICODE", 0);
+ assertStringLocate("asd", "", 100, "UNICODE_CI", 0);
}
- private void assertSubstringIndex(String string, String delimiter, Integer count,
- String collationName, String expected) throws SparkException {
+ /**
+ * Verify the behaviour of the `SubstringIndex` collation support class.
+ */
+
+ private void assertSubstringIndex(String string, String delimiter, int count,
+ String collationName, String expected) throws SparkException {
UTF8String str = UTF8String.fromString(string);
UTF8String delim = UTF8String.fromString(delimiter);
int collationId = CollationFactory.collationNameToId(collationName);
- assertEquals(expected,
- CollationSupport.SubstringIndex.exec(str, delim, count, collationId).toString());
+ UTF8String result = CollationSupport.SubstringIndex.exec(str, delim, count, collationId);
+ assertEquals(UTF8String.fromString(expected), result);
}
@Test
public void testSubstringIndex() throws SparkException {
+ // Empty strings.
+ assertSubstringIndex("", "", 0, "UTF8_BINARY", "");
+ assertSubstringIndex("", "", 0, "UTF8_LCASE", "");
+ assertSubstringIndex("", "", 0, "UNICODE", "");
+ assertSubstringIndex("", "", 0, "UNICODE_CI", "");
+ assertSubstringIndex("", "", 1, "UTF8_BINARY", "");
+ assertSubstringIndex("", "", 1, "UTF8_LCASE", "");
+ assertSubstringIndex("", "", 1, "UNICODE", "");
+ assertSubstringIndex("", "", 1, "UNICODE_CI", "");
+ assertSubstringIndex("", "", -1, "UTF8_BINARY", "");
+ assertSubstringIndex("", "", -1, "UTF8_LCASE", "");
+ assertSubstringIndex("", "", -1, "UNICODE", "");
+ assertSubstringIndex("", "", -1, "UNICODE_CI", "");
+ assertSubstringIndex("", "x", 0, "UTF8_BINARY", "");
+ assertSubstringIndex("", "x", 0, "UTF8_LCASE", "");
+ assertSubstringIndex("", "x", 0, "UNICODE", "");
+ assertSubstringIndex("", "x", 0, "UNICODE_CI", "");
+ assertSubstringIndex("", "x", 1, "UTF8_BINARY", "");
+ assertSubstringIndex("", "x", 1, "UTF8_LCASE", "");
+ assertSubstringIndex("", "x", 1, "UNICODE", "");
+ assertSubstringIndex("", "x", 1, "UNICODE_CI", "");
+ assertSubstringIndex("", "x", -1, "UTF8_BINARY", "");
+ assertSubstringIndex("", "x", -1, "UTF8_LCASE", "");
+ assertSubstringIndex("", "x", -1, "UNICODE", "");
+ assertSubstringIndex("", "x", -1, "UNICODE_CI", "");
+ assertSubstringIndex("abc", "", 0, "UTF8_BINARY", "");
+ assertSubstringIndex("abc", "", 0, "UTF8_LCASE", "");
+ assertSubstringIndex("abc", "", 0, "UNICODE", "");
+ assertSubstringIndex("abc", "", 0, "UNICODE_CI", "");
+ assertSubstringIndex("abc", "", 1, "UTF8_BINARY", "");
+ assertSubstringIndex("abc", "", 1, "UTF8_LCASE", "");
+ assertSubstringIndex("abc", "", 1, "UNICODE", "");
+ assertSubstringIndex("abc", "", 1, "UNICODE_CI", "");
+ assertSubstringIndex("abc", "", -1, "UTF8_BINARY", "");
+ assertSubstringIndex("abc", "", -1, "UTF8_LCASE", "");
+ assertSubstringIndex("abc", "", -1, "UNICODE", "");
+ assertSubstringIndex("abc", "", -1, "UNICODE_CI", "");
+ // Basic tests.
+ assertSubstringIndex("axbxc", "a", 1, "UTF8_BINARY", "");
+ assertSubstringIndex("axbxc", "a", 1, "UTF8_LCASE", "");
+ assertSubstringIndex("axbxc", "a", 1, "UNICODE", "");
+ assertSubstringIndex("axbxc", "a", 1, "UNICODE_CI", "");
+ assertSubstringIndex("axbxc", "x", 1, "UTF8_BINARY", "a");
+ assertSubstringIndex("axbxc", "x", 1, "UTF8_LCASE", "a");
+ assertSubstringIndex("axbxc", "x", 1, "UNICODE", "a");
+ assertSubstringIndex("axbxc", "x", 1, "UNICODE_CI", "a");
+ assertSubstringIndex("axbxc", "b", 1, "UTF8_BINARY", "ax");
+ assertSubstringIndex("axbxc", "b", 1, "UTF8_LCASE", "ax");
+ assertSubstringIndex("axbxc", "b", 1, "UNICODE", "ax");
+ assertSubstringIndex("axbxc", "b", 1, "UNICODE_CI", "ax");
+ assertSubstringIndex("axbxc", "x", 2, "UTF8_BINARY", "axb");
+ assertSubstringIndex("axbxc", "x", 2, "UTF8_LCASE", "axb");
+ assertSubstringIndex("axbxc", "x", 2, "UNICODE", "axb");
+ assertSubstringIndex("axbxc", "x", 2, "UNICODE_CI", "axb");
+ assertSubstringIndex("axbxc", "c", 1, "UTF8_BINARY", "axbx");
+ assertSubstringIndex("axbxc", "c", 1, "UTF8_LCASE", "axbx");
+ assertSubstringIndex("axbxc", "c", 1, "UNICODE", "axbx");
+ assertSubstringIndex("axbxc", "c", 1, "UNICODE_CI", "axbx");
+ assertSubstringIndex("axbxc", "x", 3, "UTF8_BINARY", "axbxc");
+ assertSubstringIndex("axbxc", "x", 3, "UTF8_LCASE", "axbxc");
+ assertSubstringIndex("axbxc", "x", 3, "UNICODE", "axbxc");
+ assertSubstringIndex("axbxc", "x", 3, "UNICODE_CI", "axbxc");
+ assertSubstringIndex("axbxc", "d", 1, "UTF8_BINARY", "axbxc");
+ assertSubstringIndex("axbxc", "d", 1, "UTF8_LCASE", "axbxc");
+ assertSubstringIndex("axbxc", "d", 1, "UNICODE", "axbxc");
+ assertSubstringIndex("axbxc", "d", 1, "UNICODE_CI", "axbxc");
+ assertSubstringIndex("axbxc", "c", -1, "UTF8_BINARY", "");
+ assertSubstringIndex("axbxc", "c", -1, "UTF8_LCASE", "");
+ assertSubstringIndex("axbxc", "c", -1, "UNICODE", "");
+ assertSubstringIndex("axbxc", "c", -1, "UNICODE_CI", "");
+ assertSubstringIndex("axbxc", "x", -1, "UTF8_BINARY", "c");
+ assertSubstringIndex("axbxc", "x", -1, "UTF8_LCASE", "c");
+ assertSubstringIndex("axbxc", "x", -1, "UNICODE", "c");
+ assertSubstringIndex("axbxc", "x", -1, "UNICODE_CI", "c");
+ assertSubstringIndex("axbxc", "b", -1, "UTF8_BINARY", "xc");
+ assertSubstringIndex("axbxc", "b", -1, "UTF8_LCASE", "xc");
+ assertSubstringIndex("axbxc", "b", -1, "UNICODE", "xc");
+ assertSubstringIndex("axbxc", "b", -1, "UNICODE_CI", "xc");
+ assertSubstringIndex("axbxc", "x", -2, "UTF8_BINARY", "bxc");
+ assertSubstringIndex("axbxc", "x", -2, "UTF8_LCASE", "bxc");
+ assertSubstringIndex("axbxc", "x", -2, "UNICODE", "bxc");
+ assertSubstringIndex("axbxc", "x", -2, "UNICODE_CI", "bxc");
+ assertSubstringIndex("axbxc", "a", -1, "UTF8_BINARY", "xbxc");
+ assertSubstringIndex("axbxc", "a", -1, "UTF8_LCASE", "xbxc");
+ assertSubstringIndex("axbxc", "a", -1, "UNICODE", "xbxc");
+ assertSubstringIndex("axbxc", "a", -1, "UNICODE_CI", "xbxc");
+ assertSubstringIndex("axbxc", "x", -3, "UTF8_BINARY", "axbxc");
+ assertSubstringIndex("axbxc", "x", -3, "UTF8_LCASE", "axbxc");
+ assertSubstringIndex("axbxc", "x", -3, "UNICODE", "axbxc");
+ assertSubstringIndex("axbxc", "x", -3, "UNICODE_CI", "axbxc");
+ assertSubstringIndex("axbxc", "d", -1, "UTF8_BINARY", "axbxc");
+ assertSubstringIndex("axbxc", "d", -1, "UTF8_LCASE", "axbxc");
+ assertSubstringIndex("axbxc", "d", -1, "UNICODE", "axbxc");
+ assertSubstringIndex("axbxc", "d", -1, "UNICODE_CI", "axbxc");
+ // Advanced tests.
assertSubstringIndex("wwwgapachegorg", "g", -3, "UTF8_BINARY", "apachegorg");
assertSubstringIndex("www||apache||org", "||", 2, "UTF8_BINARY", "www||apache");
assertSubstringIndex("aaaaaaaaaa", "aa", 2, "UTF8_BINARY", "a");
@@ -1182,8 +2602,10 @@ public void testSubstringIndex() throws SparkException {
assertSubstringIndex("test大千世界X大千世界", "X", 1, "UNICODE_CI", "test大千世界");
assertSubstringIndex("test大千世界大千世界", "千", 2, "UNICODE_CI", "test大千世界大");
assertSubstringIndex("www||APACHE||org", "||", 2, "UNICODE_CI", "www||APACHE");
- assertSubstringIndex("abİo12", "i̇o", 1, "UNICODE_CI", "ab");
- assertSubstringIndex("abİo12", "i̇o", -1, "UNICODE_CI", "12");
+ assertSubstringIndex("wwwèapacheËorg", "Ê", -3, "AF_CI_AI", "apacheËorg");
+ // One-to-many case mapping (e.g. Turkish dotted I).
+ assertSubstringIndex("abİo12", "i\u0307o", 1, "UNICODE_CI", "ab");
+ assertSubstringIndex("abİo12", "i\u0307o", -1, "UNICODE_CI", "12");
assertSubstringIndex("abi̇o12", "İo", 1, "UNICODE_CI", "ab");
assertSubstringIndex("abi̇o12", "İo", -1, "UNICODE_CI", "12");
assertSubstringIndex("ai̇bi̇o12", "İo", 1, "UNICODE_CI", "ai̇b");
@@ -1191,59 +2613,153 @@ public void testSubstringIndex() throws SparkException {
assertSubstringIndex("ai̇bi̇o12i̇o", "İo", -1, "UNICODE_CI", "");
assertSubstringIndex("ai̇bi̇o12i̇o", "İo", -2, "UNICODE_CI", "12i̇o");
assertSubstringIndex("ai̇bi̇oİo12İoi̇o", "İo", -4, "UNICODE_CI", "İo12İoi̇o");
- assertSubstringIndex("ai̇bi̇oİo12İoi̇o", "i̇o", -4, "UNICODE_CI", "İo12İoi̇o");
+ assertSubstringIndex("ai̇bi̇oİo12İoi̇o", "i\u0307o", -4, "UNICODE_CI", "İo12İoi̇o");
assertSubstringIndex("ai̇bİoi̇o12i̇oİo", "İo", -4, "UNICODE_CI", "i̇o12i̇oİo");
- assertSubstringIndex("ai̇bİoi̇o12i̇oİo", "i̇o", -4, "UNICODE_CI", "i̇o12i̇oİo");
+ assertSubstringIndex("ai̇bİoi̇o12i̇oİo", "i\u0307o", -4, "UNICODE_CI", "i̇o12i̇oİo");
assertSubstringIndex("abi̇12", "i", 1, "UNICODE_CI", "abi̇12");
assertSubstringIndex("abi̇12", "\u0307", 1, "UNICODE_CI", "abi̇12");
assertSubstringIndex("abi̇12", "İ", 1, "UNICODE_CI", "ab");
assertSubstringIndex("abİ12", "i", 1, "UNICODE_CI", "abİ12");
assertSubstringIndex("ai̇bi̇oİo12İoi̇o", "İo", -4, "UNICODE_CI", "İo12İoi̇o");
- assertSubstringIndex("ai̇bi̇oİo12İoi̇o", "i̇o", -4, "UNICODE_CI", "İo12İoi̇o");
+ assertSubstringIndex("ai̇bi̇oİo12İoi̇o", "i\u0307o", -4, "UNICODE_CI", "İo12İoi̇o");
assertSubstringIndex("ai̇bİoi̇o12i̇oİo", "İo", -4, "UNICODE_CI", "i̇o12i̇oİo");
- assertSubstringIndex("ai̇bİoi̇o12i̇oİo", "i̇o", -4, "UNICODE_CI", "i̇o12i̇oİo");
+ assertSubstringIndex("ai̇bİoi̇o12i̇oİo", "i\u0307o", -4, "UNICODE_CI", "i̇o12i̇oİo");
assertSubstringIndex("ai̇bi̇oİo12İoi̇o", "İo", 3, "UNICODE_CI", "ai̇bi̇oİo12");
- assertSubstringIndex("ai̇bi̇oİo12İoi̇o", "i̇o", 3, "UNICODE_CI", "ai̇bi̇oİo12");
+ assertSubstringIndex("ai̇bi̇oİo12İoi̇o", "i\u0307o", 3, "UNICODE_CI", "ai̇bi̇oİo12");
assertSubstringIndex("ai̇bİoi̇o12i̇oİo", "İo", 3, "UNICODE_CI", "ai̇bİoi̇o12");
- assertSubstringIndex("ai̇bİoi̇o12i̇oİo", "i̇o", 3, "UNICODE_CI", "ai̇bİoi̇o12");
+ assertSubstringIndex("ai̇bİoi̇o12i̇oİo", "i\u0307o", 3, "UNICODE_CI", "ai̇bİoi̇o12");
assertSubstringIndex("abi̇12", "i", 1, "UTF8_LCASE", "ab"); // != UNICODE_CI
assertSubstringIndex("abi̇12", "\u0307", 1, "UTF8_LCASE", "abi"); // != UNICODE_CI
assertSubstringIndex("abi̇12", "İ", 1, "UTF8_LCASE", "ab");
assertSubstringIndex("abİ12", "i", 1, "UTF8_LCASE", "abİ12");
assertSubstringIndex("ai̇bi̇oİo12İoi̇o", "İo", -4, "UTF8_LCASE", "İo12İoi̇o");
- assertSubstringIndex("ai̇bi̇oİo12İoi̇o", "i̇o", -4, "UTF8_LCASE", "İo12İoi̇o");
+ assertSubstringIndex("ai̇bi̇oİo12İoi̇o", "i\u0307o", -4, "UTF8_LCASE", "İo12İoi̇o");
assertSubstringIndex("ai̇bİoi̇o12i̇oİo", "İo", -4, "UTF8_LCASE", "i̇o12i̇oİo");
- assertSubstringIndex("ai̇bİoi̇o12i̇oİo", "i̇o", -4, "UTF8_LCASE", "i̇o12i̇oİo");
+ assertSubstringIndex("ai̇bİoi̇o12i̇oİo", "i\u0307o", -4, "UTF8_LCASE", "i̇o12i̇oİo");
assertSubstringIndex("bİoi̇o12i̇o", "\u0307oi", 1, "UTF8_LCASE", "bİoi̇o12i̇o");
assertSubstringIndex("ai̇bi̇oİo12İoi̇o", "İo", 3, "UTF8_LCASE", "ai̇bi̇oİo12");
- assertSubstringIndex("ai̇bi̇oİo12İoi̇o", "i̇o", 3, "UTF8_LCASE", "ai̇bi̇oİo12");
+ assertSubstringIndex("ai̇bi̇oİo12İoi̇o", "i\u0307o", 3, "UTF8_LCASE", "ai̇bi̇oİo12");
assertSubstringIndex("ai̇bİoi̇o12i̇oİo", "İo", 3, "UTF8_LCASE", "ai̇bİoi̇o12");
- assertSubstringIndex("ai̇bİoi̇o12i̇oİo", "i̇o", 3, "UTF8_LCASE", "ai̇bİoi̇o12");
+ assertSubstringIndex("ai̇bİoi̇o12i̇oİo", "i\u0307o", 3, "UTF8_LCASE", "ai̇bİoi̇o12");
assertSubstringIndex("bİoi̇o12i̇o", "\u0307oi", 1, "UTF8_LCASE", "bİoi̇o12i̇o");
+ // Conditional case mapping (e.g. Greek sigmas).
+ assertSubstringIndex("σ", "σ", 1, "UTF8_BINARY", "");
+ assertSubstringIndex("σ", "ς", 1, "UTF8_BINARY", "σ");
+ assertSubstringIndex("σ", "Σ", 1, "UTF8_BINARY", "σ");
+ assertSubstringIndex("ς", "σ", 1, "UTF8_BINARY", "ς");
+ assertSubstringIndex("ς", "ς", 1, "UTF8_BINARY", "");
+ assertSubstringIndex("ς", "Σ", 1, "UTF8_BINARY", "ς");
+ assertSubstringIndex("Σ", "σ", 1, "UTF8_BINARY", "Σ");
+ assertSubstringIndex("Σ", "ς", 1, "UTF8_BINARY", "Σ");
+ assertSubstringIndex("Σ", "Σ", 1, "UTF8_BINARY", "");
+ assertSubstringIndex("σ", "σ", 1, "UTF8_LCASE", "");
+ assertSubstringIndex("σ", "ς", 1, "UTF8_LCASE", "");
+ assertSubstringIndex("σ", "Σ", 1, "UTF8_LCASE", "");
+ assertSubstringIndex("ς", "σ", 1, "UTF8_LCASE", "");
+ assertSubstringIndex("ς", "ς", 1, "UTF8_LCASE", "");
+ assertSubstringIndex("ς", "Σ", 1, "UTF8_LCASE", "");
+ assertSubstringIndex("Σ", "σ", 1, "UTF8_LCASE", "");
+ assertSubstringIndex("Σ", "ς", 1, "UTF8_LCASE", "");
+ assertSubstringIndex("Σ", "Σ", 1, "UTF8_LCASE", "");
+ assertSubstringIndex("σ", "σ", 1, "UNICODE", "");
+ assertSubstringIndex("σ", "ς", 1, "UNICODE", "σ");
+ assertSubstringIndex("σ", "Σ", 1, "UNICODE", "σ");
+ assertSubstringIndex("ς", "σ", 1, "UNICODE", "ς");
+ assertSubstringIndex("ς", "ς", 1, "UNICODE", "");
+ assertSubstringIndex("ς", "Σ", 1, "UNICODE", "ς");
+ assertSubstringIndex("Σ", "σ", 1, "UNICODE", "Σ");
+ assertSubstringIndex("Σ", "ς", 1, "UNICODE", "Σ");
+ assertSubstringIndex("Σ", "Σ", 1, "UNICODE", "");
+ assertSubstringIndex("σ", "σ", 1, "UNICODE_CI", "");
+ assertSubstringIndex("σ", "ς", 1, "UNICODE_CI", "");
+ assertSubstringIndex("σ", "Σ", 1, "UNICODE_CI", "");
+ assertSubstringIndex("ς", "σ", 1, "UNICODE_CI", "");
+ assertSubstringIndex("ς", "ς", 1, "UNICODE_CI", "");
+ assertSubstringIndex("ς", "Σ", 1, "UNICODE_CI", "");
+ assertSubstringIndex("Σ", "σ", 1, "UNICODE_CI", "");
+ assertSubstringIndex("Σ", "ς", 1, "UNICODE_CI", "");
+ assertSubstringIndex("Σ", "Σ", 1, "UNICODE_CI", "");
+ // Surrogate pairs.
+ assertSubstringIndex("a🙃b🙃c", "a", 1, "UTF8_BINARY", "");
+ assertSubstringIndex("a🙃b🙃c", "a", 1, "UTF8_LCASE", "");
+ assertSubstringIndex("a🙃b🙃c", "a", 1, "UNICODE", "");
+ assertSubstringIndex("a🙃b🙃c", "a", 1, "UNICODE_CI", "");
+ assertSubstringIndex("a🙃b🙃c", "🙃", 1, "UTF8_BINARY", "a");
+ assertSubstringIndex("a🙃b🙃c", "🙃", 1, "UTF8_LCASE", "a");
+ assertSubstringIndex("a🙃b🙃c", "🙃", 1, "UNICODE", "a");
+ assertSubstringIndex("a🙃b🙃c", "🙃", 1, "UNICODE_CI", "a");
+ assertSubstringIndex("a🙃b🙃c", "b", 1, "UTF8_BINARY", "a🙃");
+ assertSubstringIndex("a🙃b🙃c", "b", 1, "UTF8_LCASE", "a🙃");
+ assertSubstringIndex("a🙃b🙃c", "b", 1, "UNICODE", "a🙃");
+ assertSubstringIndex("a🙃b🙃c", "b", 1, "UNICODE_CI", "a🙃");
+ assertSubstringIndex("a🙃b🙃c", "🙃", 2, "UTF8_BINARY", "a🙃b");
+ assertSubstringIndex("a🙃b🙃c", "🙃", 2, "UTF8_LCASE", "a🙃b");
+ assertSubstringIndex("a🙃b🙃c", "🙃", 2, "UNICODE", "a🙃b");
+ assertSubstringIndex("a🙃b🙃c", "🙃", 2, "UNICODE_CI", "a🙃b");
+ assertSubstringIndex("a🙃b🙃c", "c", 1, "UTF8_BINARY", "a🙃b🙃");
+ assertSubstringIndex("a🙃b🙃c", "c", 1, "UTF8_LCASE", "a🙃b🙃");
+ assertSubstringIndex("a🙃b🙃c", "c", 1, "UNICODE", "a🙃b🙃");
+ assertSubstringIndex("a🙃b🙃c", "c", 1, "UNICODE_CI", "a🙃b🙃");
+ assertSubstringIndex("a🙃b🙃c", "🙃", 3, "UTF8_BINARY", "a🙃b🙃c");
+ assertSubstringIndex("a🙃b🙃c", "🙃", 3, "UTF8_LCASE", "a🙃b🙃c");
+ assertSubstringIndex("a🙃b🙃c", "🙃", 3, "UNICODE", "a🙃b🙃c");
+ assertSubstringIndex("a🙃b🙃c", "🙃", 3, "UNICODE_CI", "a🙃b🙃c");
+ assertSubstringIndex("a🙃b🙃c", "d", 1, "UTF8_BINARY", "a🙃b🙃c");
+ assertSubstringIndex("a🙃b🙃c", "d", 1, "UTF8_LCASE", "a🙃b🙃c");
+ assertSubstringIndex("a🙃b🙃c", "d", 1, "UNICODE", "a🙃b🙃c");
+ assertSubstringIndex("a🙃b🙃c", "d", 1, "UNICODE_CI", "a🙃b🙃c");
+ assertSubstringIndex("a🙃b🙃c", "c", -1, "UTF8_BINARY", "");
+ assertSubstringIndex("a🙃b🙃c", "c", -1, "UTF8_LCASE", "");
+ assertSubstringIndex("a🙃b🙃c", "c", -1, "UNICODE", "");
+ assertSubstringIndex("a🙃b🙃c", "c", -1, "UNICODE_CI", "");
+ assertSubstringIndex("a🙃b🙃c", "🙃", -1, "UTF8_BINARY", "c");
+ assertSubstringIndex("a🙃b🙃c", "🙃", -1, "UTF8_LCASE", "c");
+ assertSubstringIndex("a🙃b🙃c", "🙃", -1, "UNICODE", "c");
+ assertSubstringIndex("a🙃b🙃c", "🙃", -1, "UNICODE_CI", "c");
+ assertSubstringIndex("a🙃b🙃c", "b", -1, "UTF8_BINARY", "🙃c");
+ assertSubstringIndex("a🙃b🙃c", "b", -1, "UTF8_LCASE", "🙃c");
+ assertSubstringIndex("a🙃b🙃c", "b", -1, "UNICODE", "🙃c");
+ assertSubstringIndex("a🙃b🙃c", "b", -1, "UNICODE_CI", "🙃c");
+ assertSubstringIndex("a🙃b🙃c", "🙃", -2, "UTF8_BINARY", "b🙃c");
+ assertSubstringIndex("a🙃b🙃c", "🙃", -2, "UTF8_LCASE", "b🙃c");
+ assertSubstringIndex("a🙃b🙃c", "🙃", -2, "UNICODE", "b🙃c");
+ assertSubstringIndex("a🙃b🙃c", "🙃", -2, "UNICODE_CI", "b🙃c");
+ assertSubstringIndex("a🙃b🙃c", "a", -1, "UTF8_BINARY", "🙃b🙃c");
+ assertSubstringIndex("a🙃b🙃c", "a", -1, "UTF8_LCASE", "🙃b🙃c");
+ assertSubstringIndex("a🙃b🙃c", "a", -1, "UNICODE", "🙃b🙃c");
+ assertSubstringIndex("a🙃b🙃c", "a", -1, "UNICODE_CI", "🙃b🙃c");
+ assertSubstringIndex("a🙃b🙃c", "🙃", -3, "UTF8_BINARY", "a🙃b🙃c");
+ assertSubstringIndex("a🙃b🙃c", "🙃", -3, "UTF8_LCASE", "a🙃b🙃c");
+ assertSubstringIndex("a🙃b🙃c", "🙃", -3, "UNICODE", "a🙃b🙃c");
+ assertSubstringIndex("a🙃b🙃c", "🙃", -3, "UNICODE_CI", "a🙃b🙃c");
+ assertSubstringIndex("a🙃b🙃c", "d", -1, "UTF8_BINARY", "a🙃b🙃c");
+ assertSubstringIndex("a🙃b🙃c", "d", -1, "UTF8_LCASE", "a🙃b🙃c");
+ assertSubstringIndex("a🙃b🙃c", "d", -1, "UNICODE", "a🙃b🙃c");
+ assertSubstringIndex("a🙃b🙃c", "d", -1, "UNICODE_CI", "a🙃b🙃c");
}
- private void assertStringTrim(
- String collation,
- String sourceString,
- String trimString,
- String expectedResultString) throws SparkException {
+ /**
+ * Verify the behaviour of the `StringTrim` collation support class.
+ */
+
+ private void assertStringTrim(String collationName, String sourceString, String trimString,
+ String expected) throws SparkException {
// Prepare the input and expected result.
- int collationId = CollationFactory.collationNameToId(collation);
+ int collationId = CollationFactory.collationNameToId(collationName);
UTF8String src = UTF8String.fromString(sourceString);
UTF8String trim = UTF8String.fromString(trimString);
- UTF8String resultTrimLeftRight, resultTrimRightLeft;
- String resultTrim;
+ UTF8String result, resultTrimLeftRight, resultTrimRightLeft;
if (trimString == null) {
// Trim string is ASCII space.
- resultTrim = CollationSupport.StringTrim.exec(src).toString();
+ result = CollationSupport.StringTrim.exec(src);
UTF8String trimLeft = CollationSupport.StringTrimLeft.exec(src);
resultTrimLeftRight = CollationSupport.StringTrimRight.exec(trimLeft);
UTF8String trimRight = CollationSupport.StringTrimRight.exec(src);
resultTrimRightLeft = CollationSupport.StringTrimLeft.exec(trimRight);
} else {
// Trim string is specified.
- resultTrim = CollationSupport.StringTrim.exec(src, trim, collationId).toString();
+ result = CollationSupport.StringTrim.exec(src, trim, collationId);
UTF8String trimLeft = CollationSupport.StringTrimLeft.exec(src, trim, collationId);
resultTrimLeftRight = CollationSupport.StringTrimRight.exec(trimLeft, trim, collationId);
UTF8String trimRight = CollationSupport.StringTrimRight.exec(src, trim, collationId);
@@ -1251,61 +2767,15 @@ private void assertStringTrim(
}
// Test that StringTrim result is as expected.
- assertEquals(expectedResultString, resultTrim);
+ assertEquals(UTF8String.fromString(expected), result);
// Test that the order of the trims is not important.
- assertEquals(resultTrimLeftRight.toString(), resultTrim);
- assertEquals(resultTrimRightLeft.toString(), resultTrim);
- }
-
- private void assertStringTrimLeft(
- String collation,
- String sourceString,
- String trimString,
- String expectedResultString) throws SparkException {
- // Prepare the input and expected result.
- int collationId = CollationFactory.collationNameToId(collation);
- UTF8String src = UTF8String.fromString(sourceString);
- UTF8String trim = UTF8String.fromString(trimString);
- String result;
-
- if (trimString == null) {
- // Trim string is ASCII space.
- result = CollationSupport.StringTrimLeft.exec(src).toString();
- } else {
- // Trim string is specified.
- result = CollationSupport.StringTrimLeft.exec(src, trim, collationId).toString();
- }
-
- // Test that StringTrimLeft result is as expected.
- assertEquals(expectedResultString, result);
- }
-
- private void assertStringTrimRight(
- String collation,
- String sourceString,
- String trimString,
- String expectedResultString) throws SparkException {
- // Prepare the input and expected result.
- int collationId = CollationFactory.collationNameToId(collation);
- UTF8String src = UTF8String.fromString(sourceString);
- UTF8String trim = UTF8String.fromString(trimString);
- String result;
-
- if (trimString == null) {
- // Trim string is ASCII space.
- result = CollationSupport.StringTrimRight.exec(src).toString();
- } else {
- // Trim string is specified.
- result = CollationSupport.StringTrimRight.exec(src, trim, collationId).toString();
- }
-
- // Test that StringTrimRight result is as expected.
- assertEquals(expectedResultString, result);
+ assertEquals(resultTrimLeftRight, result);
+ assertEquals(resultTrimRightLeft, result);
}
@Test
public void testStringTrim() throws SparkException {
- // Basic tests - UTF8_BINARY.
+ // Basic tests.
assertStringTrim("UTF8_BINARY", "", "", "");
assertStringTrim("UTF8_BINARY", "", "xyz", "");
assertStringTrim("UTF8_BINARY", "asd", "", "asd");
@@ -1315,25 +2785,6 @@ public void testStringTrim() throws SparkException {
assertStringTrim("UTF8_BINARY", "asd", "x", "asd");
assertStringTrim("UTF8_BINARY", "xxasdxx", "x", "asd");
assertStringTrim("UTF8_BINARY", "xa世ax", "x", "a世a");
- assertStringTrimLeft("UTF8_BINARY", "", "", "");
- assertStringTrimLeft("UTF8_BINARY", "", "xyz", "");
- assertStringTrimLeft("UTF8_BINARY", "asd", "", "asd");
- assertStringTrimLeft("UTF8_BINARY", "asd", null, "asd");
- assertStringTrimLeft("UTF8_BINARY", " asd ", null, "asd ");
- assertStringTrimLeft("UTF8_BINARY", " a世a ", null, "a世a ");
- assertStringTrimLeft("UTF8_BINARY", "asd", "x", "asd");
- assertStringTrimLeft("UTF8_BINARY", "xxasdxx", "x", "asdxx");
- assertStringTrimLeft("UTF8_BINARY", "xa世ax", "x", "a世ax");
- assertStringTrimRight("UTF8_BINARY", "", "", "");
- assertStringTrimRight("UTF8_BINARY", "", "xyz", "");
- assertStringTrimRight("UTF8_BINARY", "asd", "", "asd");
- assertStringTrimRight("UTF8_BINARY", "asd", null, "asd");
- assertStringTrimRight("UTF8_BINARY", " asd ", null, " asd");
- assertStringTrimRight("UTF8_BINARY", " a世a ", null, " a世a");
- assertStringTrimRight("UTF8_BINARY", "asd", "x", "asd");
- assertStringTrimRight("UTF8_BINARY", "xxasdxx", "x", "xxasd");
- assertStringTrimRight("UTF8_BINARY", "xa世ax", "x", "xa世a");
- // Basic tests - UTF8_LCASE.
assertStringTrim("UTF8_LCASE", "", "", "");
assertStringTrim("UTF8_LCASE", "", "xyz", "");
assertStringTrim("UTF8_LCASE", "asd", "", "asd");
@@ -1343,25 +2794,6 @@ public void testStringTrim() throws SparkException {
assertStringTrim("UTF8_LCASE", "asd", "x", "asd");
assertStringTrim("UTF8_LCASE", "xxasdxx", "x", "asd");
assertStringTrim("UTF8_LCASE", "xa世ax", "x", "a世a");
- assertStringTrimLeft("UTF8_LCASE", "", "", "");
- assertStringTrimLeft("UTF8_LCASE", "", "xyz", "");
- assertStringTrimLeft("UTF8_LCASE", "asd", "", "asd");
- assertStringTrimLeft("UTF8_LCASE", "asd", null, "asd");
- assertStringTrimLeft("UTF8_LCASE", " asd ", null, "asd ");
- assertStringTrimLeft("UTF8_LCASE", " a世a ", null, "a世a ");
- assertStringTrimLeft("UTF8_LCASE", "asd", "x", "asd");
- assertStringTrimLeft("UTF8_LCASE", "xxasdxx", "x", "asdxx");
- assertStringTrimLeft("UTF8_LCASE", "xa世ax", "x", "a世ax");
- assertStringTrimRight("UTF8_LCASE", "", "", "");
- assertStringTrimRight("UTF8_LCASE", "", "xyz", "");
- assertStringTrimRight("UTF8_LCASE", "asd", "", "asd");
- assertStringTrimRight("UTF8_LCASE", "asd", null, "asd");
- assertStringTrimRight("UTF8_LCASE", " asd ", null, " asd");
- assertStringTrimRight("UTF8_LCASE", " a世a ", null, " a世a");
- assertStringTrimRight("UTF8_LCASE", "asd", "x", "asd");
- assertStringTrimRight("UTF8_LCASE", "xxasdxx", "x", "xxasd");
- assertStringTrimRight("UTF8_LCASE", "xa世ax", "x", "xa世a");
- // Basic tests - UNICODE.
assertStringTrim("UNICODE", "", "", "");
assertStringTrim("UNICODE", "", "xyz", "");
assertStringTrim("UNICODE", "asd", "", "asd");
@@ -1371,25 +2803,6 @@ public void testStringTrim() throws SparkException {
assertStringTrim("UNICODE", "asd", "x", "asd");
assertStringTrim("UNICODE", "xxasdxx", "x", "asd");
assertStringTrim("UNICODE", "xa世ax", "x", "a世a");
- assertStringTrimLeft("UNICODE", "", "", "");
- assertStringTrimLeft("UNICODE", "", "xyz", "");
- assertStringTrimLeft("UNICODE", "asd", "", "asd");
- assertStringTrimLeft("UNICODE", "asd", null, "asd");
- assertStringTrimLeft("UNICODE", " asd ", null, "asd ");
- assertStringTrimLeft("UNICODE", " a世a ", null, "a世a ");
- assertStringTrimLeft("UNICODE", "asd", "x", "asd");
- assertStringTrimLeft("UNICODE", "xxasdxx", "x", "asdxx");
- assertStringTrimLeft("UNICODE", "xa世ax", "x", "a世ax");
- assertStringTrimRight("UNICODE", "", "", "");
- assertStringTrimRight("UNICODE", "", "xyz", "");
- assertStringTrimRight("UNICODE", "asd", "", "asd");
- assertStringTrimRight("UNICODE", "asd", null, "asd");
- assertStringTrimRight("UNICODE", " asd ", null, " asd");
- assertStringTrimRight("UNICODE", " a世a ", null, " a世a");
- assertStringTrimRight("UNICODE", "asd", "x", "asd");
- assertStringTrimRight("UNICODE", "xxasdxx", "x", "xxasd");
- assertStringTrimRight("UNICODE", "xa世ax", "x", "xa世a");
- // Basic tests - UNICODE_CI.
assertStringTrim("UNICODE_CI", "", "", "");
assertStringTrim("UNICODE_CI", "", "xyz", "");
assertStringTrim("UNICODE_CI", "asd", "", "asd");
@@ -1399,98 +2812,44 @@ public void testStringTrim() throws SparkException {
assertStringTrim("UNICODE_CI", "asd", "x", "asd");
assertStringTrim("UNICODE_CI", "xxasdxx", "x", "asd");
assertStringTrim("UNICODE_CI", "xa世ax", "x", "a世a");
- assertStringTrimLeft("UNICODE_CI", "", "", "");
- assertStringTrimLeft("UNICODE_CI", "", "xyz", "");
- assertStringTrimLeft("UNICODE_CI", "asd", "", "asd");
- assertStringTrimLeft("UNICODE_CI", "asd", null, "asd");
- assertStringTrimLeft("UNICODE_CI", " asd ", null, "asd ");
- assertStringTrimLeft("UNICODE_CI", " a世a ", null, "a世a ");
- assertStringTrimLeft("UNICODE_CI", "asd", "x", "asd");
- assertStringTrimLeft("UNICODE_CI", "xxasdxx", "x", "asdxx");
- assertStringTrimLeft("UNICODE_CI", "xa世ax", "x", "a世ax");
- assertStringTrimRight("UNICODE_CI", "", "", "");
- assertStringTrimRight("UNICODE_CI", "", "xyz", "");
- assertStringTrimRight("UNICODE_CI", "asd", "", "asd");
- assertStringTrimRight("UNICODE_CI", "asd", null, "asd");
- assertStringTrimRight("UNICODE_CI", " asd ", null, " asd");
- assertStringTrimRight("UNICODE_CI", " a世a ", null, " a世a");
- assertStringTrimRight("UNICODE_CI", "asd", "x", "asd");
- assertStringTrimRight("UNICODE_CI", "xxasdxx", "x", "xxasd");
- assertStringTrimRight("UNICODE_CI", "xa世ax", "x", "xa世a");
-
- // Case variation - UTF8_BINARY.
+ // Case variation.
assertStringTrim("UTF8_BINARY", "asd", "A", "asd");
assertStringTrim("UTF8_BINARY", "ddsXXXaa", "asd", "XXX");
assertStringTrim("UTF8_BINARY", "ASD", "a", "ASD");
- assertStringTrimLeft("UTF8_BINARY", "ddsXXXaa", "asd", "XXXaa");
- assertStringTrimRight("UTF8_BINARY", "ddsXXXaa", "asd", "ddsXXX");
- // Case variation - UTF8_LCASE.
assertStringTrim("UTF8_LCASE", "asd", "A", "sd");
assertStringTrim("UTF8_LCASE", "ASD", "a", "SD");
assertStringTrim("UTF8_LCASE", "ddsXXXaa", "ASD", "XXX");
- assertStringTrimLeft("UTF8_LCASE", "ddsXXXaa", "aSd", "XXXaa");
- assertStringTrimRight("UTF8_LCASE", "ddsXXXaa", "AsD", "ddsXXX");
- // Case variation - UNICODE.
assertStringTrim("UNICODE", "asd", "A", "asd");
assertStringTrim("UNICODE", "ASD", "a", "ASD");
assertStringTrim("UNICODE", "ddsXXXaa", "asd", "XXX");
- assertStringTrimLeft("UNICODE", "ddsXXXaa", "asd", "XXXaa");
- assertStringTrimRight("UNICODE", "ddsXXXaa", "asd", "ddsXXX");
- // Case variation - UNICODE_CI.
assertStringTrim("UNICODE_CI", "asd", "A", "sd");
assertStringTrim("UNICODE_CI", "ASD", "a", "SD");
assertStringTrim("UNICODE_CI", "ddsXXXaa", "ASD", "XXX");
- assertStringTrimLeft("UNICODE_CI", "ddsXXXaa", "aSd", "XXXaa");
- assertStringTrimRight("UNICODE_CI", "ddsXXXaa", "AsD", "ddsXXX");
-
- // Case-variable character length - UTF8_BINARY.
+ assertStringTrim("SR_CI_AI", "cSCšćČXXXsčšČŠsć", "čš", "XXX");
+ // One-to-many case mapping (e.g. German sharp S).
assertStringTrim("UTF8_BINARY", "ẞaaaẞ", "ß", "ẞaaaẞ");
- assertStringTrimLeft("UTF8_BINARY", "ẞaaaẞ", "ß", "ẞaaaẞ");
- assertStringTrimRight("UTF8_BINARY", "ẞaaaẞ", "ß", "ẞaaaẞ");
assertStringTrim("UTF8_BINARY", "ßaaaß", "ẞ", "ßaaaß");
- assertStringTrimLeft("UTF8_BINARY", "ßaaaß", "ẞ", "ßaaaß");
- assertStringTrimRight("UTF8_BINARY", "ßaaaß", "ẞ", "ßaaaß");
assertStringTrim("UTF8_BINARY", "Ëaaaẞ", "Ëẞ", "aaa");
- assertStringTrimLeft("UTF8_BINARY", "Ëaaaẞ", "Ëẞ", "aaaẞ");
- assertStringTrimRight("UTF8_BINARY", "Ëaaaẞ", "Ëẞ", "Ëaaa");
- // Case-variable character length - UTF8_LCASE.
assertStringTrim("UTF8_LCASE", "ẞaaaẞ", "ß", "aaa");
- assertStringTrimLeft("UTF8_LCASE", "ẞaaaẞ", "ß", "aaaẞ");
- assertStringTrimRight("UTF8_LCASE", "ẞaaaẞ", "ß", "ẞaaa");
assertStringTrim("UTF8_LCASE", "ßaaaß", "ẞ", "aaa");
- assertStringTrimLeft("UTF8_LCASE", "ßaaaß", "ẞ", "aaaß");
- assertStringTrimRight("UTF8_LCASE", "ßaaaß", "ẞ", "ßaaa");
assertStringTrim("UTF8_LCASE", "Ëaaaẞ", "Ëẞ", "aaa");
- assertStringTrimLeft("UTF8_LCASE", "Ëaaaẞ", "Ëẞ", "aaaẞ");
- assertStringTrimRight("UTF8_LCASE", "Ëaaaẞ", "Ëẞ", "Ëaaa");
- // Case-variable character length - UNICODE.
assertStringTrim("UNICODE", "ẞaaaẞ", "ß", "ẞaaaẞ");
- assertStringTrimLeft("UNICODE", "ẞaaaẞ", "ß", "ẞaaaẞ");
- assertStringTrimRight("UNICODE", "ẞaaaẞ", "ß", "ẞaaaẞ");
assertStringTrim("UNICODE", "ßaaaß", "ẞ", "ßaaaß");
- assertStringTrimLeft("UNICODE", "ßaaaß", "ẞ", "ßaaaß");
- assertStringTrimRight("UNICODE", "ßaaaß", "ẞ", "ßaaaß");
assertStringTrim("UNICODE", "Ëaaaẞ", "Ëẞ", "aaa");
- assertStringTrimLeft("UNICODE", "Ëaaaẞ", "Ëẞ", "aaaẞ");
- assertStringTrimRight("UNICODE", "Ëaaaẞ", "Ëẞ", "Ëaaa");
- // Case-variable character length - UNICODE_CI.
assertStringTrim("UNICODE_CI", "ẞaaaẞ", "ß", "aaa");
- assertStringTrimLeft("UNICODE_CI", "ẞaaaẞ", "ß", "aaaẞ");
- assertStringTrimRight("UNICODE_CI", "ẞaaaẞ", "ß", "ẞaaa");
assertStringTrim("UNICODE_CI", "ßaaaß", "ẞ", "aaa");
- assertStringTrimLeft("UNICODE_CI", "ßaaaß", "ẞ", "aaaß");
- assertStringTrimRight("UNICODE_CI", "ßaaaß", "ẞ", "ßaaa");
assertStringTrim("UNICODE_CI", "Ëaaaẞ", "Ëẞ", "aaa");
- assertStringTrimLeft("UNICODE_CI", "Ëaaaẞ", "Ëẞ", "aaaẞ");
- assertStringTrimRight("UNICODE_CI", "Ëaaaẞ", "Ëẞ", "Ëaaa");
-
- // One-to-many case mapping - UTF8_BINARY.
+ // One-to-many case mapping (e.g. Turkish dotted I).
assertStringTrim("UTF8_BINARY", "i", "i", "");
assertStringTrim("UTF8_BINARY", "iii", "I", "iii");
assertStringTrim("UTF8_BINARY", "I", "iii", "I");
assertStringTrim("UTF8_BINARY", "ixi", "i", "x");
assertStringTrim("UTF8_BINARY", "i", "İ", "i");
assertStringTrim("UTF8_BINARY", "i\u0307", "İ", "i\u0307");
+ assertStringTrim("UTF8_BINARY", "ii\u0307", "İi", "\u0307");
+ assertStringTrim("UTF8_BINARY", "iii\u0307", "İi", "\u0307");
+ assertStringTrim("UTF8_BINARY", "iiii\u0307", "iİ", "\u0307");
+ assertStringTrim("UTF8_BINARY", "ii\u0307ii\u0307", "iİ", "\u0307ii\u0307");
assertStringTrim("UTF8_BINARY", "i\u0307", "i", "\u0307");
assertStringTrim("UTF8_BINARY", "i\u0307", "\u0307", "i");
assertStringTrim("UTF8_BINARY", "i\u0307", "i\u0307", "");
@@ -1510,63 +2869,16 @@ public void testStringTrim() throws SparkException {
assertStringTrim("UTF8_BINARY", "Ixİ", "i\u0307", "Ixİ");
assertStringTrim("UTF8_BINARY", "IXİ", "ix\u0307", "IXİ");
assertStringTrim("UTF8_BINARY", "xi\u0307", "\u0307IX", "xi");
- assertStringTrimLeft("UTF8_BINARY", "i", "i", "");
- assertStringTrimLeft("UTF8_BINARY", "iii", "I", "iii");
- assertStringTrimLeft("UTF8_BINARY", "I", "iii", "I");
- assertStringTrimLeft("UTF8_BINARY", "ixi", "i", "xi");
- assertStringTrimLeft("UTF8_BINARY", "i", "İ", "i");
- assertStringTrimLeft("UTF8_BINARY", "i\u0307", "İ", "i\u0307");
- assertStringTrimLeft("UTF8_BINARY", "i\u0307", "i", "\u0307");
- assertStringTrimLeft("UTF8_BINARY", "i\u0307", "\u0307", "i\u0307");
- assertStringTrimLeft("UTF8_BINARY", "i\u0307", "i\u0307", "");
- assertStringTrimLeft("UTF8_BINARY", "i\u0307i\u0307", "i\u0307", "");
- assertStringTrimLeft("UTF8_BINARY", "i\u0307\u0307", "i\u0307", "");
- assertStringTrimLeft("UTF8_BINARY", "i\u0307i", "i\u0307", "");
- assertStringTrimLeft("UTF8_BINARY", "i\u0307i", "İ", "i\u0307i");
- assertStringTrimLeft("UTF8_BINARY", "i\u0307İ", "i\u0307", "İ");
- assertStringTrimLeft("UTF8_BINARY", "i\u0307İ", "İ", "i\u0307İ");
- assertStringTrimLeft("UTF8_BINARY", "İ", "İ", "");
- assertStringTrimLeft("UTF8_BINARY", "IXi", "İ", "IXi");
- assertStringTrimLeft("UTF8_BINARY", "ix\u0307", "Ixİ", "ix\u0307");
- assertStringTrimLeft("UTF8_BINARY", "i\u0307x", "IXİ", "i\u0307x");
- assertStringTrimLeft("UTF8_BINARY", "i\u0307x", "ix\u0307İ", "");
- assertStringTrimLeft("UTF8_BINARY", "İ", "i", "İ");
- assertStringTrimLeft("UTF8_BINARY", "İ", "\u0307", "İ");
- assertStringTrimLeft("UTF8_BINARY", "Ixİ", "i\u0307", "Ixİ");
- assertStringTrimLeft("UTF8_BINARY", "IXİ", "ix\u0307", "IXİ");
- assertStringTrimLeft("UTF8_BINARY", "xi\u0307", "\u0307IX", "xi\u0307");
- assertStringTrimRight("UTF8_BINARY", "i", "i", "");
- assertStringTrimRight("UTF8_BINARY", "iii", "I", "iii");
- assertStringTrimRight("UTF8_BINARY", "I", "iii", "I");
- assertStringTrimRight("UTF8_BINARY", "ixi", "i", "ix");
- assertStringTrimRight("UTF8_BINARY", "i", "İ", "i");
- assertStringTrimRight("UTF8_BINARY", "i\u0307", "İ", "i\u0307");
- assertStringTrimRight("UTF8_BINARY", "i\u0307", "i", "i\u0307");
- assertStringTrimRight("UTF8_BINARY", "i\u0307", "\u0307", "i");
- assertStringTrimRight("UTF8_BINARY", "i\u0307", "i\u0307", "");
- assertStringTrimRight("UTF8_BINARY", "i\u0307i\u0307", "i\u0307", "");
- assertStringTrimRight("UTF8_BINARY", "i\u0307\u0307", "i\u0307", "");
- assertStringTrimRight("UTF8_BINARY", "i\u0307i", "i\u0307", "");
- assertStringTrimRight("UTF8_BINARY", "i\u0307i", "İ", "i\u0307i");
- assertStringTrimRight("UTF8_BINARY", "i\u0307İ", "i\u0307", "i\u0307İ");
- assertStringTrimRight("UTF8_BINARY", "i\u0307İ", "İ", "i\u0307");
- assertStringTrimRight("UTF8_BINARY", "İ", "İ", "");
- assertStringTrimRight("UTF8_BINARY", "IXi", "İ", "IXi");
- assertStringTrimRight("UTF8_BINARY", "ix\u0307", "Ixİ", "ix\u0307");
- assertStringTrimRight("UTF8_BINARY", "i\u0307x", "IXİ", "i\u0307x");
- assertStringTrimRight("UTF8_BINARY", "i\u0307x", "ix\u0307İ", "");
- assertStringTrimRight("UTF8_BINARY", "İ", "i", "İ");
- assertStringTrimRight("UTF8_BINARY", "İ", "\u0307", "İ");
- assertStringTrimRight("UTF8_BINARY", "Ixİ", "i\u0307", "Ixİ");
- assertStringTrimRight("UTF8_BINARY", "IXİ", "ix\u0307", "IXİ");
- assertStringTrimRight("UTF8_BINARY", "xi\u0307", "\u0307IX", "xi");
- // One-to-many case mapping - UTF8_LCASE.
assertStringTrim("UTF8_LCASE", "i", "i", "");
assertStringTrim("UTF8_LCASE", "iii", "I", "");
assertStringTrim("UTF8_LCASE", "I", "iii", "");
assertStringTrim("UTF8_LCASE", "ixi", "i", "x");
assertStringTrim("UTF8_LCASE", "i", "İ", "i");
assertStringTrim("UTF8_LCASE", "i\u0307", "İ", "");
+ assertStringTrim("UTF8_LCASE", "ii\u0307", "İi", "");
+ assertStringTrim("UTF8_LCASE", "iii\u0307", "İi", "");
+ assertStringTrim("UTF8_LCASE", "iiii\u0307", "iİ", "");
+ assertStringTrim("UTF8_LCASE", "ii\u0307ii\u0307", "iİ", "");
assertStringTrim("UTF8_LCASE", "i\u0307", "i", "\u0307");
assertStringTrim("UTF8_LCASE", "i\u0307", "\u0307", "i");
assertStringTrim("UTF8_LCASE", "i\u0307", "i\u0307", "");
@@ -1586,63 +2898,16 @@ public void testStringTrim() throws SparkException {
assertStringTrim("UTF8_LCASE", "Ixİ", "i\u0307", "xİ");
assertStringTrim("UTF8_LCASE", "IXİ", "ix\u0307", "İ");
assertStringTrim("UTF8_LCASE", "xi\u0307", "\u0307IX", "");
- assertStringTrimLeft("UTF8_LCASE", "i", "i", "");
- assertStringTrimLeft("UTF8_LCASE", "iii", "I", "");
- assertStringTrimLeft("UTF8_LCASE", "I", "iii", "");
- assertStringTrimLeft("UTF8_LCASE", "ixi", "i", "xi");
- assertStringTrimLeft("UTF8_LCASE", "i", "İ", "i");
- assertStringTrimLeft("UTF8_LCASE", "i\u0307", "İ", "");
- assertStringTrimLeft("UTF8_LCASE", "i\u0307", "i", "\u0307");
- assertStringTrimLeft("UTF8_LCASE", "i\u0307", "\u0307", "i\u0307");
- assertStringTrimLeft("UTF8_LCASE", "i\u0307", "i\u0307", "");
- assertStringTrimLeft("UTF8_LCASE", "i\u0307i\u0307", "i\u0307", "");
- assertStringTrimLeft("UTF8_LCASE", "i\u0307\u0307", "i\u0307", "");
- assertStringTrimLeft("UTF8_LCASE", "i\u0307i", "i\u0307", "");
- assertStringTrimLeft("UTF8_LCASE", "i\u0307i", "İ", "i");
- assertStringTrimLeft("UTF8_LCASE", "i\u0307İ", "i\u0307", "İ");
- assertStringTrimLeft("UTF8_LCASE", "i\u0307İ", "İ", "");
- assertStringTrimLeft("UTF8_LCASE", "İ", "İ", "");
- assertStringTrimLeft("UTF8_LCASE", "IXi", "İ", "IXi");
- assertStringTrimLeft("UTF8_LCASE", "ix\u0307", "Ixİ", "\u0307");
- assertStringTrimLeft("UTF8_LCASE", "i\u0307x", "IXİ", "");
- assertStringTrimLeft("UTF8_LCASE", "i\u0307x", "I\u0307xİ", "");
- assertStringTrimLeft("UTF8_LCASE", "İ", "i", "İ");
- assertStringTrimLeft("UTF8_LCASE", "İ", "\u0307", "İ");
- assertStringTrimLeft("UTF8_LCASE", "Ixİ", "i\u0307", "xİ");
- assertStringTrimLeft("UTF8_LCASE", "IXİ", "ix\u0307", "İ");
- assertStringTrimLeft("UTF8_LCASE", "xi\u0307", "\u0307IX", "");
- assertStringTrimRight("UTF8_LCASE", "i", "i", "");
- assertStringTrimRight("UTF8_LCASE", "iii", "I", "");
- assertStringTrimRight("UTF8_LCASE", "I", "iii", "");
- assertStringTrimRight("UTF8_LCASE", "ixi", "i", "ix");
- assertStringTrimRight("UTF8_LCASE", "i", "İ", "i");
- assertStringTrimRight("UTF8_LCASE", "i\u0307", "İ", "");
- assertStringTrimRight("UTF8_LCASE", "i\u0307", "i", "i\u0307");
- assertStringTrimRight("UTF8_LCASE", "i\u0307", "\u0307", "i");
- assertStringTrimRight("UTF8_LCASE", "i\u0307", "i\u0307", "");
- assertStringTrimRight("UTF8_LCASE", "i\u0307i\u0307", "i\u0307", "");
- assertStringTrimRight("UTF8_LCASE", "i\u0307\u0307", "i\u0307", "");
- assertStringTrimRight("UTF8_LCASE", "i\u0307i", "i\u0307", "");
- assertStringTrimRight("UTF8_LCASE", "i\u0307i", "İ", "i\u0307i");
- assertStringTrimRight("UTF8_LCASE", "i\u0307İ", "i\u0307", "i\u0307İ");
- assertStringTrimRight("UTF8_LCASE", "i\u0307İ", "İ", "");
- assertStringTrimRight("UTF8_LCASE", "İ", "İ", "");
- assertStringTrimRight("UTF8_LCASE", "IXi", "İ", "IXi");
- assertStringTrimRight("UTF8_LCASE", "ix\u0307", "Ixİ", "ix\u0307");
- assertStringTrimRight("UTF8_LCASE", "i\u0307x", "IXİ", "");
- assertStringTrimRight("UTF8_LCASE", "i\u0307x", "I\u0307xİ", "");
- assertStringTrimRight("UTF8_LCASE", "İ", "i", "İ");
- assertStringTrimRight("UTF8_LCASE", "İ", "\u0307", "İ");
- assertStringTrimRight("UTF8_LCASE", "Ixİ", "i\u0307", "Ixİ");
- assertStringTrimRight("UTF8_LCASE", "IXİ", "ix\u0307", "IXİ");
- assertStringTrimRight("UTF8_LCASE", "xi\u0307", "\u0307IX", "");
- // One-to-many case mapping - UNICODE.
assertStringTrim("UNICODE", "i", "i", "");
assertStringTrim("UNICODE", "iii", "I", "iii");
assertStringTrim("UNICODE", "I", "iii", "I");
assertStringTrim("UNICODE", "ixi", "i", "x");
assertStringTrim("UNICODE", "i", "İ", "i");
assertStringTrim("UNICODE", "i\u0307", "İ", "i\u0307");
+ assertStringTrim("UNICODE", "ii\u0307", "İi", "i\u0307");
+ assertStringTrim("UNICODE", "iii\u0307", "İi", "i\u0307");
+ assertStringTrim("UNICODE", "iiii\u0307", "iİ", "i\u0307");
+ assertStringTrim("UNICODE", "ii\u0307ii\u0307", "iİ", "i\u0307ii\u0307");
assertStringTrim("UNICODE", "i\u0307", "i", "i\u0307");
assertStringTrim("UNICODE", "i\u0307", "\u0307", "i\u0307");
assertStringTrim("UNICODE", "i\u0307", "i\u0307", "i\u0307");
@@ -1663,65 +2928,16 @@ public void testStringTrim() throws SparkException {
assertStringTrim("UNICODE", "Ixİ", "i\u0307", "Ixİ");
assertStringTrim("UNICODE", "IXİ", "ix\u0307", "IXİ");
assertStringTrim("UNICODE", "xi\u0307", "\u0307IX", "xi\u0307");
- assertStringTrimLeft("UNICODE", "i", "i", "");
- assertStringTrimLeft("UNICODE", "iii", "I", "iii");
- assertStringTrimLeft("UNICODE", "I", "iii", "I");
- assertStringTrimLeft("UNICODE", "ixi", "i", "xi");
- assertStringTrimLeft("UNICODE", "i", "İ", "i");
- assertStringTrimLeft("UNICODE", "i\u0307", "İ", "i\u0307");
- assertStringTrimLeft("UNICODE", "i\u0307", "i", "i\u0307");
- assertStringTrimLeft("UNICODE", "i\u0307", "\u0307", "i\u0307");
- assertStringTrimLeft("UNICODE", "i\u0307", "i\u0307", "i\u0307");
- assertStringTrimLeft("UNICODE", "i\u0307i\u0307", "i\u0307", "i\u0307i\u0307");
- assertStringTrimLeft("UNICODE", "i\u0307\u0307", "i\u0307", "i\u0307\u0307");
- assertStringTrimLeft("UNICODE", "i\u0307i", "i\u0307", "i\u0307i");
- assertStringTrimLeft("UNICODE", "i\u0307i", "İ", "i\u0307i");
- assertStringTrimLeft("UNICODE", "i\u0307İ", "i\u0307", "i\u0307İ");
- assertStringTrimLeft("UNICODE", "i\u0307İ", "İ", "i\u0307İ");
- assertStringTrimLeft("UNICODE", "İ", "İ", "");
- assertStringTrimLeft("UNICODE", "IXi", "İ", "IXi");
- assertStringTrimLeft("UNICODE", "ix\u0307", "Ixİ", "ix\u0307");
- assertStringTrimLeft("UNICODE", "i\u0307x", "IXİ", "i\u0307x");
- assertStringTrimLeft("UNICODE", "i\u0307x", "ix\u0307İ", "i\u0307x");
- assertStringTrimLeft("UNICODE", "İ", "i", "İ");
- assertStringTrimLeft("UNICODE", "İ", "\u0307", "İ");
- assertStringTrimLeft("UNICODE", "i\u0307", "i\u0307", "i\u0307");
- assertStringTrimLeft("UNICODE", "Ixİ", "i\u0307", "Ixİ");
- assertStringTrimLeft("UNICODE", "IXİ", "ix\u0307", "IXİ");
- assertStringTrimLeft("UNICODE", "xi\u0307", "\u0307IX", "xi\u0307");
- assertStringTrimRight("UNICODE", "i", "i", "");
- assertStringTrimRight("UNICODE", "iii", "I", "iii");
- assertStringTrimRight("UNICODE", "I", "iii", "I");
- assertStringTrimRight("UNICODE", "ixi", "i", "ix");
- assertStringTrimRight("UNICODE", "i", "İ", "i");
- assertStringTrimRight("UNICODE", "i\u0307", "İ", "i\u0307");
- assertStringTrimRight("UNICODE", "i\u0307", "i", "i\u0307");
- assertStringTrimRight("UNICODE", "i\u0307", "\u0307", "i\u0307");
- assertStringTrimRight("UNICODE", "i\u0307", "i\u0307", "i\u0307");
- assertStringTrimRight("UNICODE", "i\u0307i\u0307", "i\u0307", "i\u0307i\u0307");
- assertStringTrimRight("UNICODE", "i\u0307\u0307", "i\u0307", "i\u0307\u0307");
- assertStringTrimRight("UNICODE", "i\u0307i", "i\u0307", "i\u0307");
- assertStringTrimRight("UNICODE", "i\u0307i", "İ", "i\u0307i");
- assertStringTrimRight("UNICODE", "i\u0307İ", "i\u0307", "i\u0307İ");
- assertStringTrimRight("UNICODE", "i\u0307İ", "İ", "i\u0307");
- assertStringTrimRight("UNICODE", "İ", "İ", "");
- assertStringTrimRight("UNICODE", "IXi", "İ", "IXi");
- assertStringTrimRight("UNICODE", "ix\u0307", "Ixİ", "ix\u0307");
- assertStringTrimRight("UNICODE", "i\u0307x", "IXİ", "i\u0307x");
- assertStringTrimRight("UNICODE", "i\u0307x", "ix\u0307İ", "i\u0307");
- assertStringTrimRight("UNICODE", "İ", "i", "İ");
- assertStringTrimRight("UNICODE", "İ", "\u0307", "İ");
- assertStringTrimRight("UNICODE", "i\u0307", "i\u0307", "i\u0307");
- assertStringTrimRight("UNICODE", "Ixİ", "i\u0307", "Ixİ");
- assertStringTrimRight("UNICODE", "IXİ", "ix\u0307", "IXİ");
- assertStringTrimRight("UNICODE", "xi\u0307", "\u0307IX", "xi\u0307");
- // One-to-many case mapping - UNICODE_CI.
assertStringTrim("UNICODE_CI", "i", "i", "");
assertStringTrim("UNICODE_CI", "iii", "I", "");
assertStringTrim("UNICODE_CI", "I", "iii", "");
assertStringTrim("UNICODE_CI", "ixi", "i", "x");
assertStringTrim("UNICODE_CI", "i", "İ", "i");
assertStringTrim("UNICODE_CI", "i\u0307", "İ", "");
+ assertStringTrim("UNICODE_CI", "ii\u0307", "İi", "");
+ assertStringTrim("UNICODE_CI", "iii\u0307", "İi", "");
+ assertStringTrim("UNICODE_CI", "iiii\u0307", "iİ", "");
+ assertStringTrim("UNICODE_CI", "ii\u0307ii\u0307", "iİ", "");
assertStringTrim("UNICODE_CI", "i\u0307", "i", "i\u0307");
assertStringTrim("UNICODE_CI", "i\u0307", "\u0307", "i\u0307");
assertStringTrim("UNICODE_CI", "i\u0307", "i\u0307", "i\u0307");
@@ -1742,12 +2958,282 @@ public void testStringTrim() throws SparkException {
assertStringTrim("UNICODE_CI", "Ixİ", "i\u0307", "xİ");
assertStringTrim("UNICODE_CI", "IXİ", "ix\u0307", "İ");
assertStringTrim("UNICODE_CI", "xi\u0307", "\u0307IX", "i\u0307");
+ // Conditional case mapping (e.g. Greek sigmas).
+ assertStringTrim("UTF8_BINARY", "ςxς", "σ", "ςxς");
+ assertStringTrim("UTF8_BINARY", "ςxς", "ς", "x");
+ assertStringTrim("UTF8_BINARY", "ςxς", "Σ", "ςxς");
+ assertStringTrim("UTF8_BINARY", "σxσ", "σ", "x");
+ assertStringTrim("UTF8_BINARY", "σxσ", "ς", "σxσ");
+ assertStringTrim("UTF8_BINARY", "σxσ", "Σ", "σxσ");
+ assertStringTrim("UTF8_BINARY", "ΣxΣ", "σ", "ΣxΣ");
+ assertStringTrim("UTF8_BINARY", "ΣxΣ", "ς", "ΣxΣ");
+ assertStringTrim("UTF8_BINARY", "ΣxΣ", "Σ", "x");
+ assertStringTrim("UTF8_LCASE", "ςxς", "σ", "x");
+ assertStringTrim("UTF8_LCASE", "ςxς", "ς", "x");
+ assertStringTrim("UTF8_LCASE", "ςxς", "Σ", "x");
+ assertStringTrim("UTF8_LCASE", "σxσ", "σ", "x");
+ assertStringTrim("UTF8_LCASE", "σxσ", "ς", "x");
+ assertStringTrim("UTF8_LCASE", "σxσ", "Σ", "x");
+ assertStringTrim("UTF8_LCASE", "ΣxΣ", "σ", "x");
+ assertStringTrim("UTF8_LCASE", "ΣxΣ", "ς", "x");
+ assertStringTrim("UTF8_LCASE", "ΣxΣ", "Σ", "x");
+ assertStringTrim("UNICODE", "ςxς", "σ", "ςxς");
+ assertStringTrim("UNICODE", "ςxς", "ς", "x");
+ assertStringTrim("UNICODE", "ςxς", "Σ", "ςxς");
+ assertStringTrim("UNICODE", "σxσ", "σ", "x");
+ assertStringTrim("UNICODE", "σxσ", "ς", "σxσ");
+ assertStringTrim("UNICODE", "σxσ", "Σ", "σxσ");
+ assertStringTrim("UNICODE", "ΣxΣ", "σ", "ΣxΣ");
+ assertStringTrim("UNICODE", "ΣxΣ", "ς", "ΣxΣ");
+ assertStringTrim("UNICODE", "ΣxΣ", "Σ", "x");
+ assertStringTrim("UNICODE_CI", "ςxς", "σ", "x");
+ assertStringTrim("UNICODE_CI", "ςxς", "ς", "x");
+ assertStringTrim("UNICODE_CI", "ςxς", "Σ", "x");
+ assertStringTrim("UNICODE_CI", "σxσ", "σ", "x");
+ assertStringTrim("UNICODE_CI", "σxσ", "ς", "x");
+ assertStringTrim("UNICODE_CI", "σxσ", "Σ", "x");
+ assertStringTrim("UNICODE_CI", "ΣxΣ", "σ", "x");
+ assertStringTrim("UNICODE_CI", "ΣxΣ", "ς", "x");
+ assertStringTrim("UNICODE_CI", "ΣxΣ", "Σ", "x");
+ // Unicode normalization.
+ assertStringTrim("UTF8_BINARY", "åβγδa\u030A", "å", "βγδa\u030A");
+ assertStringTrim("UTF8_LCASE", "åβγδa\u030A", "Å", "βγδa\u030A");
+ assertStringTrim("UNICODE", "åβγδa\u030A", "å", "βγδ");
+ assertStringTrim("UNICODE_CI", "åβγδa\u030A", "Å", "βγδ");
+ // Surrogate pairs.
+ assertStringTrim("UTF8_BINARY", "a🙃b🙃c", "🙃", "a🙃b🙃c");
+ assertStringTrim("UTF8_LCASE", "a🙃b🙃c", "🙃", "a🙃b🙃c");
+ assertStringTrim("UNICODE", "a🙃b🙃c", "🙃", "a🙃b🙃c");
+ assertStringTrim("UNICODE_CI", "a🙃b🙃c", "🙃", "a🙃b🙃c");
+ assertStringTrim("UTF8_BINARY", "a🙃b🙃c", "ac", "🙃b🙃");
+ assertStringTrim("UTF8_LCASE", "a🙃b🙃c", "ac", "🙃b🙃");
+ assertStringTrim("UNICODE", "a🙃b🙃c", "ac", "🙃b🙃");
+ assertStringTrim("UNICODE_CI", "a🙃b🙃c", "ac", "🙃b🙃");
+ assertStringTrim("UTF8_BINARY", "a🙃b🙃c", "a🙃c", "b");
+ assertStringTrim("UTF8_LCASE", "a🙃b🙃c", "a🙃c", "b");
+ assertStringTrim("UNICODE", "a🙃b🙃c", "a🙃c", "b");
+ assertStringTrim("UNICODE_CI", "a🙃b🙃c", "a🙃c", "b");
+ assertStringTrim("UTF8_BINARY", "a🙃b🙃c", "abc🙃", "");
+ assertStringTrim("UTF8_LCASE", "a🙃b🙃c", "abc🙃", "");
+ assertStringTrim("UNICODE", "a🙃b🙃c", "abc🙃", "");
+ assertStringTrim("UNICODE_CI", "a🙃b🙃c", "abc🙃", "");
+ assertStringTrim("UTF8_BINARY", "😀😆😃😄", "😀😄", "😆😃");
+ assertStringTrim("UTF8_LCASE", "😀😆😃😄", "😀😄", "😆😃");
+ assertStringTrim("UNICODE", "😀😆😃😄", "😀😄", "😆😃");
+ assertStringTrim("UNICODE_CI", "😀😆😃😄", "😀😄", "😆😃");
+ assertStringTrim("UTF8_BINARY", "😀😆😃😄", "😃😄", "😀😆");
+ assertStringTrim("UTF8_LCASE", "😀😆😃😄", "😃😄", "😀😆");
+ assertStringTrim("UNICODE", "😀😆😃😄", "😃😄", "😀😆");
+ assertStringTrim("UNICODE_CI", "😀😆😃😄", "😃😄", "😀😆");
+ assertStringTrim("UTF8_BINARY", "😀😆😃😄", "😀😆😃😄", "");
+ assertStringTrim("UTF8_LCASE", "😀😆😃😄", "😀😆😃😄", "");
+ assertStringTrim("UNICODE", "😀😆😃😄", "😀😆😃😄", "");
+ assertStringTrim("UNICODE_CI", "😀😆😃😄", "😀😆😃😄", "");
+ assertStringTrim("UTF8_BINARY", "𐐅", "𐐅", "");
+ assertStringTrim("UTF8_LCASE", "𐐅", "𐐅", "");
+ assertStringTrim("UNICODE", "𐐅", "𐐅", "");
+ assertStringTrim("UNICODE_CI", "𐐅", "𐐅", "");
+ assertStringTrim("UTF8_BINARY", "𐐅", "𐐭", "𐐅");
+ assertStringTrim("UTF8_LCASE", "𐐅", "𐐭", "");
+ assertStringTrim("UNICODE", "𐐅", "𐐭", "𐐅");
+ assertStringTrim("UNICODE_CI", "𐐅", "𐐭", "");
+ assertStringTrim("UTF8_BINARY", "𝔸", "𝔸", "");
+ assertStringTrim("UTF8_LCASE", "𝔸", "𝔸", "");
+ assertStringTrim("UNICODE", "𝔸", "𝔸", "");
+ assertStringTrim("UNICODE_CI", "𝔸", "𝔸", "");
+ assertStringTrim("UTF8_BINARY", "𝔸", "A", "𝔸");
+ assertStringTrim("UTF8_LCASE", "𝔸", "A", "𝔸");
+ assertStringTrim("UNICODE", "𝔸", "A", "𝔸");
+ assertStringTrim("UNICODE_CI", "𝔸", "A", "");
+ assertStringTrim("UTF8_BINARY", "𝔸", "a", "𝔸");
+ assertStringTrim("UTF8_LCASE", "𝔸", "a", "𝔸");
+ assertStringTrim("UNICODE", "𝔸", "a", "𝔸");
+ assertStringTrim("UNICODE_CI", "𝔸", "a", "");
+ }
+
+ /**
+ * Verify the behaviour of the `StringTrimLeft` collation support class.
+ */
+
+ private void assertStringTrimLeft(String collationName, String sourceString, String trimString,
+ String expected) throws SparkException {
+ // Prepare the input and expected result.
+ int collationId = CollationFactory.collationNameToId(collationName);
+ UTF8String src = UTF8String.fromString(sourceString);
+ UTF8String trim = UTF8String.fromString(trimString);
+ UTF8String result;
+
+ if (trimString == null) {
+ // Trim string is ASCII space.
+ result = CollationSupport.StringTrimLeft.exec(src);
+ } else {
+ // Trim string is specified.
+ result = CollationSupport.StringTrimLeft.exec(src, trim, collationId);
+ }
+
+ // Test that StringTrimLeft result is as expected.
+ assertEquals(UTF8String.fromString(expected), result);
+ }
+
+ @Test
+ public void testStringTrimLeft() throws SparkException {
+ // Basic tests - UTF8_BINARY.
+ assertStringTrimLeft("UTF8_BINARY", "", "", "");
+ assertStringTrimLeft("UTF8_BINARY", "", "xyz", "");
+ assertStringTrimLeft("UTF8_BINARY", "asd", "", "asd");
+ assertStringTrimLeft("UTF8_BINARY", "asd", null, "asd");
+ assertStringTrimLeft("UTF8_BINARY", " asd ", null, "asd ");
+ assertStringTrimLeft("UTF8_BINARY", " a世a ", null, "a世a ");
+ assertStringTrimLeft("UTF8_BINARY", "asd", "x", "asd");
+ assertStringTrimLeft("UTF8_BINARY", "xxasdxx", "x", "asdxx");
+ assertStringTrimLeft("UTF8_BINARY", "xa世ax", "x", "a世ax");
+ // Basic tests - UTF8_LCASE.
+ assertStringTrimLeft("UTF8_LCASE", "", "", "");
+ assertStringTrimLeft("UTF8_LCASE", "", "xyz", "");
+ assertStringTrimLeft("UTF8_LCASE", "asd", "", "asd");
+ assertStringTrimLeft("UTF8_LCASE", "asd", null, "asd");
+ assertStringTrimLeft("UTF8_LCASE", " asd ", null, "asd ");
+ assertStringTrimLeft("UTF8_LCASE", " a世a ", null, "a世a ");
+ assertStringTrimLeft("UTF8_LCASE", "asd", "x", "asd");
+ assertStringTrimLeft("UTF8_LCASE", "xxasdxx", "x", "asdxx");
+ assertStringTrimLeft("UTF8_LCASE", "xa世ax", "x", "a世ax");
+ // Basic tests - UNICODE.
+ assertStringTrimLeft("UNICODE", "", "", "");
+ assertStringTrimLeft("UNICODE", "", "xyz", "");
+ assertStringTrimLeft("UNICODE", "asd", "", "asd");
+ assertStringTrimLeft("UNICODE", "asd", null, "asd");
+ assertStringTrimLeft("UNICODE", " asd ", null, "asd ");
+ assertStringTrimLeft("UNICODE", " a世a ", null, "a世a ");
+ assertStringTrimLeft("UNICODE", "asd", "x", "asd");
+ assertStringTrimLeft("UNICODE", "xxasdxx", "x", "asdxx");
+ assertStringTrimLeft("UNICODE", "xa世ax", "x", "a世ax");
+ // Basic tests - UNICODE_CI.
+ assertStringTrimLeft("UNICODE_CI", "", "", "");
+ assertStringTrimLeft("UNICODE_CI", "", "xyz", "");
+ assertStringTrimLeft("UNICODE_CI", "asd", "", "asd");
+ assertStringTrimLeft("UNICODE_CI", "asd", null, "asd");
+ assertStringTrimLeft("UNICODE_CI", " asd ", null, "asd ");
+ assertStringTrimLeft("UNICODE_CI", " a世a ", null, "a世a ");
+ assertStringTrimLeft("UNICODE_CI", "asd", "x", "asd");
+ assertStringTrimLeft("UNICODE_CI", "xxasdxx", "x", "asdxx");
+ assertStringTrimLeft("UNICODE_CI", "xa世ax", "x", "a世ax");
+ // Case variation.
+ assertStringTrimLeft("UTF8_BINARY", "ddsXXXaa", "asd", "XXXaa");
+ assertStringTrimLeft("UTF8_LCASE", "ddsXXXaa", "aSd", "XXXaa");
+ assertStringTrimLeft("UNICODE", "ddsXXXaa", "asd", "XXXaa");
+ assertStringTrimLeft("UNICODE_CI", "ddsXXXaa", "aSd", "XXXaa");
+ // One-to-many case mapping (e.g. German sharp S).
+ assertStringTrimLeft("UTF8_BINARY", "ẞaaaẞ", "ß", "ẞaaaẞ");
+ assertStringTrimLeft("UTF8_BINARY", "ßaaaß", "ẞ", "ßaaaß");
+ assertStringTrimLeft("UTF8_BINARY", "Ëaaaẞ", "Ëẞ", "aaaẞ");
+ assertStringTrimLeft("UTF8_LCASE", "ẞaaaẞ", "ß", "aaaẞ");
+ assertStringTrimLeft("UTF8_LCASE", "ßaaaß", "ẞ", "aaaß");
+ assertStringTrimLeft("UTF8_LCASE", "Ëaaaẞ", "Ëẞ", "aaaẞ");
+ assertStringTrimLeft("UNICODE", "ẞaaaẞ", "ß", "ẞaaaẞ");
+ assertStringTrimLeft("UNICODE", "ßaaaß", "ẞ", "ßaaaß");
+ assertStringTrimLeft("UNICODE", "Ëaaaẞ", "Ëẞ", "aaaẞ");
+ assertStringTrimLeft("UNICODE_CI", "ẞaaaẞ", "ß", "aaaẞ");
+ assertStringTrimLeft("UNICODE_CI", "ßaaaß", "ẞ", "aaaß");
+ assertStringTrimLeft("UNICODE_CI", "Ëaaaẞ", "Ëẞ", "aaaẞ");
+ // One-to-many case mapping (e.g. Turkish dotted I).
+ assertStringTrimLeft("UTF8_BINARY", "i", "i", "");
+ assertStringTrimLeft("UTF8_BINARY", "iii", "I", "iii");
+ assertStringTrimLeft("UTF8_BINARY", "I", "iii", "I");
+ assertStringTrimLeft("UTF8_BINARY", "ixi", "i", "xi");
+ assertStringTrimLeft("UTF8_BINARY", "i", "İ", "i");
+ assertStringTrimLeft("UTF8_BINARY", "i\u0307", "İ", "i\u0307");
+ assertStringTrimLeft("UTF8_BINARY", "ii\u0307", "İi", "\u0307");
+ assertStringTrimLeft("UTF8_BINARY", "iii\u0307", "İi", "\u0307");
+ assertStringTrimLeft("UTF8_BINARY", "iiii\u0307", "iİ", "\u0307");
+ assertStringTrimLeft("UTF8_BINARY", "ii\u0307ii\u0307", "iİ", "\u0307ii\u0307");
+ assertStringTrimLeft("UTF8_BINARY", "i\u0307", "i", "\u0307");
+ assertStringTrimLeft("UTF8_BINARY", "i\u0307", "\u0307", "i\u0307");
+ assertStringTrimLeft("UTF8_BINARY", "i\u0307", "i\u0307", "");
+ assertStringTrimLeft("UTF8_BINARY", "i\u0307i\u0307", "i\u0307", "");
+ assertStringTrimLeft("UTF8_BINARY", "i\u0307\u0307", "i\u0307", "");
+ assertStringTrimLeft("UTF8_BINARY", "i\u0307i", "i\u0307", "");
+ assertStringTrimLeft("UTF8_BINARY", "i\u0307i", "İ", "i\u0307i");
+ assertStringTrimLeft("UTF8_BINARY", "i\u0307İ", "i\u0307", "İ");
+ assertStringTrimLeft("UTF8_BINARY", "i\u0307İ", "İ", "i\u0307İ");
+ assertStringTrimLeft("UTF8_BINARY", "İ", "İ", "");
+ assertStringTrimLeft("UTF8_BINARY", "IXi", "İ", "IXi");
+ assertStringTrimLeft("UTF8_BINARY", "ix\u0307", "Ixİ", "ix\u0307");
+ assertStringTrimLeft("UTF8_BINARY", "i\u0307x", "IXİ", "i\u0307x");
+ assertStringTrimLeft("UTF8_BINARY", "i\u0307x", "ix\u0307İ", "");
+ assertStringTrimLeft("UTF8_BINARY", "İ", "i", "İ");
+ assertStringTrimLeft("UTF8_BINARY", "İ", "\u0307", "İ");
+ assertStringTrimLeft("UTF8_BINARY", "Ixİ", "i\u0307", "Ixİ");
+ assertStringTrimLeft("UTF8_BINARY", "IXİ", "ix\u0307", "IXİ");
+ assertStringTrimLeft("UTF8_BINARY", "xi\u0307", "\u0307IX", "xi\u0307");
+ assertStringTrimLeft("UTF8_LCASE", "i", "i", "");
+ assertStringTrimLeft("UTF8_LCASE", "iii", "I", "");
+ assertStringTrimLeft("UTF8_LCASE", "I", "iii", "");
+ assertStringTrimLeft("UTF8_LCASE", "ixi", "i", "xi");
+ assertStringTrimLeft("UTF8_LCASE", "i", "İ", "i");
+ assertStringTrimLeft("UTF8_LCASE", "i\u0307", "İ", "");
+ assertStringTrimLeft("UTF8_LCASE", "ii\u0307", "İi", "");
+ assertStringTrimLeft("UTF8_LCASE", "iii\u0307", "İi", "");
+ assertStringTrimLeft("UTF8_LCASE", "iiii\u0307", "iİ", "");
+ assertStringTrimLeft("UTF8_LCASE", "ii\u0307ii\u0307", "iİ", "");
+ assertStringTrimLeft("UTF8_LCASE", "i\u0307", "i", "\u0307");
+ assertStringTrimLeft("UTF8_LCASE", "i\u0307", "\u0307", "i\u0307");
+ assertStringTrimLeft("UTF8_LCASE", "i\u0307", "i\u0307", "");
+ assertStringTrimLeft("UTF8_LCASE", "i\u0307i\u0307", "i\u0307", "");
+ assertStringTrimLeft("UTF8_LCASE", "i\u0307\u0307", "i\u0307", "");
+ assertStringTrimLeft("UTF8_LCASE", "i\u0307i", "i\u0307", "");
+ assertStringTrimLeft("UTF8_LCASE", "i\u0307i", "İ", "i");
+ assertStringTrimLeft("UTF8_LCASE", "i\u0307İ", "i\u0307", "İ");
+ assertStringTrimLeft("UTF8_LCASE", "i\u0307İ", "İ", "");
+ assertStringTrimLeft("UTF8_LCASE", "İ", "İ", "");
+ assertStringTrimLeft("UTF8_LCASE", "IXi", "İ", "IXi");
+ assertStringTrimLeft("UTF8_LCASE", "ix\u0307", "Ixİ", "\u0307");
+ assertStringTrimLeft("UTF8_LCASE", "i\u0307x", "IXİ", "");
+ assertStringTrimLeft("UTF8_LCASE", "i\u0307x", "I\u0307xİ", "");
+ assertStringTrimLeft("UTF8_LCASE", "İ", "i", "İ");
+ assertStringTrimLeft("UTF8_LCASE", "İ", "\u0307", "İ");
+ assertStringTrimLeft("UTF8_LCASE", "Ixİ", "i\u0307", "xİ");
+ assertStringTrimLeft("UTF8_LCASE", "IXİ", "ix\u0307", "İ");
+ assertStringTrimLeft("UTF8_LCASE", "xi\u0307", "\u0307IX", "");
+ assertStringTrimLeft("UNICODE", "i", "i", "");
+ assertStringTrimLeft("UNICODE", "iii", "I", "iii");
+ assertStringTrimLeft("UNICODE", "I", "iii", "I");
+ assertStringTrimLeft("UNICODE", "ixi", "i", "xi");
+ assertStringTrimLeft("UNICODE", "i", "İ", "i");
+ assertStringTrimLeft("UNICODE", "i\u0307", "İ", "i\u0307");
+ assertStringTrimLeft("UNICODE", "ii\u0307", "İi", "i\u0307");
+ assertStringTrimLeft("UNICODE", "iii\u0307", "İi", "i\u0307");
+ assertStringTrimLeft("UNICODE", "iiii\u0307", "iİ", "i\u0307");
+ assertStringTrimLeft("UNICODE", "ii\u0307ii\u0307", "iİ", "i\u0307ii\u0307");
+ assertStringTrimLeft("UNICODE", "i\u0307", "i", "i\u0307");
+ assertStringTrimLeft("UNICODE", "i\u0307", "\u0307", "i\u0307");
+ assertStringTrimLeft("UNICODE", "i\u0307", "i\u0307", "i\u0307");
+ assertStringTrimLeft("UNICODE", "i\u0307i\u0307", "i\u0307", "i\u0307i\u0307");
+ assertStringTrimLeft("UNICODE", "i\u0307\u0307", "i\u0307", "i\u0307\u0307");
+ assertStringTrimLeft("UNICODE", "i\u0307i", "i\u0307", "i\u0307i");
+ assertStringTrimLeft("UNICODE", "i\u0307i", "İ", "i\u0307i");
+ assertStringTrimLeft("UNICODE", "i\u0307İ", "i\u0307", "i\u0307İ");
+ assertStringTrimLeft("UNICODE", "i\u0307İ", "İ", "i\u0307İ");
+ assertStringTrimLeft("UNICODE", "İ", "İ", "");
+ assertStringTrimLeft("UNICODE", "IXi", "İ", "IXi");
+ assertStringTrimLeft("UNICODE", "ix\u0307", "Ixİ", "ix\u0307");
+ assertStringTrimLeft("UNICODE", "i\u0307x", "IXİ", "i\u0307x");
+ assertStringTrimLeft("UNICODE", "i\u0307x", "ix\u0307İ", "i\u0307x");
+ assertStringTrimLeft("UNICODE", "İ", "i", "İ");
+ assertStringTrimLeft("UNICODE", "İ", "\u0307", "İ");
+ assertStringTrimLeft("UNICODE", "i\u0307", "i\u0307", "i\u0307");
+ assertStringTrimLeft("UNICODE", "Ixİ", "i\u0307", "Ixİ");
+ assertStringTrimLeft("UNICODE", "IXİ", "ix\u0307", "IXİ");
+ assertStringTrimLeft("UNICODE", "xi\u0307", "\u0307IX", "xi\u0307");
assertStringTrimLeft("UNICODE_CI", "i", "i", "");
assertStringTrimLeft("UNICODE_CI", "iii", "I", "");
assertStringTrimLeft("UNICODE_CI", "I", "iii", "");
assertStringTrimLeft("UNICODE_CI", "ixi", "i", "xi");
assertStringTrimLeft("UNICODE_CI", "i", "İ", "i");
assertStringTrimLeft("UNICODE_CI", "i\u0307", "İ", "");
+ assertStringTrimLeft("UNICODE_CI", "ii\u0307", "İi", "");
+ assertStringTrimLeft("UNICODE_CI", "iii\u0307", "İi", "");
+ assertStringTrimLeft("UNICODE_CI", "iiii\u0307", "iİ", "");
+ assertStringTrimLeft("UNICODE_CI", "ii\u0307ii\u0307", "iİ", "");
assertStringTrimLeft("UNICODE_CI", "i\u0307", "i", "i\u0307");
assertStringTrimLeft("UNICODE_CI", "i\u0307", "\u0307", "i\u0307");
assertStringTrimLeft("UNICODE_CI", "i\u0307", "i\u0307", "i\u0307");
@@ -1768,12 +3254,283 @@ public void testStringTrim() throws SparkException {
assertStringTrimLeft("UNICODE_CI", "Ixİ", "i\u0307", "xİ");
assertStringTrimLeft("UNICODE_CI", "IXİ", "ix\u0307", "İ");
assertStringTrimLeft("UNICODE_CI", "xi\u0307", "\u0307IX", "i\u0307");
+ // Conditional case mapping (e.g. Greek sigmas).
+ assertStringTrimLeft("UTF8_BINARY", "ςxς", "σ", "ςxς");
+ assertStringTrimLeft("UTF8_BINARY", "ςxς", "ς", "xς");
+ assertStringTrimLeft("UTF8_BINARY", "ςxς", "Σ", "ςxς");
+ assertStringTrimLeft("UTF8_BINARY", "σxσ", "σ", "xσ");
+ assertStringTrimLeft("UTF8_BINARY", "σxσ", "ς", "σxσ");
+ assertStringTrimLeft("UTF8_BINARY", "σxσ", "Σ", "σxσ");
+ assertStringTrimLeft("UTF8_BINARY", "ΣxΣ", "σ", "ΣxΣ");
+ assertStringTrimLeft("UTF8_BINARY", "ΣxΣ", "ς", "ΣxΣ");
+ assertStringTrimLeft("UTF8_BINARY", "ΣxΣ", "Σ", "xΣ");
+ assertStringTrimLeft("UTF8_LCASE", "ςxς", "σ", "xς");
+ assertStringTrimLeft("UTF8_LCASE", "ςxς", "ς", "xς");
+ assertStringTrimLeft("UTF8_LCASE", "ςxς", "Σ", "xς");
+ assertStringTrimLeft("UTF8_LCASE", "σxσ", "σ", "xσ");
+ assertStringTrimLeft("UTF8_LCASE", "σxσ", "ς", "xσ");
+ assertStringTrimLeft("UTF8_LCASE", "σxσ", "Σ", "xσ");
+ assertStringTrimLeft("UTF8_LCASE", "ΣxΣ", "σ", "xΣ");
+ assertStringTrimLeft("UTF8_LCASE", "ΣxΣ", "ς", "xΣ");
+ assertStringTrimLeft("UTF8_LCASE", "ΣxΣ", "Σ", "xΣ");
+ assertStringTrimLeft("UNICODE", "ςxς", "σ", "ςxς");
+ assertStringTrimLeft("UNICODE", "ςxς", "ς", "xς");
+ assertStringTrimLeft("UNICODE", "ςxς", "Σ", "ςxς");
+ assertStringTrimLeft("UNICODE", "σxσ", "σ", "xσ");
+ assertStringTrimLeft("UNICODE", "σxσ", "ς", "σxσ");
+ assertStringTrimLeft("UNICODE", "σxσ", "Σ", "σxσ");
+ assertStringTrimLeft("UNICODE", "ΣxΣ", "σ", "ΣxΣ");
+ assertStringTrimLeft("UNICODE", "ΣxΣ", "ς", "ΣxΣ");
+ assertStringTrimLeft("UNICODE", "ΣxΣ", "Σ", "xΣ");
+ assertStringTrimLeft("UNICODE_CI", "ςxς", "σ", "xς");
+ assertStringTrimLeft("UNICODE_CI", "ςxς", "ς", "xς");
+ assertStringTrimLeft("UNICODE_CI", "ςxς", "Σ", "xς");
+ assertStringTrimLeft("UNICODE_CI", "σxσ", "σ", "xσ");
+ assertStringTrimLeft("UNICODE_CI", "σxσ", "ς", "xσ");
+ assertStringTrimLeft("UNICODE_CI", "σxσ", "Σ", "xσ");
+ assertStringTrimLeft("UNICODE_CI", "ΣxΣ", "σ", "xΣ");
+ assertStringTrimLeft("UNICODE_CI", "ΣxΣ", "ς", "xΣ");
+ assertStringTrimLeft("UNICODE_CI", "ΣxΣ", "Σ", "xΣ");
+ // Unicode normalization.
+ assertStringTrimLeft("UTF8_BINARY", "åβγδa\u030A", "å", "βγδa\u030A");
+ assertStringTrimLeft("UTF8_LCASE", "åβγδa\u030A", "Å", "βγδa\u030A");
+ assertStringTrimLeft("UNICODE", "åβγδa\u030A", "å", "βγδa\u030A");
+ assertStringTrimLeft("UNICODE_CI", "åβγδa\u030A", "Å", "βγδa\u030A");
+ // Surrogate pairs.
+ assertStringTrimLeft("UTF8_BINARY", "a🙃b🙃c", "🙃", "a🙃b🙃c");
+ assertStringTrimLeft("UTF8_LCASE", "a🙃b🙃c", "🙃", "a🙃b🙃c");
+ assertStringTrimLeft("UNICODE", "a🙃b🙃c", "🙃", "a🙃b🙃c");
+ assertStringTrimLeft("UNICODE_CI", "a🙃b🙃c", "🙃", "a🙃b🙃c");
+ assertStringTrimLeft("UTF8_BINARY", "a🙃b🙃c", "a", "🙃b🙃c");
+ assertStringTrimLeft("UTF8_LCASE", "a🙃b🙃c", "a", "🙃b🙃c");
+ assertStringTrimLeft("UNICODE", "a🙃b🙃c", "a", "🙃b🙃c");
+ assertStringTrimLeft("UNICODE_CI", "a🙃b🙃c", "a", "🙃b🙃c");
+ assertStringTrimLeft("UTF8_BINARY", "a🙃b🙃c", "a🙃", "b🙃c");
+ assertStringTrimLeft("UTF8_LCASE", "a🙃b🙃c", "a🙃", "b🙃c");
+ assertStringTrimLeft("UNICODE", "a🙃b🙃c", "a🙃", "b🙃c");
+ assertStringTrimLeft("UNICODE_CI", "a🙃b🙃c", "a🙃", "b🙃c");
+ assertStringTrimLeft("UTF8_BINARY", "a🙃b🙃c", "a🙃b", "c");
+ assertStringTrimLeft("UTF8_LCASE", "a🙃b🙃c", "a🙃b", "c");
+ assertStringTrimLeft("UNICODE", "a🙃b🙃c", "a🙃b", "c");
+ assertStringTrimLeft("UNICODE_CI", "a🙃b🙃c", "a🙃b", "c");
+ assertStringTrimLeft("UTF8_BINARY", "a🙃b🙃c", "abc🙃", "");
+ assertStringTrimLeft("UTF8_LCASE", "a🙃b🙃c", "abc🙃", "");
+ assertStringTrimLeft("UNICODE", "a🙃b🙃c", "abc🙃", "");
+ assertStringTrimLeft("UNICODE_CI", "a🙃b🙃c", "abc🙃", "");
+ assertStringTrimLeft("UTF8_BINARY", "😀😆😃😄", "😆😃", "😀😆😃😄");
+ assertStringTrimLeft("UTF8_LCASE", "😀😆😃😄", "😆😃", "😀😆😃😄");
+ assertStringTrimLeft("UNICODE", "😀😆😃😄", "😆😃", "😀😆😃😄");
+ assertStringTrimLeft("UNICODE_CI", "😀😆😃😄", "😆😃", "😀😆😃😄");
+ assertStringTrimLeft("UTF8_BINARY", "😀😆😃😄", "😀😆", "😃😄");
+ assertStringTrimLeft("UTF8_LCASE", "😀😆😃😄", "😀😆", "😃😄");
+ assertStringTrimLeft("UNICODE", "😀😆😃😄", "😀😆", "😃😄");
+ assertStringTrimLeft("UNICODE_CI", "😀😆😃😄", "😀😆", "😃😄");
+ assertStringTrimLeft("UTF8_BINARY", "😀😆😃😄", "😀😆😃😄", "");
+ assertStringTrimLeft("UTF8_LCASE", "😀😆😃😄", "😀😆😃😄", "");
+ assertStringTrimLeft("UNICODE", "😀😆😃😄", "😀😆😃😄", "");
+ assertStringTrimLeft("UNICODE_CI", "😀😆😃😄", "😀😆😃😄", "");
+ assertStringTrimLeft("UTF8_BINARY", "𐐅", "𐐅", "");
+ assertStringTrimLeft("UTF8_LCASE", "𐐅", "𐐅", "");
+ assertStringTrimLeft("UNICODE", "𐐅", "𐐅", "");
+ assertStringTrimLeft("UNICODE_CI", "𐐅", "𐐅", "");
+ assertStringTrimLeft("UTF8_BINARY", "𐐅", "𐐭", "𐐅");
+ assertStringTrimLeft("UTF8_LCASE", "𐐅", "𐐭", "");
+ assertStringTrimLeft("UNICODE", "𐐅", "𐐭", "𐐅");
+ assertStringTrimLeft("UNICODE_CI", "𐐅", "𐐭", "");
+ assertStringTrimLeft("UTF8_BINARY", "𝔸", "𝔸", "");
+ assertStringTrimLeft("UTF8_LCASE", "𝔸", "𝔸", "");
+ assertStringTrimLeft("UNICODE", "𝔸", "𝔸", "");
+ assertStringTrimLeft("UNICODE_CI", "𝔸", "𝔸", "");
+ assertStringTrimLeft("UTF8_BINARY", "𝔸", "A", "𝔸");
+ assertStringTrimLeft("UTF8_LCASE", "𝔸", "A", "𝔸");
+ assertStringTrimLeft("UNICODE", "𝔸", "A", "𝔸");
+ assertStringTrimLeft("UNICODE_CI", "𝔸", "A", "");
+ assertStringTrimLeft("UTF8_BINARY", "𝔸", "a", "𝔸");
+ assertStringTrimLeft("UTF8_LCASE", "𝔸", "a", "𝔸");
+ assertStringTrimLeft("UNICODE", "𝔸", "a", "𝔸");
+ assertStringTrimLeft("UNICODE_CI", "𝔸", "a", "");
+ }
+
+ /**
+ * Verify the behaviour of the `StringTrimRight` collation support class.
+ */
+
+ private void assertStringTrimRight(String collationName, String sourceString, String trimString,
+ String expected) throws SparkException {
+ // Prepare the input and expected result.
+ int collationId = CollationFactory.collationNameToId(collationName);
+ UTF8String src = UTF8String.fromString(sourceString);
+ UTF8String trim = UTF8String.fromString(trimString);
+ UTF8String result;
+
+ if (trimString == null) {
+ // Trim string is ASCII space.
+ result = CollationSupport.StringTrimRight.exec(src);
+ } else {
+ // Trim string is specified.
+ result = CollationSupport.StringTrimRight.exec(src, trim, collationId);
+ }
+
+ // Test that StringTrimRight result is as expected.
+ assertEquals(UTF8String.fromString(expected), result);
+ }
+
+ @Test
+ public void testStringTrimRight() throws SparkException {
+ // Basic tests.
+ assertStringTrimRight("UTF8_BINARY", "", "", "");
+ assertStringTrimRight("UTF8_BINARY", "", "xyz", "");
+ assertStringTrimRight("UTF8_BINARY", "asd", "", "asd");
+ assertStringTrimRight("UTF8_BINARY", "asd", null, "asd");
+ assertStringTrimRight("UTF8_BINARY", " asd ", null, " asd");
+ assertStringTrimRight("UTF8_BINARY", " a世a ", null, " a世a");
+ assertStringTrimRight("UTF8_BINARY", "asd", "x", "asd");
+ assertStringTrimRight("UTF8_BINARY", "xxasdxx", "x", "xxasd");
+ assertStringTrimRight("UTF8_BINARY", "xa世ax", "x", "xa世a");
+ assertStringTrimRight("UTF8_LCASE", "", "", "");
+ assertStringTrimRight("UTF8_LCASE", "", "xyz", "");
+ assertStringTrimRight("UTF8_LCASE", "asd", "", "asd");
+ assertStringTrimRight("UTF8_LCASE", "asd", null, "asd");
+ assertStringTrimRight("UTF8_LCASE", " asd ", null, " asd");
+ assertStringTrimRight("UTF8_LCASE", " a世a ", null, " a世a");
+ assertStringTrimRight("UTF8_LCASE", "asd", "x", "asd");
+ assertStringTrimRight("UTF8_LCASE", "xxasdxx", "x", "xxasd");
+ assertStringTrimRight("UTF8_LCASE", "xa世ax", "x", "xa世a");
+ assertStringTrimRight("UNICODE", "", "", "");
+ assertStringTrimRight("UNICODE", "", "xyz", "");
+ assertStringTrimRight("UNICODE", "asd", "", "asd");
+ assertStringTrimRight("UNICODE", "asd", null, "asd");
+ assertStringTrimRight("UNICODE", " asd ", null, " asd");
+ assertStringTrimRight("UNICODE", " a世a ", null, " a世a");
+ assertStringTrimRight("UNICODE", "asd", "x", "asd");
+ assertStringTrimRight("UNICODE", "xxasdxx", "x", "xxasd");
+ assertStringTrimRight("UNICODE", "xa世ax", "x", "xa世a");
+ assertStringTrimRight("UNICODE_CI", "", "", "");
+ assertStringTrimRight("UNICODE_CI", "", "xyz", "");
+ assertStringTrimRight("UNICODE_CI", "asd", "", "asd");
+ assertStringTrimRight("UNICODE_CI", "asd", null, "asd");
+ assertStringTrimRight("UNICODE_CI", " asd ", null, " asd");
+ assertStringTrimRight("UNICODE_CI", " a世a ", null, " a世a");
+ assertStringTrimRight("UNICODE_CI", "asd", "x", "asd");
+ assertStringTrimRight("UNICODE_CI", "xxasdxx", "x", "xxasd");
+ assertStringTrimRight("UNICODE_CI", "xa世ax", "x", "xa世a");
+ // Case variation.
+ assertStringTrimRight("UTF8_BINARY", "ddsXXXaa", "asd", "ddsXXX");
+ assertStringTrimRight("UTF8_LCASE", "ddsXXXaa", "AsD", "ddsXXX");
+ assertStringTrimRight("UNICODE", "ddsXXXaa", "asd", "ddsXXX");
+ assertStringTrimRight("UNICODE_CI", "ddsXXXaa", "AsD", "ddsXXX");
+ // One-to-many case mapping (e.g. German sharp S).
+ assertStringTrimRight("UTF8_BINARY", "ẞaaaẞ", "ß", "ẞaaaẞ");
+ assertStringTrimRight("UTF8_BINARY", "ßaaaß", "ẞ", "ßaaaß");
+ assertStringTrimRight("UTF8_BINARY", "Ëaaaẞ", "Ëẞ", "Ëaaa");
+ assertStringTrimRight("UTF8_LCASE", "ẞaaaẞ", "ß", "ẞaaa");
+ assertStringTrimRight("UTF8_LCASE", "ßaaaß", "ẞ", "ßaaa");
+ assertStringTrimRight("UTF8_LCASE", "Ëaaaẞ", "Ëẞ", "Ëaaa");
+ assertStringTrimRight("UNICODE", "ẞaaaẞ", "ß", "ẞaaaẞ");
+ assertStringTrimRight("UNICODE", "ßaaaß", "ẞ", "ßaaaß");
+ assertStringTrimRight("UNICODE", "Ëaaaẞ", "Ëẞ", "Ëaaa");
+ assertStringTrimRight("UNICODE_CI", "ẞaaaẞ", "ß", "ẞaaa");
+ assertStringTrimRight("UNICODE_CI", "ßaaaß", "ẞ", "ßaaa");
+ assertStringTrimRight("UNICODE_CI", "Ëaaaẞ", "Ëẞ", "Ëaaa");
+ // One-to-many case mapping (e.g. Turkish dotted I).
+ assertStringTrimRight("UTF8_BINARY", "i", "i", "");
+ assertStringTrimRight("UTF8_BINARY", "iii", "I", "iii");
+ assertStringTrimRight("UTF8_BINARY", "I", "iii", "I");
+ assertStringTrimRight("UTF8_BINARY", "ixi", "i", "ix");
+ assertStringTrimRight("UTF8_BINARY", "i", "İ", "i");
+ assertStringTrimRight("UTF8_BINARY", "i\u0307", "İ", "i\u0307");
+ assertStringTrimRight("UTF8_BINARY", "ii\u0307", "İi", "ii\u0307");
+ assertStringTrimRight("UTF8_BINARY", "iii\u0307", "İi", "iii\u0307");
+ assertStringTrimRight("UTF8_BINARY", "iiii\u0307", "iİ", "iiii\u0307");
+ assertStringTrimRight("UTF8_BINARY", "ii\u0307ii\u0307", "iİ", "ii\u0307ii\u0307");
+ assertStringTrimRight("UTF8_BINARY", "i\u0307", "i", "i\u0307");
+ assertStringTrimRight("UTF8_BINARY", "i\u0307", "\u0307", "i");
+ assertStringTrimRight("UTF8_BINARY", "i\u0307", "i\u0307", "");
+ assertStringTrimRight("UTF8_BINARY", "i\u0307i\u0307", "i\u0307", "");
+ assertStringTrimRight("UTF8_BINARY", "i\u0307\u0307", "i\u0307", "");
+ assertStringTrimRight("UTF8_BINARY", "i\u0307i", "i\u0307", "");
+ assertStringTrimRight("UTF8_BINARY", "i\u0307i", "İ", "i\u0307i");
+ assertStringTrimRight("UTF8_BINARY", "i\u0307İ", "i\u0307", "i\u0307İ");
+ assertStringTrimRight("UTF8_BINARY", "i\u0307İ", "İ", "i\u0307");
+ assertStringTrimRight("UTF8_BINARY", "İ", "İ", "");
+ assertStringTrimRight("UTF8_BINARY", "IXi", "İ", "IXi");
+ assertStringTrimRight("UTF8_BINARY", "ix\u0307", "Ixİ", "ix\u0307");
+ assertStringTrimRight("UTF8_BINARY", "i\u0307x", "IXİ", "i\u0307x");
+ assertStringTrimRight("UTF8_BINARY", "i\u0307x", "ix\u0307İ", "");
+ assertStringTrimRight("UTF8_BINARY", "İ", "i", "İ");
+ assertStringTrimRight("UTF8_BINARY", "İ", "\u0307", "İ");
+ assertStringTrimRight("UTF8_BINARY", "Ixİ", "i\u0307", "Ixİ");
+ assertStringTrimRight("UTF8_BINARY", "IXİ", "ix\u0307", "IXİ");
+ assertStringTrimRight("UTF8_BINARY", "xi\u0307", "\u0307IX", "xi");
+ assertStringTrimRight("UTF8_LCASE", "i", "i", "");
+ assertStringTrimRight("UTF8_LCASE", "iii", "I", "");
+ assertStringTrimRight("UTF8_LCASE", "I", "iii", "");
+ assertStringTrimRight("UTF8_LCASE", "ixi", "i", "ix");
+ assertStringTrimRight("UTF8_LCASE", "i", "İ", "i");
+ assertStringTrimRight("UTF8_LCASE", "i\u0307", "İ", "");
+ assertStringTrimRight("UTF8_LCASE", "ii\u0307", "İi", "");
+ assertStringTrimRight("UTF8_LCASE", "iii\u0307", "İi", "");
+ assertStringTrimRight("UTF8_LCASE", "iiii\u0307", "iİ", "");
+ assertStringTrimRight("UTF8_LCASE", "ii\u0307ii\u0307", "iİ", "");
+ assertStringTrimRight("UTF8_LCASE", "i\u0307", "i", "i\u0307");
+ assertStringTrimRight("UTF8_LCASE", "i\u0307", "\u0307", "i");
+ assertStringTrimRight("UTF8_LCASE", "i\u0307", "i\u0307", "");
+ assertStringTrimRight("UTF8_LCASE", "i\u0307i\u0307", "i\u0307", "");
+ assertStringTrimRight("UTF8_LCASE", "i\u0307\u0307", "i\u0307", "");
+ assertStringTrimRight("UTF8_LCASE", "i\u0307i", "i\u0307", "");
+ assertStringTrimRight("UTF8_LCASE", "i\u0307i", "İ", "i\u0307i");
+ assertStringTrimRight("UTF8_LCASE", "i\u0307İ", "i\u0307", "i\u0307İ");
+ assertStringTrimRight("UTF8_LCASE", "i\u0307İ", "İ", "");
+ assertStringTrimRight("UTF8_LCASE", "İ", "İ", "");
+ assertStringTrimRight("UTF8_LCASE", "IXi", "İ", "IXi");
+ assertStringTrimRight("UTF8_LCASE", "ix\u0307", "Ixİ", "ix\u0307");
+ assertStringTrimRight("UTF8_LCASE", "i\u0307x", "IXİ", "");
+ assertStringTrimRight("UTF8_LCASE", "i\u0307x", "I\u0307xİ", "");
+ assertStringTrimRight("UTF8_LCASE", "İ", "i", "İ");
+ assertStringTrimRight("UTF8_LCASE", "İ", "\u0307", "İ");
+ assertStringTrimRight("UTF8_LCASE", "Ixİ", "i\u0307", "Ixİ");
+ assertStringTrimRight("UTF8_LCASE", "IXİ", "ix\u0307", "IXİ");
+ assertStringTrimRight("UTF8_LCASE", "xi\u0307", "\u0307IX", "");
+ assertStringTrimRight("UNICODE", "i", "i", "");
+ assertStringTrimRight("UNICODE", "iii", "I", "iii");
+ assertStringTrimRight("UNICODE", "I", "iii", "I");
+ assertStringTrimRight("UNICODE", "ixi", "i", "ix");
+ assertStringTrimRight("UNICODE", "i", "İ", "i");
+ assertStringTrimRight("UNICODE", "i\u0307", "İ", "i\u0307");
+ assertStringTrimRight("UTF8_BINARY", "ii\u0307", "İi", "ii\u0307");
+ assertStringTrimRight("UTF8_BINARY", "iii\u0307", "İi", "iii\u0307");
+ assertStringTrimRight("UTF8_BINARY", "iiii\u0307", "iİ", "iiii\u0307");
+ assertStringTrimRight("UTF8_BINARY", "ii\u0307ii\u0307", "iİ", "ii\u0307ii\u0307");
+ assertStringTrimRight("UNICODE", "i\u0307", "i", "i\u0307");
+ assertStringTrimRight("UNICODE", "i\u0307", "\u0307", "i\u0307");
+ assertStringTrimRight("UNICODE", "i\u0307", "i\u0307", "i\u0307");
+ assertStringTrimRight("UNICODE", "i\u0307i\u0307", "i\u0307", "i\u0307i\u0307");
+ assertStringTrimRight("UNICODE", "i\u0307\u0307", "i\u0307", "i\u0307\u0307");
+ assertStringTrimRight("UNICODE", "i\u0307i", "i\u0307", "i\u0307");
+ assertStringTrimRight("UNICODE", "i\u0307i", "İ", "i\u0307i");
+ assertStringTrimRight("UNICODE", "i\u0307İ", "i\u0307", "i\u0307İ");
+ assertStringTrimRight("UNICODE", "i\u0307İ", "İ", "i\u0307");
+ assertStringTrimRight("UNICODE", "İ", "İ", "");
+ assertStringTrimRight("UNICODE", "IXi", "İ", "IXi");
+ assertStringTrimRight("UNICODE", "ix\u0307", "Ixİ", "ix\u0307");
+ assertStringTrimRight("UNICODE", "i\u0307x", "IXİ", "i\u0307x");
+ assertStringTrimRight("UNICODE", "i\u0307x", "ix\u0307İ", "i\u0307");
+ assertStringTrimRight("UNICODE", "İ", "i", "İ");
+ assertStringTrimRight("UNICODE", "İ", "\u0307", "İ");
+ assertStringTrimRight("UNICODE", "i\u0307", "i\u0307", "i\u0307");
+ assertStringTrimRight("UNICODE", "Ixİ", "i\u0307", "Ixİ");
+ assertStringTrimRight("UNICODE", "IXİ", "ix\u0307", "IXİ");
+ assertStringTrimRight("UNICODE", "xi\u0307", "\u0307IX", "xi\u0307");
assertStringTrimRight("UNICODE_CI", "i", "i", "");
assertStringTrimRight("UNICODE_CI", "iii", "I", "");
assertStringTrimRight("UNICODE_CI", "I", "iii", "");
assertStringTrimRight("UNICODE_CI", "ixi", "i", "ix");
assertStringTrimRight("UNICODE_CI", "i", "İ", "i");
assertStringTrimRight("UNICODE_CI", "i\u0307", "İ", "");
+ assertStringTrimRight("UNICODE_CI", "ii\u0307", "İi", "");
+ assertStringTrimRight("UNICODE_CI", "iii\u0307", "İi", "");
+ assertStringTrimRight("UNICODE_CI", "iiii\u0307", "iİ", "");
+ assertStringTrimRight("UNICODE_CI", "ii\u0307ii\u0307", "iİ", "");
assertStringTrimRight("UNICODE_CI", "i\u0307", "i", "i\u0307");
assertStringTrimRight("UNICODE_CI", "i\u0307", "\u0307", "i\u0307");
assertStringTrimRight("UNICODE_CI", "i\u0307", "i\u0307", "i\u0307");
@@ -1791,29 +3548,10 @@ public void testStringTrim() throws SparkException {
assertStringTrimRight("UNICODE_CI", "İ", "i", "İ");
assertStringTrimRight("UNICODE_CI", "İ", "\u0307", "İ");
assertStringTrimRight("UNICODE_CI", "i\u0307", "i\u0307", "i\u0307");
- assertStringTrimRight("UNICODE_CI", "Ixİ", "i\u0307", "Ixİ");
- assertStringTrimRight("UNICODE_CI", "IXİ", "ix\u0307", "IXİ");
- assertStringTrimRight("UNICODE_CI", "xi\u0307", "\u0307IX", "xi\u0307");
-
- // Greek sigmas - UTF8_BINARY.
- assertStringTrim("UTF8_BINARY", "ςxς", "σ", "ςxς");
- assertStringTrim("UTF8_BINARY", "ςxς", "ς", "x");
- assertStringTrim("UTF8_BINARY", "ςxς", "Σ", "ςxς");
- assertStringTrim("UTF8_BINARY", "σxσ", "σ", "x");
- assertStringTrim("UTF8_BINARY", "σxσ", "ς", "σxσ");
- assertStringTrim("UTF8_BINARY", "σxσ", "Σ", "σxσ");
- assertStringTrim("UTF8_BINARY", "ΣxΣ", "σ", "ΣxΣ");
- assertStringTrim("UTF8_BINARY", "ΣxΣ", "ς", "ΣxΣ");
- assertStringTrim("UTF8_BINARY", "ΣxΣ", "Σ", "x");
- assertStringTrimLeft("UTF8_BINARY", "ςxς", "σ", "ςxς");
- assertStringTrimLeft("UTF8_BINARY", "ςxς", "ς", "xς");
- assertStringTrimLeft("UTF8_BINARY", "ςxς", "Σ", "ςxς");
- assertStringTrimLeft("UTF8_BINARY", "σxσ", "σ", "xσ");
- assertStringTrimLeft("UTF8_BINARY", "σxσ", "ς", "σxσ");
- assertStringTrimLeft("UTF8_BINARY", "σxσ", "Σ", "σxσ");
- assertStringTrimLeft("UTF8_BINARY", "ΣxΣ", "σ", "ΣxΣ");
- assertStringTrimLeft("UTF8_BINARY", "ΣxΣ", "ς", "ΣxΣ");
- assertStringTrimLeft("UTF8_BINARY", "ΣxΣ", "Σ", "xΣ");
+ assertStringTrimRight("UNICODE_CI", "Ixİ", "i\u0307", "Ixİ");
+ assertStringTrimRight("UNICODE_CI", "IXİ", "ix\u0307", "IXİ");
+ assertStringTrimRight("UNICODE_CI", "xi\u0307", "\u0307IX", "xi\u0307");
+ // Conditional case mapping (e.g. Greek sigmas).
assertStringTrimRight("UTF8_BINARY", "ςxς", "σ", "ςxς");
assertStringTrimRight("UTF8_BINARY", "ςxς", "ς", "ςx");
assertStringTrimRight("UTF8_BINARY", "ςxς", "Σ", "ςxς");
@@ -1823,25 +3561,6 @@ public void testStringTrim() throws SparkException {
assertStringTrimRight("UTF8_BINARY", "ΣxΣ", "σ", "ΣxΣ");
assertStringTrimRight("UTF8_BINARY", "ΣxΣ", "ς", "ΣxΣ");
assertStringTrimRight("UTF8_BINARY", "ΣxΣ", "Σ", "Σx");
- // Greek sigmas - UTF8_LCASE.
- assertStringTrim("UTF8_LCASE", "ςxς", "σ", "x");
- assertStringTrim("UTF8_LCASE", "ςxς", "ς", "x");
- assertStringTrim("UTF8_LCASE", "ςxς", "Σ", "x");
- assertStringTrim("UTF8_LCASE", "σxσ", "σ", "x");
- assertStringTrim("UTF8_LCASE", "σxσ", "ς", "x");
- assertStringTrim("UTF8_LCASE", "σxσ", "Σ", "x");
- assertStringTrim("UTF8_LCASE", "ΣxΣ", "σ", "x");
- assertStringTrim("UTF8_LCASE", "ΣxΣ", "ς", "x");
- assertStringTrim("UTF8_LCASE", "ΣxΣ", "Σ", "x");
- assertStringTrimLeft("UTF8_LCASE", "ςxς", "σ", "xς");
- assertStringTrimLeft("UTF8_LCASE", "ςxς", "ς", "xς");
- assertStringTrimLeft("UTF8_LCASE", "ςxς", "Σ", "xς");
- assertStringTrimLeft("UTF8_LCASE", "σxσ", "σ", "xσ");
- assertStringTrimLeft("UTF8_LCASE", "σxσ", "ς", "xσ");
- assertStringTrimLeft("UTF8_LCASE", "σxσ", "Σ", "xσ");
- assertStringTrimLeft("UTF8_LCASE", "ΣxΣ", "σ", "xΣ");
- assertStringTrimLeft("UTF8_LCASE", "ΣxΣ", "ς", "xΣ");
- assertStringTrimLeft("UTF8_LCASE", "ΣxΣ", "Σ", "xΣ");
assertStringTrimRight("UTF8_LCASE", "ςxς", "σ", "ςx");
assertStringTrimRight("UTF8_LCASE", "ςxς", "ς", "ςx");
assertStringTrimRight("UTF8_LCASE", "ςxς", "Σ", "ςx");
@@ -1851,25 +3570,6 @@ public void testStringTrim() throws SparkException {
assertStringTrimRight("UTF8_LCASE", "ΣxΣ", "σ", "Σx");
assertStringTrimRight("UTF8_LCASE", "ΣxΣ", "ς", "Σx");
assertStringTrimRight("UTF8_LCASE", "ΣxΣ", "Σ", "Σx");
- // Greek sigmas - UNICODE.
- assertStringTrim("UNICODE", "ςxς", "σ", "ςxς");
- assertStringTrim("UNICODE", "ςxς", "ς", "x");
- assertStringTrim("UNICODE", "ςxς", "Σ", "ςxς");
- assertStringTrim("UNICODE", "σxσ", "σ", "x");
- assertStringTrim("UNICODE", "σxσ", "ς", "σxσ");
- assertStringTrim("UNICODE", "σxσ", "Σ", "σxσ");
- assertStringTrim("UNICODE", "ΣxΣ", "σ", "ΣxΣ");
- assertStringTrim("UNICODE", "ΣxΣ", "ς", "ΣxΣ");
- assertStringTrim("UNICODE", "ΣxΣ", "Σ", "x");
- assertStringTrimLeft("UNICODE", "ςxς", "σ", "ςxς");
- assertStringTrimLeft("UNICODE", "ςxς", "ς", "xς");
- assertStringTrimLeft("UNICODE", "ςxς", "Σ", "ςxς");
- assertStringTrimLeft("UNICODE", "σxσ", "σ", "xσ");
- assertStringTrimLeft("UNICODE", "σxσ", "ς", "σxσ");
- assertStringTrimLeft("UNICODE", "σxσ", "Σ", "σxσ");
- assertStringTrimLeft("UNICODE", "ΣxΣ", "σ", "ΣxΣ");
- assertStringTrimLeft("UNICODE", "ΣxΣ", "ς", "ΣxΣ");
- assertStringTrimLeft("UNICODE", "ΣxΣ", "Σ", "xΣ");
assertStringTrimRight("UNICODE", "ςxς", "σ", "ςxς");
assertStringTrimRight("UNICODE", "ςxς", "ς", "ςx");
assertStringTrimRight("UNICODE", "ςxς", "Σ", "ςxς");
@@ -1879,25 +3579,6 @@ public void testStringTrim() throws SparkException {
assertStringTrimRight("UNICODE", "ΣxΣ", "σ", "ΣxΣ");
assertStringTrimRight("UNICODE", "ΣxΣ", "ς", "ΣxΣ");
assertStringTrimRight("UNICODE", "ΣxΣ", "Σ", "Σx");
- // Greek sigmas - UNICODE_CI.
- assertStringTrim("UNICODE_CI", "ςxς", "σ", "x");
- assertStringTrim("UNICODE_CI", "ςxς", "ς", "x");
- assertStringTrim("UNICODE_CI", "ςxς", "Σ", "x");
- assertStringTrim("UNICODE_CI", "σxσ", "σ", "x");
- assertStringTrim("UNICODE_CI", "σxσ", "ς", "x");
- assertStringTrim("UNICODE_CI", "σxσ", "Σ", "x");
- assertStringTrim("UNICODE_CI", "ΣxΣ", "σ", "x");
- assertStringTrim("UNICODE_CI", "ΣxΣ", "ς", "x");
- assertStringTrim("UNICODE_CI", "ΣxΣ", "Σ", "x");
- assertStringTrimLeft("UNICODE_CI", "ςxς", "σ", "xς");
- assertStringTrimLeft("UNICODE_CI", "ςxς", "ς", "xς");
- assertStringTrimLeft("UNICODE_CI", "ςxς", "Σ", "xς");
- assertStringTrimLeft("UNICODE_CI", "σxσ", "σ", "xσ");
- assertStringTrimLeft("UNICODE_CI", "σxσ", "ς", "xσ");
- assertStringTrimLeft("UNICODE_CI", "σxσ", "Σ", "xσ");
- assertStringTrimLeft("UNICODE_CI", "ΣxΣ", "σ", "xΣ");
- assertStringTrimLeft("UNICODE_CI", "ΣxΣ", "ς", "xΣ");
- assertStringTrimLeft("UNICODE_CI", "ΣxΣ", "Σ", "xΣ");
assertStringTrimRight("UNICODE_CI", "ςxς", "σ", "ςx");
assertStringTrimRight("UNICODE_CI", "ςxς", "ς", "ςx");
assertStringTrimRight("UNICODE_CI", "ςxς", "Σ", "ςx");
@@ -1907,186 +3588,287 @@ public void testStringTrim() throws SparkException {
assertStringTrimRight("UNICODE_CI", "ΣxΣ", "σ", "Σx");
assertStringTrimRight("UNICODE_CI", "ΣxΣ", "ς", "Σx");
assertStringTrimRight("UNICODE_CI", "ΣxΣ", "Σ", "Σx");
-
- // Unicode normalization - UTF8_BINARY.
- assertStringTrim("UTF8_BINARY", "åβγδa\u030A", "å", "βγδa\u030A");
- assertStringTrimLeft("UTF8_BINARY", "åβγδa\u030A", "å", "βγδa\u030A");
+ // Unicode normalization.
assertStringTrimRight("UTF8_BINARY", "åβγδa\u030A", "å", "åβγδa\u030A");
- // Unicode normalization - UTF8_LCASE.
- assertStringTrim("UTF8_LCASE", "åβγδa\u030A", "Å", "βγδa\u030A");
- assertStringTrimLeft("UTF8_LCASE", "åβγδa\u030A", "Å", "βγδa\u030A");
assertStringTrimRight("UTF8_LCASE", "åβγδa\u030A", "Å", "åβγδa\u030A");
- // Unicode normalization - UNICODE.
- assertStringTrim("UNICODE", "åβγδa\u030A", "å", "βγδ");
- assertStringTrimLeft("UNICODE", "åβγδa\u030A", "å", "βγδa\u030A");
assertStringTrimRight("UNICODE", "åβγδa\u030A", "å", "åβγδ");
- // Unicode normalization - UNICODE_CI.
- assertStringTrim("UNICODE_CI", "åβγδa\u030A", "Å", "βγδ");
- assertStringTrimLeft("UNICODE_CI", "åβγδa\u030A", "Å", "βγδa\u030A");
assertStringTrimRight("UNICODE_CI", "åβγδa\u030A", "Å", "åβγδ");
+ // Surrogate pairs.
+ assertStringTrimRight("UTF8_BINARY", "a🙃b🙃c", "🙃", "a🙃b🙃c");
+ assertStringTrimRight("UTF8_LCASE", "a🙃b🙃c", "🙃", "a🙃b🙃c");
+ assertStringTrimRight("UNICODE", "a🙃b🙃c", "🙃", "a🙃b🙃c");
+ assertStringTrimRight("UNICODE_CI", "a🙃b🙃c", "🙃", "a🙃b🙃c");
+ assertStringTrimRight("UTF8_BINARY", "a🙃b🙃c", "c", "a🙃b🙃");
+ assertStringTrimRight("UTF8_LCASE", "a🙃b🙃c", "c", "a🙃b🙃");
+ assertStringTrimRight("UNICODE", "a🙃b🙃c", "c", "a🙃b🙃");
+ assertStringTrimRight("UNICODE_CI", "a🙃b🙃c", "c", "a🙃b🙃");
+ assertStringTrimRight("UTF8_BINARY", "a🙃b🙃c", "c🙃", "a🙃b");
+ assertStringTrimRight("UTF8_LCASE", "a🙃b🙃c", "c🙃", "a🙃b");
+ assertStringTrimRight("UNICODE", "a🙃b🙃c", "c🙃", "a🙃b");
+ assertStringTrimRight("UNICODE_CI", "a🙃b🙃c", "c🙃", "a🙃b");
+ assertStringTrimRight("UTF8_BINARY", "a🙃b🙃c", "c🙃b", "a");
+ assertStringTrimRight("UTF8_LCASE", "a🙃b🙃c", "c🙃b", "a");
+ assertStringTrimRight("UNICODE", "a🙃b🙃c", "c🙃b", "a");
+ assertStringTrimRight("UNICODE_CI", "a🙃b🙃c", "c🙃b", "a");
+ assertStringTrimRight("UTF8_BINARY", "a🙃b🙃c", "abc🙃", "");
+ assertStringTrimRight("UTF8_LCASE", "a🙃b🙃c", "abc🙃", "");
+ assertStringTrimRight("UNICODE", "a🙃b🙃c", "abc🙃", "");
+ assertStringTrimRight("UNICODE_CI", "a🙃b🙃c", "abc🙃", "");
+ assertStringTrimRight("UTF8_BINARY", "😀😆😃😄", "😆😃", "😀😆😃😄");
+ assertStringTrimRight("UTF8_LCASE", "😀😆😃😄", "😆😃", "😀😆😃😄");
+ assertStringTrimRight("UNICODE", "😀😆😃😄", "😆😃", "😀😆😃😄");
+ assertStringTrimRight("UNICODE_CI", "😀😆😃😄", "😆😃", "😀😆😃😄");
+ assertStringTrimRight("UTF8_BINARY", "😀😆😃😄", "😃😄", "😀😆");
+ assertStringTrimRight("UTF8_LCASE", "😀😆😃😄", "😃😄", "😀😆");
+ assertStringTrimRight("UNICODE", "😀😆😃😄", "😃😄", "😀😆");
+ assertStringTrimRight("UNICODE_CI", "😀😆😃😄", "😃😄", "😀😆");
+ assertStringTrimRight("UTF8_BINARY", "😀😆😃😄", "😀😆😃😄", "");
+ assertStringTrimRight("UTF8_LCASE", "😀😆😃😄", "😀😆😃😄", "");
+ assertStringTrimRight("UNICODE", "😀😆😃😄", "😀😆😃😄", "");
+ assertStringTrimRight("UNICODE_CI", "😀😆😃😄", "😀😆😃😄", "");
+ assertStringTrimRight("UTF8_BINARY", "𐐅", "𐐅", "");
+ assertStringTrimRight("UTF8_LCASE", "𐐅", "𐐅", "");
+ assertStringTrimRight("UNICODE", "𐐅", "𐐅", "");
+ assertStringTrimRight("UNICODE_CI", "𐐅", "𐐅", "");
+ assertStringTrimRight("UTF8_BINARY", "𐐅", "𐐭", "𐐅");
+ assertStringTrimRight("UTF8_LCASE", "𐐅", "𐐭", "");
+ assertStringTrimRight("UNICODE", "𐐅", "𐐭", "𐐅");
+ assertStringTrimRight("UNICODE_CI", "𐐅", "𐐭", "");
+ assertStringTrimRight("UTF8_BINARY", "𝔸", "𝔸", "");
+ assertStringTrimRight("UTF8_LCASE", "𝔸", "𝔸", "");
+ assertStringTrimRight("UNICODE", "𝔸", "𝔸", "");
+ assertStringTrimRight("UNICODE_CI", "𝔸", "𝔸", "");
+ assertStringTrimRight("UTF8_BINARY", "𝔸", "A", "𝔸");
+ assertStringTrimRight("UTF8_LCASE", "𝔸", "A", "𝔸");
+ assertStringTrimRight("UNICODE", "𝔸", "A", "𝔸");
+ assertStringTrimRight("UNICODE_CI", "𝔸", "A", "");
+ assertStringTrimRight("UTF8_BINARY", "𝔸", "a", "𝔸");
+ assertStringTrimRight("UTF8_LCASE", "𝔸", "a", "𝔸");
+ assertStringTrimRight("UNICODE", "𝔸", "a", "𝔸");
+ assertStringTrimRight("UNICODE_CI", "𝔸", "a", "");
}
- private void assertStringTranslate(
- String inputString,
- String matchingString,
- String replaceString,
- String collationName,
- String expectedResultString) throws SparkException {
+ /**
+ * Verify the behaviour of the `StringTranslate` collation support class.
+ */
+
+ private void assertStringTranslate(String inputString, String matchingString,
+ String replaceString, String collationName, String expected) throws SparkException {
int collationId = CollationFactory.collationNameToId(collationName);
Map dict = buildDict(matchingString, replaceString);
UTF8String source = UTF8String.fromString(inputString);
UTF8String result = CollationSupport.StringTranslate.exec(source, dict, collationId);
- assertEquals(expectedResultString, result.toString());
+ assertEquals(UTF8String.fromString(expected), result);
}
@Test
public void testStringTranslate() throws SparkException {
- // Basic tests - UTF8_BINARY.
+ // Empty strings.
+ assertStringTranslate("", "", "", "UTF8_BINARY", "");
+ assertStringTranslate("", "", "", "UTF8_LCASE", "");
+ assertStringTranslate("", "", "", "UNICODE", "");
+ assertStringTranslate("", "", "", "UNICODE_CI", "");
+ assertStringTranslate("abc", "", "", "UTF8_BINARY", "abc");
+ assertStringTranslate("abc", "", "", "UTF8_LCASE", "abc");
+ assertStringTranslate("abc", "", "", "UNICODE", "abc");
+ assertStringTranslate("abc", "", "", "UNICODE_CI", "abc");
+ assertStringTranslate("", "b", "", "UTF8_BINARY", "");
+ assertStringTranslate("", "b", "", "UTF8_LCASE", "");
+ assertStringTranslate("", "b", "", "UNICODE", "");
+ assertStringTranslate("", "b", "", "UNICODE_CI", "");
+ assertStringTranslate("", "", "x", "UTF8_BINARY", "");
+ assertStringTranslate("", "", "x", "UTF8_LCASE", "");
+ assertStringTranslate("", "", "x", "UNICODE", "");
+ assertStringTranslate("", "", "x", "UNICODE_CI", "");
+ assertStringTranslate("abc", "b", "", "UTF8_BINARY", "ac");
+ assertStringTranslate("abc", "b", "", "UTF8_LCASE", "ac");
+ assertStringTranslate("abc", "b", "", "UNICODE", "ac");
+ assertStringTranslate("abc", "b", "", "UNICODE_CI", "ac");
+ assertStringTranslate("abc", "", "x", "UTF8_BINARY", "abc");
+ assertStringTranslate("abc", "", "x", "UTF8_LCASE", "abc");
+ assertStringTranslate("abc", "", "x", "UNICODE", "abc");
+ assertStringTranslate("abc", "", "x", "UNICODE_CI", "abc");
+ assertStringTranslate("", "b", "x", "UTF8_BINARY", "");
+ assertStringTranslate("", "b", "x", "UTF8_LCASE", "");
+ assertStringTranslate("", "b", "x", "UNICODE", "");
+ assertStringTranslate("", "b", "x", "UNICODE_CI", "");
+ // Basic tests.
+ assertStringTranslate("abc", "b", "x", "UTF8_BINARY", "axc");
+ assertStringTranslate("abc", "b", "x", "UTF8_LCASE", "axc");
+ assertStringTranslate("abc", "b", "x", "UNICODE", "axc");
+ assertStringTranslate("abc", "b", "x", "UNICODE_CI", "axc");
assertStringTranslate("Translate", "Rnlt", "12", "UTF8_BINARY", "Tra2sae");
- assertStringTranslate("Translate", "Rn", "1234", "UTF8_BINARY", "Tra2slate");
- assertStringTranslate("Translate", "Rnlt", "1234", "UTF8_BINARY", "Tra2s3a4e");
- assertStringTranslate("TRanslate", "rnlt", "XxXx", "UTF8_BINARY", "TRaxsXaxe");
- assertStringTranslate("TRanslater", "Rrnlt", "xXxXx", "UTF8_BINARY", "TxaxsXaxeX");
- assertStringTranslate("TRanslater", "Rrnlt", "XxxXx", "UTF8_BINARY", "TXaxsXaxex");
- assertStringTranslate("test大千世界X大千世界", "界x", "AB", "UTF8_BINARY", "test大千世AX大千世A");
- assertStringTranslate("大千世界test大千世界", "TEST", "abcd", "UTF8_BINARY", "大千世界test大千世界");
- assertStringTranslate("Test大千世界大千世界", "tT", "oO", "UTF8_BINARY", "Oeso大千世界大千世界");
- assertStringTranslate("大千世界大千世界tesT", "Tt", "Oo", "UTF8_BINARY", "大千世界大千世界oesO");
- assertStringTranslate("大千世界大千世界tesT", "大千", "世世", "UTF8_BINARY", "世世世界世世世界tesT");
- assertStringTranslate("Translate", "Rnlasdfjhgadt", "1234", "UTF8_BINARY", "Tr4234e");
- assertStringTranslate("Translate", "Rnlt", "123495834634", "UTF8_BINARY", "Tra2s3a4e");
- assertStringTranslate("abcdef", "abcde", "123", "UTF8_BINARY", "123f");
- // Basic tests - UTF8_LCASE.
assertStringTranslate("Translate", "Rnlt", "12", "UTF8_LCASE", "1a2sae");
- assertStringTranslate("Translate", "Rn", "1234", "UTF8_LCASE", "T1a2slate");
- assertStringTranslate("Translate", "Rnlt", "1234", "UTF8_LCASE", "41a2s3a4e");
- assertStringTranslate("TRanslate", "rnlt", "XxXx", "UTF8_LCASE", "xXaxsXaxe");
- assertStringTranslate("TRanslater", "Rrnlt", "xXxXx", "UTF8_LCASE", "xxaxsXaxex");
- assertStringTranslate("TRanslater", "Rrnlt", "XxxXx", "UTF8_LCASE", "xXaxsXaxeX");
- assertStringTranslate("test大千世界X大千世界", "界x", "AB", "UTF8_LCASE", "test大千世AB大千世A");
- assertStringTranslate("大千世界test大千世界", "TEST", "abcd", "UTF8_LCASE", "大千世界abca大千世界");
- assertStringTranslate("Test大千世界大千世界", "tT", "oO", "UTF8_LCASE", "oeso大千世界大千世界");
- assertStringTranslate("大千世界大千世界tesT", "Tt", "Oo", "UTF8_LCASE", "大千世界大千世界OesO");
- assertStringTranslate("大千世界大千世界tesT", "大千", "世世", "UTF8_LCASE", "世世世界世世世界tesT");
- assertStringTranslate("Translate", "Rnlasdfjhgadt", "1234", "UTF8_LCASE", "14234e");
- assertStringTranslate("Translate", "Rnlt", "123495834634", "UTF8_LCASE", "41a2s3a4e");
- assertStringTranslate("abcdef", "abcde", "123", "UTF8_LCASE", "123f");
- // Basic tests - UNICODE.
assertStringTranslate("Translate", "Rnlt", "12", "UNICODE", "Tra2sae");
- assertStringTranslate("Translate", "Rn", "1234", "UNICODE", "Tra2slate");
- assertStringTranslate("Translate", "Rnlt", "1234", "UNICODE", "Tra2s3a4e");
- assertStringTranslate("TRanslate", "rnlt", "XxXx", "UNICODE", "TRaxsXaxe");
- assertStringTranslate("TRanslater", "Rrnlt", "xXxXx", "UNICODE", "TxaxsXaxeX");
- assertStringTranslate("TRanslater", "Rrnlt", "XxxXx", "UNICODE", "TXaxsXaxex");
- assertStringTranslate("test大千世界X大千世界", "界x", "AB", "UNICODE", "test大千世AX大千世A");
- assertStringTranslate("大千世界test大千世界", "TEST", "abcd", "UNICODE", "大千世界test大千世界");
- assertStringTranslate("Test大千世界大千世界", "tT", "oO", "UNICODE", "Oeso大千世界大千世界");
- assertStringTranslate("大千世界大千世界tesT", "Tt", "Oo", "UNICODE", "大千世界大千世界oesO");
- assertStringTranslate("大千世界大千世界tesT", "大千", "世世", "UNICODE", "世世世界世世世界tesT");
- assertStringTranslate("Translate", "Rnlasdfjhgadt", "1234", "UNICODE", "Tr4234e");
- assertStringTranslate("Translate", "Rnlt", "123495834634", "UNICODE", "Tra2s3a4e");
- assertStringTranslate("abcdef", "abcde", "123", "UNICODE", "123f");
- // Basic tests - UNICODE_CI.
assertStringTranslate("Translate", "Rnlt", "12", "UNICODE_CI", "1a2sae");
+ assertStringTranslate("Translate", "Rn", "1234", "UTF8_BINARY", "Tra2slate");
+ assertStringTranslate("Translate", "Rn", "1234", "UTF8_LCASE", "T1a2slate");
+ assertStringTranslate("Translate", "Rn", "1234", "UNICODE", "Tra2slate");
assertStringTranslate("Translate", "Rn", "1234", "UNICODE_CI", "T1a2slate");
+ assertStringTranslate("Translate", "Rnlt", "1234", "UTF8_BINARY", "Tra2s3a4e");
+ assertStringTranslate("Translate", "Rnlt", "1234", "UTF8_LCASE", "41a2s3a4e");
+ assertStringTranslate("Translate", "Rnlt", "1234", "UNICODE", "Tra2s3a4e");
assertStringTranslate("Translate", "Rnlt", "1234", "UNICODE_CI", "41a2s3a4e");
+ assertStringTranslate("TRanslate", "rnlt", "XxXx", "UTF8_BINARY", "TRaxsXaxe");
+ assertStringTranslate("TRanslate", "rnlt", "XxXx", "UTF8_LCASE", "xXaxsXaxe");
+ assertStringTranslate("TRanslate", "rnlt", "XxXx", "UNICODE", "TRaxsXaxe");
assertStringTranslate("TRanslate", "rnlt", "XxXx", "UNICODE_CI", "xXaxsXaxe");
+ assertStringTranslate("TRanslater", "Rrnlt", "xXxXx", "UTF8_BINARY", "TxaxsXaxeX");
+ assertStringTranslate("TRanslater", "Rrnlt", "xXxXx", "UTF8_LCASE", "xxaxsXaxex");
+ assertStringTranslate("TRanslater", "Rrnlt", "xXxXx", "UNICODE", "TxaxsXaxeX");
assertStringTranslate("TRanslater", "Rrnlt", "xXxXx", "UNICODE_CI", "xxaxsXaxex");
+ assertStringTranslate("TRanslater", "Rrnlt", "XxxXx", "UTF8_BINARY", "TXaxsXaxex");
+ assertStringTranslate("TRanslater", "Rrnlt", "XxxXx", "UTF8_LCASE", "xXaxsXaxeX");
+ assertStringTranslate("TRanslater", "Rrnlt", "XxxXx", "UNICODE", "TXaxsXaxex");
assertStringTranslate("TRanslater", "Rrnlt", "XxxXx", "UNICODE_CI", "xXaxsXaxeX");
+ assertStringTranslate("test大千世界X大千世界", "界x", "AB", "UTF8_BINARY", "test大千世AX大千世A");
+ assertStringTranslate("test大千世界X大千世界", "界x", "AB", "UTF8_LCASE", "test大千世AB大千世A");
+ assertStringTranslate("test大千世界X大千世界", "界x", "AB", "UNICODE", "test大千世AX大千世A");
assertStringTranslate("test大千世界X大千世界", "界x", "AB", "UNICODE_CI", "test大千世AB大千世A");
+ assertStringTranslate("大千世界test大千世界", "TEST", "abcd", "UTF8_BINARY", "大千世界test大千世界");
+ assertStringTranslate("大千世界test大千世界", "TEST", "abcd", "UTF8_LCASE", "大千世界abca大千世界");
+ assertStringTranslate("大千世界test大千世界", "TEST", "abcd", "UNICODE", "大千世界test大千世界");
assertStringTranslate("大千世界test大千世界", "TEST", "abcd", "UNICODE_CI", "大千世界abca大千世界");
+ assertStringTranslate("Test大千世界大千世界", "tT", "oO", "UTF8_BINARY", "Oeso大千世界大千世界");
+ assertStringTranslate("Test大千世界大千世界", "tT", "oO", "UTF8_LCASE", "oeso大千世界大千世界");
+ assertStringTranslate("Test大千世界大千世界", "tT", "oO", "UNICODE", "Oeso大千世界大千世界");
assertStringTranslate("Test大千世界大千世界", "tT", "oO", "UNICODE_CI", "oeso大千世界大千世界");
+ assertStringTranslate("大千世界大千世界tesT", "Tt", "Oo", "UTF8_BINARY", "大千世界大千世界oesO");
+ assertStringTranslate("大千世界大千世界tesT", "Tt", "Oo", "UTF8_LCASE", "大千世界大千世界OesO");
+ assertStringTranslate("大千世界大千世界tesT", "Tt", "Oo", "UNICODE", "大千世界大千世界oesO");
assertStringTranslate("大千世界大千世界tesT", "Tt", "Oo", "UNICODE_CI", "大千世界大千世界OesO");
+ assertStringTranslate("大千世界大千世界tesT", "大千", "世世", "UTF8_BINARY", "世世世界世世世界tesT");
+ assertStringTranslate("大千世界大千世界tesT", "大千", "世世", "UTF8_LCASE", "世世世界世世世界tesT");
+ assertStringTranslate("大千世界大千世界tesT", "大千", "世世", "UNICODE", "世世世界世世世界tesT");
assertStringTranslate("大千世界大千世界tesT", "大千", "世世", "UNICODE_CI", "世世世界世世世界tesT");
+ assertStringTranslate("Translate", "Rnlasdfjhgadt", "1234", "UTF8_BINARY", "Tr4234e");
+ assertStringTranslate("Translate", "Rnlasdfjhgadt", "1234", "UTF8_LCASE", "14234e");
+ assertStringTranslate("Translate", "Rnlasdfjhgadt", "1234", "UNICODE", "Tr4234e");
assertStringTranslate("Translate", "Rnlasdfjhgadt", "1234", "UNICODE_CI", "14234e");
+ assertStringTranslate("Translate", "Rnlt", "123495834634", "UTF8_BINARY", "Tra2s3a4e");
+ assertStringTranslate("Translate", "Rnlt", "123495834634", "UTF8_LCASE", "41a2s3a4e");
+ assertStringTranslate("Translate", "Rnlt", "123495834634", "UNICODE", "Tra2s3a4e");
assertStringTranslate("Translate", "Rnlt", "123495834634", "UNICODE_CI", "41a2s3a4e");
+ assertStringTranslate("abcdef", "abcde", "123", "UTF8_BINARY", "123f");
+ assertStringTranslate("abcdef", "abcde", "123", "UTF8_LCASE", "123f");
+ assertStringTranslate("abcdef", "abcde", "123", "UNICODE", "123f");
assertStringTranslate("abcdef", "abcde", "123", "UNICODE_CI", "123f");
-
- // One-to-many case mapping - UTF8_BINARY.
+ assertStringTranslate("abcdëÈêf", "ÊèË", "123", "AF_CI", "abcd321f");
+ // One-to-many case mapping (e.g. Turkish dotted I).
assertStringTranslate("İ", "i\u0307", "xy", "UTF8_BINARY", "İ");
- assertStringTranslate("i\u0307", "İ", "xy", "UTF8_BINARY", "i\u0307");
- assertStringTranslate("i\u030A", "İ", "x", "UTF8_BINARY", "i\u030A");
- assertStringTranslate("i\u030A", "İi", "xy", "UTF8_BINARY", "y\u030A");
- assertStringTranslate("İi\u0307", "İi\u0307", "123", "UTF8_BINARY", "123");
- assertStringTranslate("İi\u0307", "İyz", "123", "UTF8_BINARY", "1i\u0307");
- assertStringTranslate("İi\u0307", "xi\u0307", "123", "UTF8_BINARY", "İ23");
- assertStringTranslate("a\u030Abcå", "a\u030Aå", "123", "UTF8_BINARY", "12bc3");
- assertStringTranslate("a\u030Abcå", "A\u030AÅ", "123", "UTF8_BINARY", "a2bcå");
- assertStringTranslate("a\u030AβφδI\u0307", "Iİaå", "1234", "UTF8_BINARY", "3\u030Aβφδ1\u0307");
- // One-to-many case mapping - UTF8_LCASE.
assertStringTranslate("İ", "i\u0307", "xy", "UTF8_LCASE", "İ");
- assertStringTranslate("i\u0307", "İ", "xy", "UTF8_LCASE", "x");
- assertStringTranslate("i\u030A", "İ", "x", "UTF8_LCASE", "i\u030A");
- assertStringTranslate("i\u030A", "İi", "xy", "UTF8_LCASE", "y\u030A");
- assertStringTranslate("İi\u0307", "İi\u0307", "123", "UTF8_LCASE", "11");
- assertStringTranslate("İi\u0307", "İyz", "123", "UTF8_LCASE", "11");
- assertStringTranslate("İi\u0307", "xi\u0307", "123", "UTF8_LCASE", "İ23");
- assertStringTranslate("a\u030Abcå", "a\u030Aå", "123", "UTF8_LCASE", "12bc3");
- assertStringTranslate("a\u030Abcå", "A\u030AÅ", "123", "UTF8_LCASE", "12bc3");
- assertStringTranslate("A\u030Aβφδi\u0307", "Iİaå", "1234", "UTF8_LCASE", "3\u030Aβφδ2");
- // One-to-many case mapping - UNICODE.
assertStringTranslate("İ", "i\u0307", "xy", "UNICODE", "İ");
- assertStringTranslate("i\u0307", "İ", "xy", "UNICODE", "i\u0307");
- assertStringTranslate("i\u030A", "İ", "x", "UNICODE", "i\u030A");
- assertStringTranslate("i\u030A", "İi", "xy", "UNICODE", "i\u030A");
- assertStringTranslate("İi\u0307", "İi\u0307", "123", "UNICODE", "1i\u0307");
- assertStringTranslate("İi\u0307", "İyz", "123", "UNICODE", "1i\u0307");
- assertStringTranslate("İi\u0307", "xi\u0307", "123", "UNICODE", "İi\u0307");
- assertStringTranslate("a\u030Abcå", "a\u030Aå", "123", "UNICODE", "3bc3");
- assertStringTranslate("a\u030Abcå", "A\u030AÅ", "123", "UNICODE", "a\u030Abcå");
- assertStringTranslate("a\u030AβφδI\u0307", "Iİaå", "1234", "UNICODE", "4βφδ2");
- // One-to-many case mapping - UNICODE_CI.
assertStringTranslate("İ", "i\u0307", "xy", "UNICODE_CI", "İ");
+ assertStringTranslate("i\u0307", "İ", "xy", "UTF8_BINARY", "i\u0307");
+ assertStringTranslate("i\u0307", "İ", "xy", "UTF8_LCASE", "x");
+ assertStringTranslate("i\u0307", "İ", "xy", "UNICODE", "i\u0307");
assertStringTranslate("i\u0307", "İ", "xy", "UNICODE_CI", "x");
+ assertStringTranslate("i\u030A", "İ", "x", "UTF8_BINARY", "i\u030A");
+ assertStringTranslate("i\u030A", "İ", "x", "UTF8_LCASE", "i\u030A");
+ assertStringTranslate("i\u030A", "İ", "x", "UNICODE", "i\u030A");
assertStringTranslate("i\u030A", "İ", "x", "UNICODE_CI", "i\u030A");
+ assertStringTranslate("i\u030A", "İi", "xy", "UTF8_BINARY", "y\u030A");
+ assertStringTranslate("i\u030A", "İi", "xy", "UTF8_LCASE", "y\u030A");
+ assertStringTranslate("i\u030A", "İi", "xy", "UNICODE", "i\u030A");
assertStringTranslate("i\u030A", "İi", "xy", "UNICODE_CI", "i\u030A");
+ assertStringTranslate("İi\u0307", "İi\u0307", "123", "UTF8_BINARY", "123");
+ assertStringTranslate("İi\u0307", "İi\u0307", "123", "UTF8_LCASE", "11");
+ assertStringTranslate("İi\u0307", "İi\u0307", "123", "UNICODE", "1i\u0307");
assertStringTranslate("İi\u0307", "İi\u0307", "123", "UNICODE_CI", "11");
+ assertStringTranslate("İi\u0307", "İyz", "123", "UTF8_BINARY", "1i\u0307");
+ assertStringTranslate("İi\u0307", "İyz", "123", "UTF8_LCASE", "11");
+ assertStringTranslate("İi\u0307", "İyz", "123", "UNICODE", "1i\u0307");
assertStringTranslate("İi\u0307", "İyz", "123", "UNICODE_CI", "11");
+ assertStringTranslate("İi\u0307", "xi\u0307", "123", "UTF8_BINARY", "İ23");
+ assertStringTranslate("İi\u0307", "xi\u0307", "123", "UTF8_LCASE", "İ23");
+ assertStringTranslate("İi\u0307", "xi\u0307", "123", "UNICODE", "İi\u0307");
assertStringTranslate("İi\u0307", "xi\u0307", "123", "UNICODE_CI", "İi\u0307");
+ assertStringTranslate("a\u030Abcå", "a\u030Aå", "123", "UTF8_BINARY", "12bc3");
+ assertStringTranslate("a\u030Abcå", "a\u030Aå", "123", "UTF8_LCASE", "12bc3");
+ assertStringTranslate("a\u030Abcå", "a\u030Aå", "123", "UNICODE", "3bc3");
assertStringTranslate("a\u030Abcå", "a\u030Aå", "123", "UNICODE_CI", "3bc3");
+ assertStringTranslate("a\u030Abcå", "A\u030AÅ", "123", "UTF8_BINARY", "a2bcå");
+ assertStringTranslate("a\u030Abcå", "A\u030AÅ", "123", "UTF8_LCASE", "12bc3");
+ assertStringTranslate("a\u030Abcå", "A\u030AÅ", "123", "UNICODE", "a\u030Abcå");
assertStringTranslate("a\u030Abcå", "A\u030AÅ", "123", "UNICODE_CI", "3bc3");
+ assertStringTranslate("a\u030AβφδI\u0307", "Iİaå", "1234", "UTF8_BINARY", "3\u030Aβφδ1\u0307");
+ assertStringTranslate("A\u030Aβφδi\u0307", "Iİaå", "1234", "UTF8_LCASE", "3\u030Aβφδ2");
+ assertStringTranslate("a\u030AβφδI\u0307", "Iİaå", "1234", "UNICODE", "4βφδ2");
assertStringTranslate("A\u030Aβφδi\u0307", "Iİaå", "1234", "UNICODE_CI", "4βφδ2");
-
- // Greek sigmas - UTF8_BINARY.
+ // Conditional case mapping (e.g. Greek sigmas).
assertStringTranslate("ΣΥΣΤΗΜΑΤΙΚΟΣ", "Συη", "σιι", "UTF8_BINARY", "σΥσΤΗΜΑΤΙΚΟσ");
- assertStringTranslate("ΣΥΣΤΗΜΑΤΙΚΟΣ", "συη", "σιι", "UTF8_BINARY", "ΣΥΣΤΗΜΑΤΙΚΟΣ");
- assertStringTranslate("ΣΥΣΤΗΜΑΤΙΚΟΣ", "ςυη", "σιι", "UTF8_BINARY", "ΣΥΣΤΗΜΑΤΙΚΟΣ");
- assertStringTranslate("ΣΥΣΤΗΜΑΤΙΚΟΣ", "συη", "ςιι", "UTF8_BINARY", "ΣΥΣΤΗΜΑΤΙΚΟΣ");
- assertStringTranslate("ΣΥΣΤΗΜΑΤΙΚΟΣ", "Συη", "ςιι", "UTF8_BINARY", "ςΥςΤΗΜΑΤΙΚΟς");
- assertStringTranslate("ΣΥΣΤΗΜΑΤΙΚΟΣ", "ςυη", "ςιι", "UTF8_BINARY", "ΣΥΣΤΗΜΑΤΙΚΟΣ");
- assertStringTranslate("συστηματικος", "Συη", "σιι", "UTF8_BINARY", "σιστιματικος");
- assertStringTranslate("συστηματικος", "συη", "σιι", "UTF8_BINARY", "σιστιματικος");
- assertStringTranslate("συστηματικος", "ςυη", "σιι", "UTF8_BINARY", "σιστιματικοσ");
- // Greek sigmas - UTF8_LCASE.
assertStringTranslate("ΣΥΣΤΗΜΑΤΙΚΟΣ", "Συη", "σιι", "UTF8_LCASE", "σισΤιΜΑΤΙΚΟσ");
- assertStringTranslate("ΣΥΣΤΗΜΑΤΙΚΟΣ", "συη", "σιι", "UTF8_LCASE", "σισΤιΜΑΤΙΚΟσ");
- assertStringTranslate("ΣΥΣΤΗΜΑΤΙΚΟΣ", "ςυη", "σιι", "UTF8_LCASE", "σισΤιΜΑΤΙΚΟσ");
- assertStringTranslate("ΣΥΣΤΗΜΑΤΙΚΟΣ", "συη", "ςιι", "UTF8_LCASE", "ςιςΤιΜΑΤΙΚΟς");
- assertStringTranslate("ΣΥΣΤΗΜΑΤΙΚΟΣ", "Συη", "ςιι", "UTF8_LCASE", "ςιςΤιΜΑΤΙΚΟς");
- assertStringTranslate("ΣΥΣΤΗΜΑΤΙΚΟΣ", "ςυη", "ςιι", "UTF8_LCASE", "ςιςΤιΜΑΤΙΚΟς");
- assertStringTranslate("συστηματικος", "Συη", "σιι", "UTF8_LCASE", "σιστιματικοσ");
- assertStringTranslate("συστηματικος", "συη", "σιι", "UTF8_LCASE", "σιστιματικοσ");
- assertStringTranslate("συστηματικος", "ςυη", "σιι", "UTF8_LCASE", "σιστιματικοσ");
- // Greek sigmas - UNICODE.
assertStringTranslate("ΣΥΣΤΗΜΑΤΙΚΟΣ", "Συη", "σιι", "UNICODE", "σΥσΤΗΜΑΤΙΚΟσ");
- assertStringTranslate("ΣΥΣΤΗΜΑΤΙΚΟΣ", "συη", "σιι", "UNICODE", "ΣΥΣΤΗΜΑΤΙΚΟΣ");
- assertStringTranslate("ΣΥΣΤΗΜΑΤΙΚΟΣ", "ςυη", "σιι", "UNICODE", "ΣΥΣΤΗΜΑΤΙΚΟΣ");
- assertStringTranslate("ΣΥΣΤΗΜΑΤΙΚΟΣ", "συη", "ςιι", "UNICODE", "ΣΥΣΤΗΜΑΤΙΚΟΣ");
- assertStringTranslate("ΣΥΣΤΗΜΑΤΙΚΟΣ", "Συη", "ςιι", "UNICODE", "ςΥςΤΗΜΑΤΙΚΟς");
- assertStringTranslate("ΣΥΣΤΗΜΑΤΙΚΟΣ", "ςυη", "ςιι", "UNICODE", "ΣΥΣΤΗΜΑΤΙΚΟΣ");
- assertStringTranslate("συστηματικος", "Συη", "σιι", "UNICODE", "σιστιματικος");
- assertStringTranslate("συστηματικος", "συη", "σιι", "UNICODE", "σιστιματικος");
- assertStringTranslate("συστηματικος", "ςυη", "σιι", "UNICODE", "σιστιματικοσ");
- // Greek sigmas - UNICODE_CI.
assertStringTranslate("ΣΥΣΤΗΜΑΤΙΚΟΣ", "Συη", "σιι", "UNICODE_CI", "σισΤιΜΑΤΙΚΟσ");
+ assertStringTranslate("ΣΥΣΤΗΜΑΤΙΚΟΣ", "συη", "σιι", "UTF8_BINARY", "ΣΥΣΤΗΜΑΤΙΚΟΣ");
+ assertStringTranslate("ΣΥΣΤΗΜΑΤΙΚΟΣ", "συη", "σιι", "UTF8_LCASE", "σισΤιΜΑΤΙΚΟσ");
+ assertStringTranslate("ΣΥΣΤΗΜΑΤΙΚΟΣ", "συη", "σιι", "UNICODE", "ΣΥΣΤΗΜΑΤΙΚΟΣ");
assertStringTranslate("ΣΥΣΤΗΜΑΤΙΚΟΣ", "συη", "σιι", "UNICODE_CI", "σισΤιΜΑΤΙΚΟσ");
+ assertStringTranslate("ΣΥΣΤΗΜΑΤΙΚΟΣ", "ςυη", "σιι", "UTF8_BINARY", "ΣΥΣΤΗΜΑΤΙΚΟΣ");
+ assertStringTranslate("ΣΥΣΤΗΜΑΤΙΚΟΣ", "ςυη", "σιι", "UTF8_LCASE", "σισΤιΜΑΤΙΚΟσ");
+ assertStringTranslate("ΣΥΣΤΗΜΑΤΙΚΟΣ", "ςυη", "σιι", "UNICODE", "ΣΥΣΤΗΜΑΤΙΚΟΣ");
assertStringTranslate("ΣΥΣΤΗΜΑΤΙΚΟΣ", "ςυη", "σιι", "UNICODE_CI", "σισΤιΜΑΤΙΚΟσ");
+ assertStringTranslate("ΣΥΣΤΗΜΑΤΙΚΟΣ", "συη", "ςιι", "UTF8_BINARY", "ΣΥΣΤΗΜΑΤΙΚΟΣ");
+ assertStringTranslate("ΣΥΣΤΗΜΑΤΙΚΟΣ", "συη", "ςιι", "UTF8_LCASE", "ςιςΤιΜΑΤΙΚΟς");
+ assertStringTranslate("ΣΥΣΤΗΜΑΤΙΚΟΣ", "συη", "ςιι", "UNICODE", "ΣΥΣΤΗΜΑΤΙΚΟΣ");
assertStringTranslate("ΣΥΣΤΗΜΑΤΙΚΟΣ", "συη", "ςιι", "UNICODE_CI", "ςιςΤιΜΑΤΙΚΟς");
+ assertStringTranslate("ΣΥΣΤΗΜΑΤΙΚΟΣ", "Συη", "ςιι", "UTF8_BINARY", "ςΥςΤΗΜΑΤΙΚΟς");
+ assertStringTranslate("ΣΥΣΤΗΜΑΤΙΚΟΣ", "Συη", "ςιι", "UTF8_LCASE", "ςιςΤιΜΑΤΙΚΟς");
+ assertStringTranslate("ΣΥΣΤΗΜΑΤΙΚΟΣ", "Συη", "ςιι", "UNICODE", "ςΥςΤΗΜΑΤΙΚΟς");
assertStringTranslate("ΣΥΣΤΗΜΑΤΙΚΟΣ", "Συη", "ςιι", "UNICODE_CI", "ςιςΤιΜΑΤΙΚΟς");
+ assertStringTranslate("ΣΥΣΤΗΜΑΤΙΚΟΣ", "ςυη", "ςιι", "UTF8_BINARY", "ΣΥΣΤΗΜΑΤΙΚΟΣ");
+ assertStringTranslate("ΣΥΣΤΗΜΑΤΙΚΟΣ", "ςυη", "ςιι", "UTF8_LCASE", "ςιςΤιΜΑΤΙΚΟς");
+ assertStringTranslate("ΣΥΣΤΗΜΑΤΙΚΟΣ", "ςυη", "ςιι", "UNICODE", "ΣΥΣΤΗΜΑΤΙΚΟΣ");
assertStringTranslate("ΣΥΣΤΗΜΑΤΙΚΟΣ", "ςυη", "ςιι", "UNICODE_CI", "ςιςΤιΜΑΤΙΚΟς");
+ assertStringTranslate("συστηματικος", "Συη", "σιι", "UTF8_BINARY", "σιστιματικος");
+ assertStringTranslate("συστηματικος", "Συη", "σιι", "UTF8_LCASE", "σιστιματικοσ");
+ assertStringTranslate("συστηματικος", "Συη", "σιι", "UNICODE", "σιστιματικος");
assertStringTranslate("συστηματικος", "Συη", "σιι", "UNICODE_CI", "σιστιματικοσ");
+ assertStringTranslate("συστηματικος", "συη", "σιι", "UTF8_BINARY", "σιστιματικος");
+ assertStringTranslate("συστηματικος", "συη", "σιι", "UTF8_LCASE", "σιστιματικοσ");
+ assertStringTranslate("συστηματικος", "συη", "σιι", "UNICODE", "σιστιματικος");
assertStringTranslate("συστηματικος", "συη", "σιι", "UNICODE_CI", "σιστιματικοσ");
+ assertStringTranslate("συστηματικος", "ςυη", "σιι", "UTF8_BINARY", "σιστιματικοσ");
+ assertStringTranslate("συστηματικος", "ςυη", "σιι", "UTF8_LCASE", "σιστιματικοσ");
+ assertStringTranslate("συστηματικος", "ςυη", "σιι", "UNICODE", "σιστιματικοσ");
assertStringTranslate("συστηματικος", "ςυη", "σιι", "UNICODE_CI", "σιστιματικοσ");
+ // Surrogate pairs.
+ assertStringTranslate("a🙃b🙃c", "a", "x", "UTF8_BINARY", "x🙃b🙃c");
+ assertStringTranslate("a🙃b🙃c", "a🙃", "xy", "UTF8_BINARY", "xybyc");
+ assertStringTranslate("a🙃b🙃c", "a🙃b", "xyz", "UTF8_BINARY", "xyzyc");
+ assertStringTranslate("a🙃b🙃c", "a🙃bc", "xyzw", "UTF8_BINARY", "xyzyw");
+ assertStringTranslate("😀😆😃😄", "😄😆", "😅😂", "UTF8_BINARY", "😀😂😃😅");
+ assertStringTranslate("😀😆😃😄", "😄😆", "😅😂", "UTF8_LCASE", "😀😂😃😅");
+ assertStringTranslate("😀😆😃😄", "😄😆", "😅😂", "UNICODE", "😀😂😃😅");
+ assertStringTranslate("😀😆😃😄", "😄😆", "😅😂", "UNICODE_CI", "😀😂😃😅");
+ assertStringTranslate("𐐅", "𐐅", "x", "UTF8_BINARY", "x");
+ assertStringTranslate("𐐅", "𐐅", "x", "UTF8_LCASE", "x");
+ assertStringTranslate("𐐅", "𐐅", "x", "UNICODE", "x");
+ assertStringTranslate("𐐅", "𐐅", "x", "UNICODE_CI", "x");
+ assertStringTranslate("𐐅", "𐐭", "x", "UTF8_BINARY", "𐐅");
+ assertStringTranslate("𐐅", "𐐭", "x", "UTF8_LCASE", "x");
+ assertStringTranslate("𐐅", "𐐭", "x", "UNICODE", "𐐅");
+ assertStringTranslate("𐐅", "𐐭", "x", "UNICODE_CI", "x");
+ assertStringTranslate("A", "A", "𐐅", "UTF8_BINARY", "𐐅");
+ assertStringTranslate("A", "A", "𐐅", "UTF8_LCASE", "𐐅");
+ assertStringTranslate("A", "A", "𐐅", "UNICODE", "𐐅");
+ assertStringTranslate("A", "A", "𐐅", "UNICODE_CI", "𐐅");
+ assertStringTranslate("A", "a", "𐐅", "UTF8_BINARY", "A");
+ assertStringTranslate("A", "a", "𐐅", "UTF8_LCASE", "𐐅");
+ assertStringTranslate("A", "a", "𐐅", "UNICODE", "A");
+ assertStringTranslate("A", "a", "𐐅", "UNICODE_CI", "𐐅");
+ assertStringTranslate("a", "A", "𐐅", "UTF8_BINARY", "a");
+ assertStringTranslate("a", "A", "𐐅", "UTF8_LCASE", "𐐅");
+ assertStringTranslate("a", "A", "𐐅", "UNICODE", "a");
+ assertStringTranslate("a", "A", "𐐅", "UNICODE_CI", "𐐅");
+ assertStringTranslate("𝔸", "𝔸", "x", "UTF8_BINARY", "x");
+ assertStringTranslate("𝔸", "𝔸", "x", "UTF8_LCASE", "x");
+ assertStringTranslate("𝔸", "𝔸", "x", "UNICODE", "x");
+ assertStringTranslate("𝔸", "𝔸", "x", "UNICODE_CI", "x");
+ assertStringTranslate("𝔸", "𝕒", "x", "UTF8_BINARY", "𝔸");
+ assertStringTranslate("𝔸", "𝕒", "x", "UTF8_LCASE", "𝔸");
+ assertStringTranslate("𝔸", "𝕒", "x", "UNICODE", "𝔸");
+ assertStringTranslate("𝔸", "𝕒", "x", "UNICODE_CI", "x");
}
private Map buildDict(String matching, String replace) {
diff --git a/common/unsafe/src/test/java/org/apache/spark/unsafe/types/UTF8StringSuite.java b/common/unsafe/src/test/java/org/apache/spark/unsafe/types/UTF8StringSuite.java
index 2428d40fe8016..c4a66fdffdd4d 100644
--- a/common/unsafe/src/test/java/org/apache/spark/unsafe/types/UTF8StringSuite.java
+++ b/common/unsafe/src/test/java/org/apache/spark/unsafe/types/UTF8StringSuite.java
@@ -26,6 +26,8 @@
import com.google.common.collect.ImmutableMap;
import org.apache.spark.unsafe.Platform;
+import org.apache.spark.unsafe.UTF8StringBuilder;
+
import org.junit.jupiter.api.Test;
import static org.apache.spark.unsafe.types.UTF8String.fromString;
@@ -1362,4 +1364,27 @@ public void toBinaryString() {
UTF8String.fromString("111111111111111111111111111111111111111111111111111111111111111"),
UTF8String.toBinaryString(Long.MAX_VALUE));
}
+
+ /**
+ * This tests whether appending a codepoint to a 'UTF8StringBuilder' correctly appends every
+ * single codepoint. We test it against an already existing 'StringBuilder.appendCodePoint' and
+ * 'UTF8String.fromString'. We skip testing the surrogate codepoints because at some point while
+ * converting the surrogate codepoint to 'UTF8String' (via 'StringBuilder' and 'UTF8String') we
+ * get an ill-formated byte sequence (probably because 'String' is in UTF-16 format, and a single
+ * surrogate codepoint is handled differently in UTF-16 than in UTF-8, so somewhere during those
+ * conversions some different behaviour happens).
+ */
+ @Test
+ public void testAppendCodepointToUTF8StringBuilder() {
+ int surrogateRangeLowerBound = 0xD800;
+ int surrogateRangeUpperBound = 0xDFFF;
+ for (int i = Character.MIN_CODE_POINT; i <= Character.MAX_CODE_POINT; ++i) {
+ if(surrogateRangeLowerBound <= i && i <= surrogateRangeUpperBound) continue;
+ UTF8StringBuilder usb = new UTF8StringBuilder();
+ usb.appendCodePoint(i);
+ StringBuilder sb = new StringBuilder();
+ sb.appendCodePoint(i);
+ assert(usb.build().equals(UTF8String.fromString(sb.toString())));
+ }
+ }
}
diff --git a/common/unsafe/src/test/scala/org/apache/spark/unsafe/types/CollationFactorySuite.scala b/common/unsafe/src/test/scala/org/apache/spark/unsafe/types/CollationFactorySuite.scala
index 23dae47f6ff2c..1f64547da7415 100644
--- a/common/unsafe/src/test/scala/org/apache/spark/unsafe/types/CollationFactorySuite.scala
+++ b/common/unsafe/src/test/scala/org/apache/spark/unsafe/types/CollationFactorySuite.scala
@@ -32,28 +32,35 @@ import org.apache.spark.sql.catalyst.util.CollationFactory._
import org.apache.spark.unsafe.types.UTF8String.{fromString => toUTF8}
class CollationFactorySuite extends AnyFunSuite with Matchers { // scalastyle:ignore funsuite
+
+ val currentIcuVersion: String = "76.1"
+
test("collationId stability") {
assert(INDETERMINATE_COLLATION_ID == -1)
assert(UTF8_BINARY_COLLATION_ID == 0)
val utf8Binary = fetchCollation(UTF8_BINARY_COLLATION_ID)
assert(utf8Binary.collationName == "UTF8_BINARY")
- assert(utf8Binary.supportsBinaryEquality)
+ assert(utf8Binary.isUtf8BinaryType)
+ assert(utf8Binary.version == currentIcuVersion)
assert(UTF8_LCASE_COLLATION_ID == 1)
- val utf8BinaryLcase = fetchCollation(UTF8_LCASE_COLLATION_ID)
- assert(utf8BinaryLcase.collationName == "UTF8_LCASE")
- assert(!utf8BinaryLcase.supportsBinaryEquality)
+ val utf8Lcase = fetchCollation(UTF8_LCASE_COLLATION_ID)
+ assert(utf8Lcase.collationName == "UTF8_LCASE")
+ assert(!utf8Lcase.isUtf8BinaryType)
+ assert(utf8Lcase.version == currentIcuVersion)
assert(UNICODE_COLLATION_ID == (1 << 29))
val unicode = fetchCollation(UNICODE_COLLATION_ID)
assert(unicode.collationName == "UNICODE")
- assert(!unicode.supportsBinaryEquality)
+ assert(!unicode.isUtf8BinaryType)
+ assert(unicode.version == currentIcuVersion)
assert(UNICODE_CI_COLLATION_ID == ((1 << 29) | (1 << 17)))
val unicodeCi = fetchCollation(UNICODE_CI_COLLATION_ID)
assert(unicodeCi.collationName == "UNICODE_CI")
- assert(!unicodeCi.supportsBinaryEquality)
+ assert(!unicodeCi.isUtf8BinaryType)
+ assert(unicodeCi.version == currentIcuVersion)
}
test("UTF8_BINARY and ICU root locale collation names") {
@@ -93,27 +100,33 @@ class CollationFactorySuite extends AnyFunSuite with Matchers { // scalastyle:ig
Seq(
("UTF8_BINARY_CS", "UTF8_BINARY"),
("UTF8_BINARY_AS", "UTF8_BINARY"), // this should be UNICODE_AS
- ("UTF8_BINARY_CS_AS","UTF8_BINARY"), // this should be UNICODE_CS_AS
- ("UTF8_BINARY_AS_CS","UTF8_BINARY"),
- ("UTF8_BINARY_CI","UTF8_BINARY"),
- ("UTF8_BINARY_AI","UTF8_BINARY"),
- ("UTF8_BINARY_CI_AI","UTF8_BINARY"),
- ("UTF8_BINARY_AI_CI","UTF8_BINARY"),
- ("UTF8_BS","UTF8_LCASE"),
- ("BINARY_UTF8","ar_SAU"),
- ("UTF8_BINARY_A","UTF8_BINARY"),
- ("UNICODE_X","UNICODE"),
- ("UNICODE_CI_X","UNICODE"),
- ("UNICODE_LCASE_X","UNICODE"),
- ("UTF8_UNICODE","UTF8_LCASE"),
- ("UTF8_BINARY_UNICODE","UTF8_BINARY"),
+ ("UTF8_BINARY_CS_AS", "UTF8_BINARY"), // this should be UNICODE_CS_AS
+ ("UTF8_BINARY_AS_CS", "UTF8_BINARY"),
+ ("UTF8_BINARY_CI", "UTF8_BINARY"),
+ ("UTF8_BINARY_AI", "UTF8_BINARY"),
+ ("UTF8_BINARY_CI_AI", "UTF8_BINARY"),
+ ("UTF8_BINARY_AI_CI", "UTF8_BINARY"),
+ ("UTF8_BINARY_AI_RTRIM", "UTF8_BINARY_RTRIM"),
+ ("UTF8_BINARY_CI_RTRIM", "UTF8_BINARY_RTRIM"),
+ ("UTF8_BINARY_AI_CI_RTRIM", "UTF8_BINARY_RTRIM"),
+ ("UTF8_BS", "UTF8_LCASE"),
+ ("BINARY_UTF8", "ar_SAU"),
+ ("UTF8_BINARY_A", "UTF8_BINARY"),
+ ("UNICODE_X", "UNICODE"),
+ ("UNICODE_CI_X", "UNICODE"),
+ ("UNICODE_LCASE_X", "UNICODE"),
+ ("UNICODE_RTRIM_LCASE_X", "UNICODE"),
+ ("UTF8_UNICODE", "UTF8_LCASE"),
+ ("UTF8_BINARY_UNICODE", "UTF8_BINARY"),
("CI_UNICODE", "UNICODE"),
("LCASE_UNICODE", "UNICODE"),
+ ("RTRIM_UNICODE", "UNICODE"),
("UNICODE_UNSPECIFIED", "UNICODE"),
("UNICODE_CI_UNSPECIFIED", "UNICODE"),
("UNICODE_UNSPECIFIED_CI_UNSPECIFIED", "UNICODE"),
("UNICODE_INDETERMINATE", "UNICODE"),
- ("UNICODE_CI_INDETERMINATE", "UNICODE")
+ ("UNICODE_CI_INDETERMINATE", "UNICODE"),
+ ("UNICODE_RTRIM_INDETERMINATE", "UNICODE")
).foreach{case (collationName, proposals) =>
checkCollationNameError(collationName, proposals)
}
@@ -127,6 +140,11 @@ class CollationFactorySuite extends AnyFunSuite with Matchers { // scalastyle:ig
CollationTestCase("UTF8_BINARY", "aaa", "AAA", false),
CollationTestCase("UTF8_BINARY", "aaa", "bbb", false),
CollationTestCase("UTF8_BINARY", "å", "a\u030A", false),
+ CollationTestCase("UTF8_BINARY_RTRIM", "aaa", "aaa", true),
+ CollationTestCase("UTF8_BINARY_RTRIM", "aaa", "aaa ", true),
+ CollationTestCase("UTF8_BINARY_RTRIM", "aaa ", "aaa ", true),
+ CollationTestCase("UTF8_BINARY_RTRIM", "aaa", " aaa ", false),
+ CollationTestCase("UTF8_BINARY_RTRIM", " ", " ", true),
CollationTestCase("UTF8_LCASE", "aaa", "aaa", true),
CollationTestCase("UTF8_LCASE", "aaa", "AAA", true),
CollationTestCase("UTF8_LCASE", "aaa", "AaA", true),
@@ -134,15 +152,36 @@ class CollationFactorySuite extends AnyFunSuite with Matchers { // scalastyle:ig
CollationTestCase("UTF8_LCASE", "aaa", "aa", false),
CollationTestCase("UTF8_LCASE", "aaa", "bbb", false),
CollationTestCase("UTF8_LCASE", "å", "a\u030A", false),
+ CollationTestCase("UTF8_LCASE_RTRIM", "aaa", "AaA", true),
+ CollationTestCase("UTF8_LCASE_RTRIM", "aaa", "AaA ", true),
+ CollationTestCase("UTF8_LCASE_RTRIM", "aaa ", "AaA ", true),
+ CollationTestCase("UTF8_LCASE_RTRIM", "aaa", " AaA ", false),
+ CollationTestCase("UTF8_LCASE_RTRIM", " ", " ", true),
CollationTestCase("UNICODE", "aaa", "aaa", true),
CollationTestCase("UNICODE", "aaa", "AAA", false),
CollationTestCase("UNICODE", "aaa", "bbb", false),
CollationTestCase("UNICODE", "å", "a\u030A", true),
+ CollationTestCase("UNICODE_RTRIM", "aaa", "aaa", true),
+ CollationTestCase("UNICODE_RTRIM", "aaa", "aaa ", true),
+ CollationTestCase("UNICODE_RTRIM", "aaa ", "aaa ", true),
+ CollationTestCase("UNICODE_RTRIM", "aaa", " aaa ", false),
+ CollationTestCase("UNICODE_RTRIM", " ", " ", true),
CollationTestCase("UNICODE_CI", "aaa", "aaa", true),
CollationTestCase("UNICODE_CI", "aaa", "AAA", true),
CollationTestCase("UNICODE_CI", "aaa", "bbb", false),
CollationTestCase("UNICODE_CI", "å", "a\u030A", true),
- CollationTestCase("UNICODE_CI", "Å", "a\u030A", true)
+ CollationTestCase("UNICODE_CI", "Å", "a\u030A", true),
+ CollationTestCase("UNICODE_CI_RTRIM", "aaa", "AaA", true),
+ CollationTestCase("UNICODE_CI_RTRIM", "aaa", "AaA ", true),
+ CollationTestCase("UNICODE_CI_RTRIM", "aaa ", "AaA ", true),
+ CollationTestCase("UNICODE_CI_RTRIM", "aaa", " AaA ", false),
+ CollationTestCase("UNICODE_RTRIM", " ", " ", true),
+ CollationTestCase("SR_CI", "cČć", "CčĆ", true),
+ CollationTestCase("SR_CI", "cCc", "CčĆ", false),
+ CollationTestCase("SR_CI_AI", "cCc", "CčĆ", true),
+ CollationTestCase("sr_Cyrl_CI", "цЧћ", "ЦчЋ", true),
+ CollationTestCase("sr_Cyrl_CI", "цЦц", "ЦчЋ", false),
+ CollationTestCase("sr_Cyrl_CI_AI", "цЦц", "ЦчЋ", false)
)
checks.foreach(testCase => {
@@ -162,19 +201,50 @@ class CollationFactorySuite extends AnyFunSuite with Matchers { // scalastyle:ig
CollationTestCase("UTF8_BINARY", "aaa", "AAA", 1),
CollationTestCase("UTF8_BINARY", "aaa", "bbb", -1),
CollationTestCase("UTF8_BINARY", "aaa", "BBB", 1),
+ CollationTestCase("UTF8_BINARY_RTRIM", "aaa ", "aaa", 0),
+ CollationTestCase("UTF8_BINARY_RTRIM", "aaa ", "aaa ", 0),
+ CollationTestCase("UTF8_BINARY_RTRIM", "aaa ", "bbb", -1),
+ CollationTestCase("UTF8_BINARY_RTRIM", "aaa ", "bbb ", -1),
+ CollationTestCase("UTF8_BINARY_RTRIM", "aaa", "BBB" , 1),
+ CollationTestCase("UTF8_BINARY_RTRIM", "aaa ", "BBB " , 1),
+ CollationTestCase("UTF8_BINARY_RTRIM", " ", " " , 0),
CollationTestCase("UTF8_LCASE", "aaa", "aaa", 0),
CollationTestCase("UTF8_LCASE", "aaa", "AAA", 0),
CollationTestCase("UTF8_LCASE", "aaa", "AaA", 0),
CollationTestCase("UTF8_LCASE", "aaa", "AaA", 0),
CollationTestCase("UTF8_LCASE", "aaa", "aa", 1),
CollationTestCase("UTF8_LCASE", "aaa", "bbb", -1),
+ CollationTestCase("UTF8_LCASE_RTRIM", "aaa ", "AAA", 0),
+ CollationTestCase("UTF8_LCASE_RTRIM", "aaa ", "AAA ", 0),
+ CollationTestCase("UTF8_LCASE_RTRIM", "aaa", "bbb ", -1),
+ CollationTestCase("UTF8_LCASE_RTRIM", "aaa ", "bbb ", -1),
+ CollationTestCase("UTF8_LCASE_RTRIM", "aaa ", "aa", 1),
+ CollationTestCase("UTF8_LCASE_RTRIM", "aaa ", "aa ", 1),
+ CollationTestCase("UTF8_LCASE_RTRIM", " ", " ", 0),
CollationTestCase("UNICODE", "aaa", "aaa", 0),
CollationTestCase("UNICODE", "aaa", "AAA", -1),
CollationTestCase("UNICODE", "aaa", "bbb", -1),
CollationTestCase("UNICODE", "aaa", "BBB", -1),
+ CollationTestCase("UNICODE_RTRIM", "aaa ", "aaa", 0),
+ CollationTestCase("UNICODE_RTRIM", "aaa ", "aaa ", 0),
+ CollationTestCase("UNICODE_RTRIM", "aaa ", "bbb", -1),
+ CollationTestCase("UNICODE_RTRIM", "aaa ", "bbb ", -1),
+ CollationTestCase("UNICODE_RTRIM", "aaa", "BBB" , -1),
+ CollationTestCase("UNICODE_RTRIM", "aaa ", "BBB " , -1),
+ CollationTestCase("UNICODE_RTRIM", " ", " ", 0),
CollationTestCase("UNICODE_CI", "aaa", "aaa", 0),
CollationTestCase("UNICODE_CI", "aaa", "AAA", 0),
- CollationTestCase("UNICODE_CI", "aaa", "bbb", -1))
+ CollationTestCase("UNICODE_CI", "aaa", "bbb", -1),
+ CollationTestCase("UNICODE_CI_RTRIM", "aaa ", "AAA", 0),
+ CollationTestCase("UNICODE_CI_RTRIM", "aaa ", "AAA ", 0),
+ CollationTestCase("UNICODE_CI_RTRIM", "aaa", "bbb ", -1),
+ CollationTestCase("UNICODE_CI_RTRIM", "aaa ", "bbb ", -1),
+ CollationTestCase("UNICODE_CI_RTRIM", "aaa ", "aa", 1),
+ CollationTestCase("UNICODE_CI_RTRIM", "aaa ", "aa ", 1),
+ CollationTestCase("UNICODE_CI_RTRIM", " ", " ", 0),
+ CollationTestCase("SR_CI_AI", "cČć", "ČćC", 0),
+ CollationTestCase("SR_CI", "cČć", "ČćC", -1)
+ )
checks.foreach(testCase => {
val collation = fetchCollation(testCase.collationName)
@@ -192,7 +262,10 @@ class CollationFactorySuite extends AnyFunSuite with Matchers { // scalastyle:ig
CollationTestCase("UNICODE_CI", "abcde", "abcde", 5),
CollationTestCase("UNICODE_CI", "abcde", "ABCDE", 5),
CollationTestCase("UNICODE_CI", "abcde", "fgh", 0),
- CollationTestCase("UNICODE_CI", "abcde", "FGH", 0)
+ CollationTestCase("UNICODE_CI", "abcde", "FGH", 0),
+ CollationTestCase("SR_CI_AI", "abcčċ", "CCC", 3),
+ CollationTestCase("SR_CI", "abcčċ", "C", 1),
+ CollationTestCase("SR", "abcčċ", "CCC", 0)
)
checks.foreach(testCase => {
@@ -229,7 +302,9 @@ class CollationFactorySuite extends AnyFunSuite with Matchers { // scalastyle:ig
"UNICODE_CI",
"UNICODE_AI",
"UNICODE_CI_AI",
- "UNICODE_AI_CI"
+ "UNICODE_AI_CI",
+ "DE_CI_AI",
+ "MT_CI"
).foreach(collationId => {
val col1 = fetchCollation(collationId)
val col2 = fetchCollation(collationId)
@@ -303,15 +378,23 @@ class CollationFactorySuite extends AnyFunSuite with Matchers { // scalastyle:ig
("CI_en", "ceb"),
("USA_CI_en", "UNICODE"),
("en_CI_USA", "en_USA"),
+ ("en_RTRIM_USA", "en_USA"),
("CI_sr_Cyrl_SRB", "sr_Cyrl_SRB"),
+ ("RTRIM_sr_Cyrl_SRB", "sr_Cyrl_SRB"),
("sr_CI_Cyrl_SRB", "sr_Cyrl_SRB"),
+ ("sr_RTRIM_Cyrl_SRB", "sr_Cyrl_SRB"),
("sr_Cyrl_CI_SRB", "sr_Cyrl_SRB"),
+ ("sr_Cyrl_RTRIM_SRB", "sr_Cyrl_SRB"),
("CI_Cyrl_sr", "sr_Cyrl_SRB"),
+ ("RTRIM_Cyrl_sr", "sr_Cyrl_SRB"),
("Cyrl_CI_sr", "he_ISR"),
("Cyrl_CI_sr_SRB", "sr_Cyrl_SRB"),
+ ("Cyrl_RTRIM_sr_SRB", "sr_Cyrl_SRB"),
("Cyrl_sr_CI_SRB", "sr_Cyrl_SRB"),
+ ("Cyrl_sr_RTRIM_SRB", "sr_Cyrl_SRB"),
// no locale specified
("_CI_AI", "af_CI_AI, am_CI_AI, ar_CI_AI"),
+ ("_CI_AI_RTRIM", "af_CI_AI_RTRIM, am_CI_AI_RTRIM, ar_CI_AI_RTRIM"),
("", "af, am, ar")
).foreach { case (collationName, proposals) =>
checkCollationNameError(collationName, proposals)
@@ -369,9 +452,9 @@ class CollationFactorySuite extends AnyFunSuite with Matchers { // scalastyle:ig
1 << 15, // UTF8_BINARY mandatory zero bit 15 breach.
1 << 16, // UTF8_BINARY mandatory zero bit 16 breach.
1 << 17, // UTF8_BINARY mandatory zero bit 17 breach.
- 1 << 18, // UTF8_BINARY mandatory zero bit 18 breach.
1 << 19, // UTF8_BINARY mandatory zero bit 19 breach.
1 << 20, // UTF8_BINARY mandatory zero bit 20 breach.
+ 1 << 21, // UTF8_BINARY mandatory zero bit 21 breach.
1 << 23, // UTF8_BINARY mandatory zero bit 23 breach.
1 << 24, // UTF8_BINARY mandatory zero bit 24 breach.
1 << 25, // UTF8_BINARY mandatory zero bit 25 breach.
@@ -382,7 +465,6 @@ class CollationFactorySuite extends AnyFunSuite with Matchers { // scalastyle:ig
(1 << 29) | (1 << 13), // ICU mandatory zero bit 13 breach.
(1 << 29) | (1 << 14), // ICU mandatory zero bit 14 breach.
(1 << 29) | (1 << 15), // ICU mandatory zero bit 15 breach.
- (1 << 29) | (1 << 18), // ICU mandatory zero bit 18 breach.
(1 << 29) | (1 << 19), // ICU mandatory zero bit 19 breach.
(1 << 29) | (1 << 20), // ICU mandatory zero bit 20 breach.
(1 << 29) | (1 << 21), // ICU mandatory zero bit 21 breach.
@@ -408,6 +490,7 @@ class CollationFactorySuite extends AnyFunSuite with Matchers { // scalastyle:ig
("UNICODE_CI_CI", "UNICODE_CI"),
("UNICODE_CI_CS", "UNICODE_CS"),
("UNICODE_CS_CI", "UNICODE_CS"),
+ ("UNICODE_RTRIM_RTRIM", "UNICODE_RTRIM"),
("UNICODE_AS_AS", "UNICODE_AS"),
("UNICODE_AI_AI", "UNICODE_AI"),
("UNICODE_AS_AI", "UNICODE_AS"),
@@ -417,6 +500,7 @@ class CollationFactorySuite extends AnyFunSuite with Matchers { // scalastyle:ig
("UNICODE_CS_AS_CI_AI", "UNICODE_CS_AS"),
("UNICODE__CS__AS", "UNICODE_AS"),
("UNICODE-CS-AS", "UNICODE"),
+ ("UNICODE__CS__RTRIM", "UNICODE_RTRIM"),
("UNICODECSAS", "UNICODE"),
("_CS_AS_UNICODE", "UNICODE")
).foreach { case (collationName, proposals) =>
@@ -457,7 +541,7 @@ class CollationFactorySuite extends AnyFunSuite with Matchers { // scalastyle:ig
val e = intercept[SparkException] {
fetchCollation(collationName)
}
- assert(e.getErrorClass === "COLLATION_INVALID_NAME")
+ assert(e.getCondition === "COLLATION_INVALID_NAME")
assert(e.getMessageParameters.asScala === Map(
"collationName" -> collationName, "proposals" -> proposals))
}
diff --git a/common/utils/src/main/java/org/apache/spark/SparkThrowable.java b/common/utils/src/main/java/org/apache/spark/SparkThrowable.java
index e1235b2982ba0..39808f58b08ae 100644
--- a/common/utils/src/main/java/org/apache/spark/SparkThrowable.java
+++ b/common/utils/src/main/java/org/apache/spark/SparkThrowable.java
@@ -35,19 +35,29 @@
*/
@Evolving
public interface SparkThrowable {
- // Succinct, human-readable, unique, and consistent representation of the error category
- // If null, error class is not set
- String getErrorClass();
+ /**
+ * Succinct, human-readable, unique, and consistent representation of the error condition.
+ * If null, error condition is not set.
+ */
+ String getCondition();
+
+ /**
+ * Succinct, human-readable, unique, and consistent representation of the error category.
+ * If null, error class is not set.
+ * @deprecated Use {@link #getCondition()} instead.
+ */
+ @Deprecated
+ default String getErrorClass() { return getCondition(); }
// Portable error identifier across SQL engines
// If null, error class or SQLSTATE is not set
default String getSqlState() {
- return SparkThrowableHelper.getSqlState(this.getErrorClass());
+ return SparkThrowableHelper.getSqlState(this.getCondition());
}
// True if this error is an internal error.
default boolean isInternalError() {
- return SparkThrowableHelper.isInternalError(this.getErrorClass());
+ return SparkThrowableHelper.isInternalError(this.getCondition());
}
default Map getMessageParameters() {
diff --git a/common/utils/src/main/scala/org/apache/spark/unsafe/array/ByteArrayUtils.java b/common/utils/src/main/java/org/apache/spark/unsafe/array/ByteArrayUtils.java
similarity index 100%
rename from common/utils/src/main/scala/org/apache/spark/unsafe/array/ByteArrayUtils.java
rename to common/utils/src/main/java/org/apache/spark/unsafe/array/ByteArrayUtils.java
diff --git a/common/utils/src/main/resources/error/error-conditions.json b/common/utils/src/main/resources/error/error-conditions.json
index d8edc89ba83ea..77437f6c56179 100644
--- a/common/utils/src/main/resources/error/error-conditions.json
+++ b/common/utils/src/main/resources/error/error-conditions.json
@@ -1,4 +1,10 @@
{
+ "ADD_DEFAULT_UNSUPPORTED" : {
+ "message" : [
+ "Failed to execute command because DEFAULT values are not supported when adding new columns to previously existing target data source with table provider: \"\"."
+ ],
+ "sqlState" : "42623"
+ },
"AGGREGATE_FUNCTION_WITH_NONDETERMINISTIC_EXPRESSION" : {
"message" : [
"Non-deterministic expression should not appear in the arguments of an aggregate function."
@@ -121,10 +127,16 @@
},
"BINARY_ARITHMETIC_OVERFLOW" : {
"message" : [
- " caused overflow."
+ " caused overflow. Use to ignore overflow problem and return NULL."
],
"sqlState" : "22003"
},
+ "BOOLEAN_STATEMENT_WITH_EMPTY_ROW" : {
+ "message" : [
+ "Boolean statement is invalid. Expected single row with a value of the BOOLEAN type, but got an empty row."
+ ],
+ "sqlState" : "21000"
+ },
"CALL_ON_STREAMING_DATASET_UNSUPPORTED" : {
"message" : [
"The method can not be called on streaming Dataset/DataFrame."
@@ -256,6 +268,26 @@
"Error reading streaming state file of does not exist. If the stream job is restarted with a new or updated state operation, please create a new checkpoint location or clear the existing checkpoint location."
]
},
+ "HDFS_STORE_PROVIDER_OUT_OF_MEMORY" : {
+ "message" : [
+ "Could not load HDFS state store with id because of an out of memory exception."
+ ]
+ },
+ "INVALID_CHANGE_LOG_READER_VERSION" : {
+ "message" : [
+ "The change log reader version cannot be ."
+ ]
+ },
+ "INVALID_CHANGE_LOG_WRITER_VERSION" : {
+ "message" : [
+ "The change log writer version cannot be ."
+ ]
+ },
+ "ROCKSDB_STORE_PROVIDER_OUT_OF_MEMORY" : {
+ "message" : [
+ "Could not load RocksDB state store with id because of an out of memory exception."
+ ]
+ },
"SNAPSHOT_PARTITION_ID_NOT_FOUND" : {
"message" : [
"Partition id not found for state of operator at ."
@@ -344,6 +376,12 @@
],
"sqlState" : "429BB"
},
+ "CANNOT_REMOVE_RESERVED_PROPERTY" : {
+ "message" : [
+ "Cannot remove reserved property: ."
+ ],
+ "sqlState" : "42000"
+ },
"CANNOT_RENAME_ACROSS_SCHEMA" : {
"message" : [
"Renaming a across schemas is not allowed."
@@ -368,12 +406,6 @@
],
"sqlState" : "58030"
},
- "CANNOT_SAVE_VARIANT" : {
- "message" : [
- "Cannot save variant data type into external storage."
- ],
- "sqlState" : "0A000"
- },
"CANNOT_UPDATE_FIELD" : {
"message" : [
"Cannot update
field type:"
@@ -414,6 +446,12 @@
],
"sqlState" : "42846"
},
+ "CANNOT_USE_KRYO" : {
+ "message" : [
+ "Cannot load Kryo serialization codec. Kryo serialization cannot be used in the Spark Connect client. Use Java serialization, provide a custom Codec, or use Spark Classic instead."
+ ],
+ "sqlState" : "22KD3"
+ },
"CANNOT_WRITE_STATE_STORE" : {
"message" : [
"Error writing state store files for provider ."
@@ -429,13 +467,13 @@
},
"CAST_INVALID_INPUT" : {
"message" : [
- "The value of the type cannot be cast to because it is malformed. Correct the value as per the syntax, or change its target type. Use `try_cast` to tolerate malformed input and return NULL instead. If necessary set to \"false\" to bypass this error."
+ "The value of the type cannot be cast to because it is malformed. Correct the value as per the syntax, or change its target type. Use `try_cast` to tolerate malformed input and return NULL instead."
],
"sqlState" : "22018"
},
"CAST_OVERFLOW" : {
"message" : [
- "The value of the type cannot be cast to due to an overflow. Use `try_cast` to tolerate overflow and return NULL instead. If necessary set to \"false\" to bypass this error."
+ "The value of the type cannot be cast to due to an overflow. Use `try_cast` to tolerate overflow and return NULL instead."
],
"sqlState" : "22003"
},
@@ -459,6 +497,12 @@
],
"sqlState" : "56000"
},
+ "CIRCULAR_CLASS_REFERENCE" : {
+ "message" : [
+ "Cannot have circular references in class, but got the circular reference of class ."
+ ],
+ "sqlState" : "42602"
+ },
"CLASS_NOT_OVERRIDE_EXPECTED_METHOD" : {
"message" : [
" must override either or ."
@@ -471,6 +515,20 @@
],
"sqlState" : "0A000"
},
+ "CLUSTERING_COLUMNS_MISMATCH" : {
+ "message" : [
+ "Specified clustering does not match that of the existing table .",
+ "Specified clustering columns: [].",
+ "Existing clustering columns: []."
+ ],
+ "sqlState" : "42P10"
+ },
+ "CLUSTERING_NOT_SUPPORTED" : {
+ "message" : [
+ "'' does not support clustering."
+ ],
+ "sqlState" : "42000"
+ },
"CODEC_NOT_AVAILABLE" : {
"message" : [
"The codec is not available."
@@ -519,7 +577,7 @@
},
"IMPLICIT" : {
"message" : [
- "Error occurred due to the mismatch between multiple implicit non-default collations. Use COLLATE function to set the collation explicitly."
+ "Error occurred due to the mismatch between implicit collations: []. Use COLLATE function to set the collation explicitly."
]
}
},
@@ -560,6 +618,12 @@
],
"sqlState" : "42711"
},
+ "COLUMN_ARRAY_ELEMENT_TYPE_MISMATCH" : {
+ "message" : [
+ "Some values in field are incompatible with the column array type. Expected type ."
+ ],
+ "sqlState" : "0A000"
+ },
"COLUMN_NOT_DEFINED_IN_TABLE" : {
"message" : [
" column is not defined in table , defined table columns are: ."
@@ -572,6 +636,13 @@
],
"sqlState" : "42703"
},
+ "COLUMN_ORDINAL_OUT_OF_BOUNDS" : {
+ "message" : [
+ "Column ordinal out of bounds. The number of columns in the table is , but the column ordinal is .",
+ "Attributes are the following: ."
+ ],
+ "sqlState" : "22003"
+ },
"COMPARATOR_RETURNS_NULL" : {
"message" : [
"The comparator has returned a NULL for a comparison between and .",
@@ -585,6 +656,11 @@
"Cannot process input data types for the expression: ."
],
"subClass" : {
+ "BAD_INPUTS" : {
+ "message" : [
+ "The input data types to must be valid, but found the input types ."
+ ]
+ },
"MISMATCHED_TYPES" : {
"message" : [
"All input types must be the same except nullable, containsNull, valueContainsNull flags, but found the input types ."
@@ -611,6 +687,27 @@
],
"sqlState" : "40000"
},
+ "CONFLICTING_DIRECTORY_STRUCTURES" : {
+ "message" : [
+ "Conflicting directory structures detected.",
+ "Suspicious paths:",
+ "",
+ "If provided paths are partition directories, please set \"basePath\" in the options of the data source to specify the root directory of the table.",
+ "If there are multiple root directories, please load them separately and then union them."
+ ],
+ "sqlState" : "KD009"
+ },
+ "CONFLICTING_PARTITION_COLUMN_NAMES" : {
+ "message" : [
+ "Conflicting partition column names detected:",
+ "",
+ "For partitioned table directories, data files should only live in leaf directories.",
+ "And directories at the same level should have the same partition column name.",
+ "Please check the following directories for unexpected files or inconsistent partition column names:",
+ ""
+ ],
+ "sqlState" : "KD009"
+ },
"CONNECT" : {
"message" : [
"Generic Spark Connect error."
@@ -853,7 +950,7 @@
},
"NON_STRING_TYPE" : {
"message" : [
- "all arguments must be strings."
+ "all arguments of the function must be strings."
]
},
"NULL_TYPE" : {
@@ -868,7 +965,7 @@
},
"RANGE_FRAME_INVALID_TYPE" : {
"message" : [
- "The data type used in the order specification does not match the data type which is used in the range frame."
+ "The data type used in the order specification does not support the data type which is used in the range frame."
]
},
"RANGE_FRAME_MULTI_ORDER" : {
@@ -954,16 +1051,6 @@
"The input of can't be type data."
]
},
- "UNSUPPORTED_UDF_INPUT_TYPE" : {
- "message" : [
- "UDFs do not support '' as an input data type."
- ]
- },
- "UNSUPPORTED_UDF_OUTPUT_TYPE" : {
- "message" : [
- "UDFs do not support '' as an output data type."
- ]
- },
"VALUE_OUT_OF_RANGE" : {
"message" : [
"The must be between (current value = )."
@@ -994,6 +1081,12 @@
],
"sqlState" : "42710"
},
+ "DATA_SOURCE_EXTERNAL_ERROR" : {
+ "message" : [
+ "Encountered error when saving to external data source."
+ ],
+ "sqlState" : "KD010"
+ },
"DATA_SOURCE_NOT_EXIST" : {
"message" : [
"Data source '' not found. Please make sure the data source is registered."
@@ -1014,6 +1107,12 @@
],
"sqlState" : "42K03"
},
+ "DATETIME_FIELD_OUT_OF_BOUNDS" : {
+ "message" : [
+ ". If necessary set to \"false\" to bypass this error."
+ ],
+ "sqlState" : "22023"
+ },
"DATETIME_OVERFLOW" : {
"message" : [
"Datetime operation overflow: ."
@@ -1039,6 +1138,12 @@
],
"sqlState" : "42608"
},
+ "DEFAULT_UNSUPPORTED" : {
+ "message" : [
+ "Failed to execute command because DEFAULT values are not supported for target data source with table provider: \"\"."
+ ],
+ "sqlState" : "42623"
+ },
"DISTINCT_WINDOW_FUNCTION_UNSUPPORTED" : {
"message" : [
"Distinct window functions are not supported: ."
@@ -1051,6 +1156,12 @@
],
"sqlState" : "22012"
},
+ "DUPLICATED_CTE_NAMES" : {
+ "message" : [
+ "CTE definition can't have duplicate names: ."
+ ],
+ "sqlState" : "42602"
+ },
"DUPLICATED_FIELD_NAME_IN_ARROW_STRUCT" : {
"message" : [
"Duplicated field names in Arrow Struct are not allowed, got ."
@@ -1121,6 +1232,12 @@
],
"sqlState" : "42604"
},
+ "EMPTY_SCHEMA_NOT_SUPPORTED_FOR_DATASOURCE" : {
+ "message" : [
+ "The datasource does not support writing empty or nested empty schemas. Please make sure the data schema has at least one or more column(s)."
+ ],
+ "sqlState" : "0A000"
+ },
"ENCODER_NOT_FOUND" : {
"message" : [
"Not found an encoder of the type to Spark SQL internal representation.",
@@ -1387,6 +1504,12 @@
],
"sqlState" : "2203G"
},
+ "FAILED_TO_LOAD_ROUTINE" : {
+ "message" : [
+ "Failed to load routine ."
+ ],
+ "sqlState" : "38000"
+ },
"FAILED_TO_PARSE_TOO_COMPLEX" : {
"message" : [
"The statement, including potential SQL functions and referenced views, was too complex to parse.",
@@ -1412,6 +1535,12 @@
],
"sqlState" : "42704"
},
+ "FLATMAPGROUPSWITHSTATE_USER_FUNCTION_ERROR" : {
+ "message" : [
+ "An error occurred in the user provided function in flatMapGroupsWithState. Reason: "
+ ],
+ "sqlState" : "39000"
+ },
"FORBIDDEN_OPERATION" : {
"message" : [
"The operation is not allowed on the : ."
@@ -1424,6 +1553,12 @@
],
"sqlState" : "39000"
},
+ "FOREACH_USER_FUNCTION_ERROR" : {
+ "message" : [
+ "An error occurred in the user provided function in foreach sink. Reason: "
+ ],
+ "sqlState" : "39000"
+ },
"FOUND_MULTIPLE_DATA_SOURCES" : {
"message" : [
"Detected multiple data sources with the name ''. Please check the data source isn't simultaneously registered and located in the classpath."
@@ -1520,6 +1655,36 @@
],
"sqlState" : "42601"
},
+ "IDENTITY_COLUMNS_DUPLICATED_SEQUENCE_GENERATOR_OPTION" : {
+ "message" : [
+ "Duplicated IDENTITY column sequence generator option: ."
+ ],
+ "sqlState" : "42601"
+ },
+ "IDENTITY_COLUMNS_ILLEGAL_STEP" : {
+ "message" : [
+ "IDENTITY column step cannot be 0."
+ ],
+ "sqlState" : "42611"
+ },
+ "IDENTITY_COLUMNS_UNSUPPORTED_DATA_TYPE" : {
+ "message" : [
+ "DataType is not supported for IDENTITY columns."
+ ],
+ "sqlState" : "428H2"
+ },
+ "IDENTITY_COLUMN_WITH_DEFAULT_VALUE" : {
+ "message" : [
+ "A column cannot have both a default value and an identity column specification but column has default value: () and identity column specification: ()."
+ ],
+ "sqlState" : "42623"
+ },
+ "ILLEGAL_DAY_OF_WEEK" : {
+ "message" : [
+ "Illegal input for day of week: ."
+ ],
+ "sqlState" : "22009"
+ },
"ILLEGAL_STATE_STORE_VALUE" : {
"message" : [
"Illegal value provided to the State Store"
@@ -1859,8 +2024,20 @@
},
"INTERVAL_ARITHMETIC_OVERFLOW" : {
"message" : [
- "."
+ "Integer overflow while operating with intervals."
],
+ "subClass" : {
+ "WITHOUT_SUGGESTION" : {
+ "message" : [
+ "Try devising appropriate values for the interval parameters."
+ ]
+ },
+ "WITH_SUGGESTION" : {
+ "message" : [
+ "Use to tolerate overflow and return NULL instead."
+ ]
+ }
+ },
"sqlState" : "22015"
},
"INTERVAL_DIVIDED_BY_ZERO" : {
@@ -1897,6 +2074,12 @@
},
"sqlState" : "42903"
},
+ "INVALID_AGNOSTIC_ENCODER" : {
+ "message" : [
+ "Found an invalid agnostic encoder. Expects an instance of AgnosticEncoder but got . For more information consult '/api/java/index.html?org/apache/spark/sql/Encoder.html'."
+ ],
+ "sqlState" : "42001"
+ },
"INVALID_ARRAY_INDEX" : {
"message" : [
"The index is out of bounds. The array has elements. Use the SQL function `get()` to tolerate accessing element at invalid index and return NULL instead. If necessary set to \"false\" to bypass this error."
@@ -1909,12 +2092,24 @@
],
"sqlState" : "22003"
},
+ "INVALID_ATTRIBUTE_NAME_SYNTAX" : {
+ "message" : [
+ "Syntax error in the attribute name: . Check that backticks appear in pairs, a quoted string is a complete name part and use a backtick only inside quoted name parts."
+ ],
+ "sqlState" : "42601"
+ },
"INVALID_BITMAP_POSITION" : {
"message" : [
"The 0-indexed bitmap position is out of bounds. The bitmap has bits ( bytes)."
],
"sqlState" : "22003"
},
+ "INVALID_BOOLEAN_STATEMENT" : {
+ "message" : [
+ "Boolean statement is expected in the condition, but was found."
+ ],
+ "sqlState" : "22546"
+ },
"INVALID_BOUNDARY" : {
"message" : [
"The boundary is invalid: ."
@@ -1981,6 +2176,12 @@
},
"sqlState" : "22022"
},
+ "INVALID_CORRUPT_RECORD_TYPE" : {
+ "message" : [
+ "The column for corrupt records must have the nullable STRING type, but got ."
+ ],
+ "sqlState" : "42804"
+ },
"INVALID_CURSOR" : {
"message" : [
"The cursor is invalid."
@@ -2023,6 +2224,11 @@
"message" : [
"Too many letters in datetime pattern: . Please reduce pattern length."
]
+ },
+ "SECONDS_FRACTION" : {
+ "message" : [
+ "Cannot detect a seconds fraction pattern of variable length. Please make sure the pattern contains 'S', and does not contain illegal characters."
+ ]
}
},
"sqlState" : "22007"
@@ -2126,6 +2332,12 @@
],
"sqlState" : "42001"
},
+ "INVALID_EXTERNAL_TYPE" : {
+ "message" : [
+ "The external type is not valid for the type at the expression ."
+ ],
+ "sqlState" : "42K0N"
+ },
"INVALID_EXTRACT_BASE_FIELD_TYPE" : {
"message" : [
"Can't extract a value from . Need a complex type [STRUCT, ARRAY, MAP] but got ."
@@ -2215,7 +2427,8 @@
},
"INVALID_FRACTION_OF_SECOND" : {
"message" : [
- "The fraction of sec must be zero. Valid range is [0, 60]. If necessary set to \"false\" to bypass this error."
+ "Valid range for seconds is [0, 60] (inclusive), but the provided value is . To avoid this error, use `try_make_timestamp`, which returns NULL on error.",
+ "If you do not want to use the session default timestamp version of this function, use `try_make_timestamp_ntz` or `try_make_timestamp_ltz`."
],
"sqlState" : "22023"
},
@@ -2315,6 +2528,11 @@
"Uncaught arithmetic exception while parsing ''."
]
},
+ "DAY_TIME_PARSING" : {
+ "message" : [
+ "Error parsing interval day-time string: ."
+ ]
+ },
"INPUT_IS_EMPTY" : {
"message" : [
"Interval string cannot be empty."
@@ -2325,6 +2543,11 @@
"Interval string cannot be null."
]
},
+ "INTERVAL_PARSING" : {
+ "message" : [
+ "Error parsing interval string."
+ ]
+ },
"INVALID_FRACTION" : {
"message" : [
" cannot have fractional part."
@@ -2360,19 +2583,50 @@
"Expect a unit name after but hit EOL."
]
},
+ "SECOND_NANO_FORMAT" : {
+ "message" : [
+ "Interval string does not match second-nano format of ss.nnnnnnnnn."
+ ]
+ },
+ "TIMEZONE_INTERVAL_OUT_OF_RANGE" : {
+ "message" : [
+ "The interval value must be in the range of [-18, +18] hours with second precision."
+ ]
+ },
"UNKNOWN_PARSING_ERROR" : {
"message" : [
"Unknown error when parsing ."
]
},
+ "UNMATCHED_FORMAT_STRING" : {
+ "message" : [
+ "Interval string does not match format of when cast to : ."
+ ]
+ },
+ "UNMATCHED_FORMAT_STRING_WITH_NOTICE" : {
+ "message" : [
+ "Interval string does not match format of when cast to : . Set \"spark.sql.legacy.fromDayTimeString.enabled\" to \"true\" to restore the behavior before Spark 3.0."
+ ]
+ },
"UNRECOGNIZED_NUMBER" : {
"message" : [
"Unrecognized number ."
]
+ },
+ "UNSUPPORTED_FROM_TO_EXPRESSION" : {
+ "message" : [
+ "Cannot support (interval '' to ) expression."
+ ]
}
},
"sqlState" : "22006"
},
+ "INVALID_INTERVAL_WITH_MICROSECONDS_ADDITION" : {
+ "message" : [
+ "Cannot add an interval to a date because its microseconds part is not 0. If necessary set to \"false\" to bypass this error."
+ ],
+ "sqlState" : "22006"
+ },
"INVALID_INVERSE_DISTRIBUTION_FUNCTION" : {
"message" : [
"Invalid inverse distribution function ."
@@ -2396,6 +2650,13 @@
},
"sqlState" : "42K0K"
},
+ "INVALID_JAVA_IDENTIFIER_AS_FIELD_NAME" : {
+ "message" : [
+ " is not a valid Java identifier and cannot be used as a field name",
+ "."
+ ],
+ "sqlState" : "46121"
+ },
"INVALID_JOIN_TYPE_FOR_JOINWITH" : {
"message" : [
"Invalid join type in joinWith: ."
@@ -2414,6 +2675,12 @@
],
"sqlState" : "2203G"
},
+ "INVALID_JSON_RECORD_TYPE" : {
+ "message" : [
+ "Detected an invalid type of a JSON record while inferring a common schema in the mode . Expected a STRUCT type, but found ."
+ ],
+ "sqlState" : "22023"
+ },
"INVALID_JSON_ROOT_FIELD" : {
"message" : [
"Cannot convert JSON root field to target Spark type."
@@ -2432,6 +2699,24 @@
],
"sqlState" : "F0000"
},
+ "INVALID_LABEL_USAGE" : {
+ "message" : [
+ "The usage of the label is invalid."
+ ],
+ "subClass" : {
+ "DOES_NOT_EXIST" : {
+ "message" : [
+ "Label was used in the statement, but the label does not belong to any surrounding block."
+ ]
+ },
+ "ITERATE_IN_COMPOUND" : {
+ "message" : [
+ "ITERATE statement cannot be used with a label that belongs to a compound (BEGIN...END) body."
+ ]
+ }
+ },
+ "sqlState" : "42K0L"
+ },
"INVALID_LAMBDA_FUNCTION_CALL" : {
"message" : [
"Invalid lambda function call."
@@ -2598,6 +2883,11 @@
"expects an integer value in [0, ), but got ."
]
},
+ "BOOLEAN" : {
+ "message" : [
+ "expects a boolean literal, but got ."
+ ]
+ },
"CHARSET" : {
"message" : [
"expects one of the , but got ."
@@ -2608,11 +2898,31 @@
"expects one of the units without quotes YEAR, QUARTER, MONTH, WEEK, DAY, DAYOFYEAR, HOUR, MINUTE, SECOND, MILLISECOND, MICROSECOND, but got the string literal ."
]
},
+ "DOUBLE" : {
+ "message" : [
+ "expects a double literal, but got ."
+ ]
+ },
+ "DTYPE" : {
+ "message" : [
+ "Unsupported dtype: . Valid values: float64, float32."
+ ]
+ },
+ "INTEGER" : {
+ "message" : [
+ "expects an integer literal, but got ."
+ ]
+ },
"LENGTH" : {
"message" : [
"Expects `length` greater than or equal to 0, but got ."
]
},
+ "LONG" : {
+ "message" : [
+ "expects a long literal, but got ."
+ ]
+ },
"NULL" : {
"message" : [
"expects a non-NULL value."
@@ -2633,6 +2943,11 @@
"Expects a positive or a negative value for `start`, but got 0."
]
},
+ "STRING" : {
+ "message" : [
+ "expects a string literal, but got ."
+ ]
+ },
"ZERO_INDEX" : {
"message" : [
"expects %1$, %2$ and so on, but got %0$."
@@ -2665,6 +2980,12 @@
},
"sqlState" : "42601"
},
+ "INVALID_PARTITION_VALUE" : {
+ "message" : [
+ "Failed to cast value to data type for partition column . Ensure the value matches the expected data type for this partition column."
+ ],
+ "sqlState" : "42846"
+ },
"INVALID_PROPERTY_KEY" : {
"message" : [
" is an invalid property key, please use quotes, e.g. SET =."
@@ -2683,6 +3004,18 @@
],
"sqlState" : "42613"
},
+ "INVALID_REGEXP_REPLACE" : {
+ "message" : [
+ "Could not perform regexp_replace for source = \"