From bfe89f029a4f92c0bc04bf88c06b7eb0146a9bf7 Mon Sep 17 00:00:00 2001 From: Liangliang Gu Date: Wed, 24 Jul 2019 16:03:51 +0800 Subject: [PATCH 01/62] 2.2.0 -> 2.3.0 (#947) --- assembly/pom.xml | 2 +- core/pom.xml | 2 +- core/scripts/version.sh | 2 +- pom.xml | 2 +- spark-wrapper/spark-2.3/pom.xml | 2 +- spark-wrapper/spark-2.4/pom.xml | 2 +- tikv-client/pom.xml | 2 +- 7 files changed, 7 insertions(+), 7 deletions(-) diff --git a/assembly/pom.xml b/assembly/pom.xml index 5725ab31e6..66ab98e29f 100644 --- a/assembly/pom.xml +++ b/assembly/pom.xml @@ -5,7 +5,7 @@ com.pingcap.tispark tispark-parent - 2.2.0-SNAPSHOT + 2.3.0-SNAPSHOT ../pom.xml diff --git a/core/pom.xml b/core/pom.xml index e8e02155a1..3e6928308e 100644 --- a/core/pom.xml +++ b/core/pom.xml @@ -5,7 +5,7 @@ com.pingcap.tispark tispark-parent - 2.2.0-SNAPSHOT + 2.3.0-SNAPSHOT ../pom.xml diff --git a/core/scripts/version.sh b/core/scripts/version.sh index aaafc9a917..9c2ff18d51 100755 --- a/core/scripts/version.sh +++ b/core/scripts/version.sh @@ -15,7 +15,7 @@ # cd .. -TiSparkReleaseVersion=2.2.0-SNAPSHOT +TiSparkReleaseVersion=2.3.0-SNAPSHOT TiSparkBuildTS=`date -u '+%Y-%m-%d %I:%M:%S'` TiSparkGitHash=`git rev-parse HEAD` TiSparkGitBranch=`git rev-parse --abbrev-ref HEAD` diff --git a/pom.xml b/pom.xml index 6674df45d8..1ce1a765af 100644 --- a/pom.xml +++ b/pom.xml @@ -10,7 +10,7 @@ com.pingcap.tispark tispark-parent - 2.2.0-SNAPSHOT + 2.3.0-SNAPSHOT pom TiSpark Project Parent POM http://github.copm/pingcap/tispark diff --git a/spark-wrapper/spark-2.3/pom.xml b/spark-wrapper/spark-2.3/pom.xml index d3a4d7127f..be9401c59e 100644 --- a/spark-wrapper/spark-2.3/pom.xml +++ b/spark-wrapper/spark-2.3/pom.xml @@ -5,7 +5,7 @@ com.pingcap.tispark tispark-parent - 2.2.0-SNAPSHOT + 2.3.0-SNAPSHOT ../../pom.xml diff --git a/spark-wrapper/spark-2.4/pom.xml b/spark-wrapper/spark-2.4/pom.xml index 57cc674dcd..e3dc0830b7 100644 --- a/spark-wrapper/spark-2.4/pom.xml +++ b/spark-wrapper/spark-2.4/pom.xml @@ -5,7 +5,7 @@ com.pingcap.tispark tispark-parent - 2.2.0-SNAPSHOT + 2.3.0-SNAPSHOT ../../pom.xml diff --git a/tikv-client/pom.xml b/tikv-client/pom.xml index 10fbed631e..549c789ca6 100644 --- a/tikv-client/pom.xml +++ b/tikv-client/pom.xml @@ -5,7 +5,7 @@ com.pingcap.tispark tispark-parent - 2.2.0-SNAPSHOT + 2.3.0-SNAPSHOT ../pom.xml From 5de9ab8d1e769074cf0a78ba70250a281805ae2c Mon Sep 17 00:00:00 2001 From: birdstorm Date: Wed, 24 Jul 2019 19:41:48 +0800 Subject: [PATCH 02/62] Add tests for primary key (#948) --- .../spark/sql/BaseTestGenerationSpec.scala | 3 + .../apache/spark/sql/BaseTiSparkTest.scala | 16 ------ .../spark/sql/test/generator/Schema.scala | 22 ++++++-- .../test/generator/TestDataGenerator.scala | 14 +++-- .../sql/test/generator/ValueGenerator.scala | 18 ++++++ .../spark/sql/types/BaseDataTypeTest.scala | 56 ++++--------------- .../spark/sql/types/DataTypeExampleTest.scala | 51 +++++++++++++++++ .../spark/sql/types/DataTypeNormalSuite.scala | 4 +- ...taType.scala => DataTypePKGenerator.scala} | 14 ++--- .../GenerateUnitDataTypeTestAction.scala | 2 +- .../spark/sql/types/pk/DataTypePKSuite.scala | 8 +-- 11 files changed, 123 insertions(+), 85 deletions(-) create mode 100644 core/src/test/scala/org/apache/spark/sql/types/DataTypeExampleTest.scala rename core/src/test/scala/org/apache/spark/sql/types/{GeneratePKDataType.scala => DataTypePKGenerator.scala} (73%) diff --git a/core/src/test/scala/org/apache/spark/sql/BaseTestGenerationSpec.scala b/core/src/test/scala/org/apache/spark/sql/BaseTestGenerationSpec.scala index 
4ee5f3ef92..2af2491d83 100644 --- a/core/src/test/scala/org/apache/spark/sql/BaseTestGenerationSpec.scala +++ b/core/src/test/scala/org/apache/spark/sql/BaseTestGenerationSpec.scala @@ -18,6 +18,9 @@ package org.apache.spark.sql trait BaseTestGenerationSpec { + + protected val rowCount: Int = 50 + protected def getTableName(dataType: String): String = s"test_$dataType" protected def getTableName(dataType: String, desc: String): String = s"test_${desc}_$dataType" diff --git a/core/src/test/scala/org/apache/spark/sql/BaseTiSparkTest.scala b/core/src/test/scala/org/apache/spark/sql/BaseTiSparkTest.scala index b1858bc459..23ba77fe7e 100644 --- a/core/src/test/scala/org/apache/spark/sql/BaseTiSparkTest.scala +++ b/core/src/test/scala/org/apache/spark/sql/BaseTiSparkTest.scala @@ -431,22 +431,6 @@ class BaseTiSparkTest extends QueryTest with SharedSQLContext with BaseTestGener df.collect.foreach(println) } - def simpleSelect(dbName: String, dataType: String): Unit = { - spark.sql("show databases").show(false) - setCurrentDatabase(dbName) - val tblName = getTableName(dataType) - val query = s"select ${getColumnName(dataType)} from $tblName" - runTest(query) - } - - def simpleSelect(dbName: String, dataType: String, desc: String): Unit = { - spark.sql("show databases").show(false) - setCurrentDatabase(dbName) - val tblName = getTableName(dataType, desc) - val query = s"select ${getColumnName(dataType)} from $tblName" - runTest(query) - } - protected def time[A](f: => A): A = { val s = System.currentTimeMillis val ret = f diff --git a/core/src/test/scala/org/apache/spark/sql/test/generator/Schema.scala b/core/src/test/scala/org/apache/spark/sql/test/generator/Schema.scala index 587c4cb47c..33219056e2 100644 --- a/core/src/test/scala/org/apache/spark/sql/test/generator/Schema.scala +++ b/core/src/test/scala/org/apache/spark/sql/test/generator/Schema.scala @@ -37,10 +37,6 @@ case class Schema(database: String, assert(columnDesc.size == columnNames.size, "columnDesc size not equal to column name size") assert(columnNames.forall(columnDesc.contains), "column desc not present for some columns") - val columnInfo: List[ColumnInfo] = columnNames.map { col => - val x = columnDesc(col) - ColumnInfo(col, x._1, x._2, x._3) - } val indexInfo: List[IndexInfo] = indexColumns.map { idx => IndexInfo(idx._1, idx._2._1.map { x => IndexColumnInfo(x._1, x._2) @@ -49,6 +45,24 @@ case class Schema(database: String, assert(indexInfo.count(_.isPrimary) <= 1, "more than one primary key exist in schema") + private val pkIndexInfo = indexInfo.filter(_.isPrimary) + private val pkColumnName = if (pkIndexInfo.isEmpty) { + "" + } else if (pkIndexInfo.head.indexColumns.size == 1) { + pkIndexInfo.head.indexColumns.head.column + } else { + throw new IllegalArgumentException("Multi-column Primary key/Unique index not supported yet") + } + + val columnInfo: List[ColumnInfo] = columnNames.map { col => + val x = columnDesc(col) + if (col == pkColumnName) { + ColumnInfo(col, x._1, x._2, x._3 + " primary key") + } else { + ColumnInfo(col, x._1, x._2, x._3) + } + } + // column info to string private val columns: List[String] = columnInfo.map(_.toString) private val keys: List[String] = indexInfo.map(_.toString) diff --git a/core/src/test/scala/org/apache/spark/sql/test/generator/TestDataGenerator.scala b/core/src/test/scala/org/apache/spark/sql/test/generator/TestDataGenerator.scala index 8791389075..6c543444f0 100644 --- a/core/src/test/scala/org/apache/spark/sql/test/generator/TestDataGenerator.scala +++ 
b/core/src/test/scala/org/apache/spark/sql/test/generator/TestDataGenerator.scala @@ -268,11 +268,17 @@ object TestDataGenerator { offset: Int, r: Random, valueGenerator: ValueGenerator): Unit = { - if (valueGenerator.randomNull(r)) { - row.setNull(offset) - } else { - val value = valueGenerator.randomValue(r) + if (valueGenerator.isPrimaryKey) { + assert(!valueGenerator.nullable, "Generate fails: Value cannot be null for primary key") + val value = valueGenerator.randomUniqueValue(r) row.set(offset, valueGenerator.tiDataType, value) + } else { + if (valueGenerator.randomNull(r)) { + row.setNull(offset) + } else { + val value = valueGenerator.randomValue(r) + row.set(offset, valueGenerator.tiDataType, value) + } } } diff --git a/core/src/test/scala/org/apache/spark/sql/test/generator/ValueGenerator.scala b/core/src/test/scala/org/apache/spark/sql/test/generator/ValueGenerator.scala index 311d08b9bb..f7a290f1fc 100644 --- a/core/src/test/scala/org/apache/spark/sql/test/generator/ValueGenerator.scala +++ b/core/src/test/scala/org/apache/spark/sql/test/generator/ValueGenerator.scala @@ -18,6 +18,7 @@ package org.apache.spark.sql.test.generator import org.apache.spark.sql.test.generator.DataType._ import org.apache.spark.sql.test.generator.TestDataGenerator.{getLength, isNumeric} +import scala.collection.mutable import scala.util.Random case class ValueGenerator(dataType: ReflectedDataType, @@ -56,6 +57,23 @@ case class ValueGenerator(dataType: ReflectedDataType, !tiDataType.isNotNull && r.nextInt(20) == 0 } + val set: mutable.Set[Any] = mutable.HashSet.empty[Any] + + def randomUniqueValue(r: Random): Any = { + while (true) { + val value = randomValue(r) + val hashedValue = value match { + case null => "null" + case b: Array[Byte] => b.mkString("[", ",", "]") + case x => x.toString + } + if (!set.apply(hashedValue)) { + set += hashedValue + return value + } + } + } + def randomValue(r: Random): Any = { if (tiDataType.isUnsigned) { if (!isNumeric(dataType)) { diff --git a/core/src/test/scala/org/apache/spark/sql/types/BaseDataTypeTest.scala b/core/src/test/scala/org/apache/spark/sql/types/BaseDataTypeTest.scala index 99dee30b8f..c0aee58f25 100644 --- a/core/src/test/scala/org/apache/spark/sql/types/BaseDataTypeTest.scala +++ b/core/src/test/scala/org/apache/spark/sql/types/BaseDataTypeTest.scala @@ -1,53 +1,19 @@ -/* - * - * Copyright 2017 PingCAP, Inc. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * See the License for the specific language governing permissions and - * limitations under the License. 
- * - */ - package org.apache.spark.sql.types import org.apache.spark.sql.BaseTiSparkTest -import org.apache.spark.sql.test.SharedSQLContext -import org.apache.spark.sql.test.generator.DataType.{ReflectedDataType, TINYINT} - -class BaseDataTypeTest extends BaseTiSparkTest with RunUnitDataTypeTestAction { - - val dataTypes: List[ReflectedDataType] = List(TINYINT) - val unsignedDataTypes: List[ReflectedDataType] = List(TINYINT) - val dataTypeTestDir: String = "dataType-test" - val database: String = "data_type_test_example" - val testDesc: String = "Base test for data types" - def startTest(typeName: String): Unit = { - test(s"${preDescription}Test $typeName - $testDesc") { - simpleSelect(database, typeName) - } +trait BaseDataTypeTest extends BaseTiSparkTest { + def simpleSelect(dbName: String, dataType: String): Unit = { + setCurrentDatabase(dbName) + val tblName = getTableName(dataType) + val query = s"select ${getColumnName(dataType)} from $tblName" + runTest(query) } - def startUnsignedTest(typeName: String): Unit = { - test(s"${preDescription}Test $extraDesc $typeName - $testDesc") { - simpleSelect(database, typeName, extraDesc) - } + def simpleSelect(dbName: String, dataType: String, desc: String): Unit = { + setCurrentDatabase(dbName) + val tblName = getTableName(dataType, desc) + val query = s"select ${getColumnName(dataType)} from $tblName" + runTest(query) } - - def check(): Unit = { - SharedSQLContext.init() - if (generateData) { - BaseGenerateDataType(dataTypes, unsignedDataTypes, dataTypeTestDir, database, testDesc).test() - } - } - - check() - test() } diff --git a/core/src/test/scala/org/apache/spark/sql/types/DataTypeExampleTest.scala b/core/src/test/scala/org/apache/spark/sql/types/DataTypeExampleTest.scala new file mode 100644 index 0000000000..155b89eca5 --- /dev/null +++ b/core/src/test/scala/org/apache/spark/sql/types/DataTypeExampleTest.scala @@ -0,0 +1,51 @@ +/* + * + * Copyright 2017 PingCAP, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * See the License for the specific language governing permissions and + * limitations under the License. 
+ * + */ + +package org.apache.spark.sql.types + +import org.apache.spark.sql.test.SharedSQLContext +import org.apache.spark.sql.test.generator.DataType.{ReflectedDataType, TINYINT} + +class DataTypeExampleTest extends BaseDataTypeTest with RunUnitDataTypeTestAction { + val dataTypes: List[ReflectedDataType] = List(TINYINT) + val unsignedDataTypes: List[ReflectedDataType] = List(TINYINT) + val dataTypeTestDir: String = "dataType-test" + val database: String = "data_type_test_example" + val testDesc: String = "Base test for data types" + + def startTest(typeName: String): Unit = { + test(s"${preDescription}Test $typeName - $testDesc") { + simpleSelect(database, typeName) + } + } + + def startUnsignedTest(typeName: String): Unit = { + test(s"${preDescription}Test $extraDesc $typeName - $testDesc") { + simpleSelect(database, typeName, extraDesc) + } + } + + def check(): Unit = { + SharedSQLContext.init() + if (generateData) { + BaseGenerateDataType(dataTypes, unsignedDataTypes, dataTypeTestDir, database, testDesc).test() + } + } + + check() + test() +} diff --git a/core/src/test/scala/org/apache/spark/sql/types/DataTypeNormalSuite.scala b/core/src/test/scala/org/apache/spark/sql/types/DataTypeNormalSuite.scala index d2bcefd664..799bcf3e25 100644 --- a/core/src/test/scala/org/apache/spark/sql/types/DataTypeNormalSuite.scala +++ b/core/src/test/scala/org/apache/spark/sql/types/DataTypeNormalSuite.scala @@ -15,13 +15,11 @@ package org.apache.spark.sql.types -import org.apache.spark.sql.BaseTiSparkTest import org.apache.spark.sql.test.SharedSQLContext import org.apache.spark.sql.test.generator.DataType._ import org.apache.spark.sql.test.generator.TestDataGenerator._ -class DataTypeNormalSuite extends BaseTiSparkTest with RunUnitDataTypeTestAction { - +class DataTypeNormalSuite extends BaseDataTypeTest with RunUnitDataTypeTestAction { override val dataTypes: List[ReflectedDataType] = integers ::: decimals ::: doubles ::: stringType override val unsignedDataTypes: List[ReflectedDataType] = integers ::: decimals ::: doubles override val dataTypeTestDir = "dataType-test" diff --git a/core/src/test/scala/org/apache/spark/sql/types/GeneratePKDataType.scala b/core/src/test/scala/org/apache/spark/sql/types/DataTypePKGenerator.scala similarity index 73% rename from core/src/test/scala/org/apache/spark/sql/types/GeneratePKDataType.scala rename to core/src/test/scala/org/apache/spark/sql/types/DataTypePKGenerator.scala index 2eff2b4d17..9630ef510a 100644 --- a/core/src/test/scala/org/apache/spark/sql/types/GeneratePKDataType.scala +++ b/core/src/test/scala/org/apache/spark/sql/types/DataTypePKGenerator.scala @@ -20,11 +20,11 @@ package org.apache.spark.sql.types import org.apache.spark.sql.BaseTiSparkTest import org.apache.spark.sql.test.generator.DataType.ReflectedDataType -case class GeneratePKDataType(dataTypes: List[ReflectedDataType], - unsignedDataTypes: List[ReflectedDataType], - dataTypeTestDir: String, - database: String, - testDesc: String) +case class DataTypePKGenerator(dataTypes: List[ReflectedDataType], + unsignedDataTypes: List[ReflectedDataType], + dataTypeTestDir: String, + database: String, + testDesc: String) extends BaseTiSparkTest with GeneratePKDataTypeTestAction { def loadTestData(typeName: String): Unit = { @@ -38,8 +38,8 @@ case class GeneratePKDataType(dataTypes: List[ReflectedDataType], } } -object GeneratePKDataType { +object DataTypePKGenerator { def apply(u: UnitDataTypeTestAction): GeneratePKDataTypeTestAction = { - GeneratePKDataType(u.dataTypes, u.unsignedDataTypes, 
u.dataTypeTestDir, u.database, u.testDesc) + DataTypePKGenerator(u.dataTypes, u.unsignedDataTypes, u.dataTypeTestDir, u.database, u.testDesc) } } diff --git a/core/src/test/scala/org/apache/spark/sql/types/GenerateUnitDataTypeTestAction.scala b/core/src/test/scala/org/apache/spark/sql/types/GenerateUnitDataTypeTestAction.scala index f215bec814..67f235e8f4 100644 --- a/core/src/test/scala/org/apache/spark/sql/types/GenerateUnitDataTypeTestAction.scala +++ b/core/src/test/scala/org/apache/spark/sql/types/GenerateUnitDataTypeTestAction.scala @@ -41,7 +41,7 @@ trait GenerateUnitDataTypeTestAction extends UnitDataTypeTestAction with BaseTes ) } - def genData(schema: Schema): Data = randomDataGenerator(schema, 20, dataTypeTestDir, r) + def genData(schema: Schema): Data = randomDataGenerator(schema, rowCount, dataTypeTestDir, r) def genLen(dataType: ReflectedDataType): String = { val baseType = getBaseType(dataType) diff --git a/core/src/test/scala/org/apache/spark/sql/types/pk/DataTypePKSuite.scala b/core/src/test/scala/org/apache/spark/sql/types/pk/DataTypePKSuite.scala index 5310d74f2b..cce1ddd265 100644 --- a/core/src/test/scala/org/apache/spark/sql/types/pk/DataTypePKSuite.scala +++ b/core/src/test/scala/org/apache/spark/sql/types/pk/DataTypePKSuite.scala @@ -15,14 +15,12 @@ package org.apache.spark.sql.types.pk -import org.apache.spark.sql.BaseTiSparkTest import org.apache.spark.sql.test.SharedSQLContext import org.apache.spark.sql.test.generator.DataType._ import org.apache.spark.sql.test.generator.TestDataGenerator._ -import org.apache.spark.sql.types.{GeneratePKDataType, RunUnitDataTypeTestAction} - -class DataTypePKSuite extends BaseTiSparkTest with RunUnitDataTypeTestAction { +import org.apache.spark.sql.types.{BaseDataTypeTest, DataTypePKGenerator, RunUnitDataTypeTestAction} +class DataTypePKSuite extends BaseDataTypeTest with RunUnitDataTypeTestAction { override val dataTypes: List[ReflectedDataType] = integers ::: decimals ::: doubles ::: stringType override val unsignedDataTypes: List[ReflectedDataType] = integers ::: decimals ::: doubles override val dataTypeTestDir = "dataType-test-pk" @@ -44,7 +42,7 @@ class DataTypePKSuite extends BaseTiSparkTest with RunUnitDataTypeTestAction { def check(): Unit = { SharedSQLContext.init() if (generateData) { - GeneratePKDataType.apply(this).test() + DataTypePKGenerator.apply(this).test() } } From d42330b0a451b58fd836c5fa1f7c987ca3165f82 Mon Sep 17 00:00:00 2001 From: Liangliang Gu Date: Mon, 29 Jul 2019 14:50:36 +0800 Subject: [PATCH 03/62] add changelog (#955) --- CHANGELOG.md | 134 +++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 134 insertions(+) create mode 100644 CHANGELOG.md diff --git a/CHANGELOG.md b/CHANGELOG.md new file mode 100644 index 0000000000..98607cc165 --- /dev/null +++ b/CHANGELOG.md @@ -0,0 +1,134 @@ +# TiSpark Changelog +All notable changes to this project will be documented in this file. 
+ +## [TiSpark 2.2.0] +### New Features +* Natively support writing data to TiKV using Spark Data Source API +* Support select from partition table [#916](https://github.com/pingcap/tispark/pull/916) +* Release one tispark jar (both support Spark-2.3.x and Spark-2.4.x) instead of two [#933](https://github.com/pingcap/tispark/pull/933) +* Add spark version to tispark udf ti_version [#943](https://github.com/pingcap/tispark/pull/943) + +## [TiSpark 2.1.2] 2019-07-29 +### Fixes +* Fix improper response with region error [#922](https://github.com/pingcap/tispark/pull/922) +* Fix view parseing problem [#953](https://github.com/pingcap/tispark/pull/953) + +## [TiSpark 1.2.1] +### Fixes +* Fix count error, if advanceNextResponse is empty, we should read next region (#899) +* Use fixed version of proto (#898) + +## [TiSpark 2.1.1] +### Fixes +* Add TiDB/TiKV/PD version and Spark version supported for each latest major release (#804) (#887) +* Fix incorrect timestamp of tidbMapDatabase (#862) (#885) +* Fix column size estimation (#858) (#884) +* Fix count error, if advanceNextResponse is empty, we should read next region (#878) (#882) +* Use fixed version of proto instead of master branch (#843) (#850) + +## [TiSpark 2.1] +### Features +* Support range partition pruning (Beta) (#599) +* Support show columns command (#614) + +### Fixes +* Fix build key ranges with xor expression (#576) +* Fix cannot initialize pd if using ipv6 address (#587) +* Fix default value bug (#596) +* Fix possible IndexOutOfBoundException in KeyUtils (#597) +* Fix outputOffset is incorrect when building DAGRequest (#615) +* Fix incorrect implementation of Key.next() (#648) +* Fix partition parser can't parser numerical value 0 (#651) +* Fix prefix length may be larger than the value used. 
(#668) +* Fix retry logic when scan meet lock (#666) +* Fix inconsistent timestamp (#676) +* Fix tempView may be unresolved when applying timestamp to plan (#690) +* Fix concurrent DAGRequest issue (#714) +* Fix downgrade scan logic (#725) +* Fix integer type default value should be parsed to long (#741) +* Fix index scan on partition table (#735) +* Fix KeyNotInRegion may occur when retrieving rows by handle (#755) +* Fix encode value long max (#761) +* Fix MatchErrorException may occur when Unsigned BigInt contains in group by columns (#780) +* Fix IndexOutOfBoundException when trying to get pd member (#788) + +## [TiSpark 2.0] +### Features +* Work with Spark 2.3 +* Support use `$database` statement +* Support show databases statement +* Support show tables statement +* No need to use `TiContext.mapTiDBDatabase`, use `$database.$table` to identify a table instead +* Support data type SET and ENUM +* Support data type YEAR +* Support data type TIME +* Support isolation level settings +* Support describe table command +* Support cache tables and uncache tables +* Support read from a TiDB partition table +* Support use TiDB as metastore + +### Fixes +* Fix JSON parsing (#491) +* Fix count on empty table (#498) +* Fix ScanIterator unable to read from adjacent empty regions (#519) +* Fix possible NullPointerException when setting show_row_id true (#522) + +### Improved +* Make ti version usable without selecting database (#545) + +## [TiSpark 1.2] +### Fixes +* Fixes compatibility with PDServer #480 + +## [TiSpark 1.1] +### Fixes multiple bugs: +* Fix daylight saving time (DST) (#347) +* Fix count(1) result is always 0 if subquery contains limit (#346) +* Fix incorrect totalRowCount calculation (#353) +* Fix request fail with Key not in region after retrying NotLeaderError (#354) +* Fix ScanIterator logic where index may be out of bound (#357) +* Fix tispark-sql dbName (#379) +* Fix StoreNotMatch (#396) +* Fix utf8 prefix index (#400) +* Fix decimal decoding (#401) +* Refactor not leader logic (#412) +* Fix global temp view not visible in thriftserver (#437) + +### Adds: +* Allow TiSpark retrieve row id (#367) +* Decode json to string (#417) + +### Improvements: +* Improve PD connection issue's error log (#388) +* Add DB prefix option for TiDB tables (#416) + +## [TiSpark 1.0.1] +* Fix unsigned index +* Compatible with TiDB before and since 48a42f + +## [TiSpark 1.0 GA] +### New Features +TiSpark provides distributed computing of TiDB data using Apache Spark. 
+ +* Provide a gRPC communication framework to read data from TiKV +* Provide encoding and decoding of TiKV component data and communication protocol +* Provide calculation pushdown, which includes: + - Aggregate pushdown + - Predicate pushdown + - TopN pushdown + - Limit pushdown +* Provide index related support + - Transform predicate into Region key range or secondary index + - Optimize Index Only queries + - Adaptive downgrade index scan to table scan per region +* Provide cost-based optimization + - Support statistics + - Select index + - Estimate broadcast table cost +* Provide support for multiple Spark interfaces + - Support Spark Shell + - Support ThriftServer/JDBC + - Support Spark-SQL interaction + - Support PySpark Shell + - Support SparkR From b8354a8384117a2ccdcbe1c124cf89e77e5cc215 Mon Sep 17 00:00:00 2001 From: birdstorm Date: Mon, 29 Jul 2019 17:49:51 +0800 Subject: [PATCH 04/62] add multi-column tests (#954) --- .../spark/sql/BaseTestGenerationSpec.scala | 15 +- .../apache/spark/sql/BaseTiSparkTest.scala | 10 +- .../apache/spark/sql/MysqlDataTypeSuite.scala | 2 + .../apache/spark/sql/TiSparkTestSpec.scala | 29 ++++ .../spark/sql/test/SharedSQLContext.scala | 1 - .../spark/sql/test/generator/Data.scala | 2 + .../spark/sql/test/generator/DataType.scala | 2 + .../spark/sql/test/generator/Index.scala | 2 + .../sql/test/generator/IndexColumn.scala | 19 +-- .../spark/sql/test/generator/Schema.scala | 2 + .../test/generator/TestDataGenerator.scala | 83 +++++++++-- .../sql/test/generator/ValueGenerator.scala | 135 ++++++++++++++++-- ...Type.scala => BaseDataTypeGenerator.scala} | 14 +- .../spark/sql/types/BaseDataTypeTest.scala | 35 ++++- .../BaseMultiColumnDataTypeGenerator.scala | 35 +++++ .../spark/sql/types/DataTypeExampleTest.scala | 13 +- .../spark/sql/types/DataTypeNormalSuite.scala | 13 +- ...enerateMultiColumnDataTypeTestAction.scala | 79 ++++++++++ .../GenerateUnitDataTypeTestAction.scala | 20 ++- .../sql/types/MultiColumnDataTypeSuite.scala | 52 +++++++ .../sql/types/MultiColumnDataTypeTest.scala | 76 ++++++++++ ...cala => MultiColumnDataTypeTestSpec.scala} | 17 +-- .../RunMultiColumnDataTypeTestAction.scala | 29 ++++ .../sql/types/RunUnitDataTypeTestAction.scala | 4 +- .../sql/types/SpecialTiDBTypeTestSuite.scala | 2 +- .../sql/types/UnitDataTypeTestSpec.scala | 29 ++++ .../types/{ => pk}/DataTypePKGenerator.scala | 9 +- .../spark/sql/types/pk/DataTypePKSuite.scala | 15 +- .../GeneratePKDataTypeTestAction.scala | 7 +- 29 files changed, 651 insertions(+), 100 deletions(-) create mode 100644 core/src/test/scala/org/apache/spark/sql/TiSparkTestSpec.scala rename core/src/test/scala/org/apache/spark/sql/types/{BaseGenerateDataType.scala => BaseDataTypeGenerator.scala} (71%) create mode 100644 core/src/test/scala/org/apache/spark/sql/types/BaseMultiColumnDataTypeGenerator.scala create mode 100644 core/src/test/scala/org/apache/spark/sql/types/GenerateMultiColumnDataTypeTestAction.scala create mode 100644 core/src/test/scala/org/apache/spark/sql/types/MultiColumnDataTypeSuite.scala create mode 100644 core/src/test/scala/org/apache/spark/sql/types/MultiColumnDataTypeTest.scala rename core/src/test/scala/org/apache/spark/sql/types/{UnitDataTypeTestAction.scala => MultiColumnDataTypeTestSpec.scala} (67%) create mode 100644 core/src/test/scala/org/apache/spark/sql/types/RunMultiColumnDataTypeTestAction.scala create mode 100644 core/src/test/scala/org/apache/spark/sql/types/UnitDataTypeTestSpec.scala rename core/src/test/scala/org/apache/spark/sql/types/{ => 
pk}/DataTypePKGenerator.scala (83%) rename core/src/test/scala/org/apache/spark/sql/types/{ => pk}/GeneratePKDataTypeTestAction.scala (88%) diff --git a/core/src/test/scala/org/apache/spark/sql/BaseTestGenerationSpec.scala b/core/src/test/scala/org/apache/spark/sql/BaseTestGenerationSpec.scala index 2af2491d83..3868f129d1 100644 --- a/core/src/test/scala/org/apache/spark/sql/BaseTestGenerationSpec.scala +++ b/core/src/test/scala/org/apache/spark/sql/BaseTestGenerationSpec.scala @@ -1,6 +1,6 @@ /* * - * Copyright 2017 PingCAP, Inc. + * Copyright 2019 PingCAP, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -19,14 +19,17 @@ package org.apache.spark.sql trait BaseTestGenerationSpec { - protected val rowCount: Int = 50 + protected val rowCount: Int - protected def getTableName(dataType: String): String = s"test_$dataType" + protected val preDescription: String = "Generating Data for " - protected def getTableName(dataType: String, desc: String): String = s"test_${desc}_$dataType" + def getTableName(dataTypes: String*): String - protected def getColumnName(dataType: String): String = s"col_$dataType" + def getTableNameWithDesc(desc: String, dataTypes: String*): String - protected def getIndexName(dataType: String): String = s"idx_$dataType" + def getColumnName(dataType: String): String = s"col_$dataType" + + def getIndexName(dataTypes: String*): String = + s"idx_${dataTypes.map(getColumnName).mkString("_")}" } diff --git a/core/src/test/scala/org/apache/spark/sql/BaseTiSparkTest.scala b/core/src/test/scala/org/apache/spark/sql/BaseTiSparkTest.scala index 23ba77fe7e..94ec6a73a8 100644 --- a/core/src/test/scala/org/apache/spark/sql/BaseTiSparkTest.scala +++ b/core/src/test/scala/org/apache/spark/sql/BaseTiSparkTest.scala @@ -29,7 +29,7 @@ import org.apache.spark.sql.test.SharedSQLContext import scala.collection.mutable.ArrayBuffer -class BaseTiSparkTest extends QueryTest with SharedSQLContext with BaseTestGenerationSpec { +class BaseTiSparkTest extends QueryTest with SharedSQLContext { protected var tidbStmt: Statement = _ @@ -360,7 +360,9 @@ class BaseTiSparkTest extends QueryTest with SharedSQLContext with BaseTestGener try { r1 = queryViaTiSpark(qSpark) } catch { - case e: Throwable => fail(e) + case e: Throwable => + logger.error(s"TiSpark failed when executing: $qSpark") + fail(e) } } @@ -378,7 +380,7 @@ class BaseTiSparkTest extends QueryTest with SharedSQLContext with BaseTestGener r2 = queryViaTiSpark(qJDBC) } catch { case e: Throwable => - logger.warn(s"Spark with JDBC failed when executing:$qJDBC", e) // JDBC failed + logger.warn(s"Spark with JDBC failed when executing: $qJDBC", e) // JDBC failed } } @@ -387,7 +389,7 @@ class BaseTiSparkTest extends QueryTest with SharedSQLContext with BaseTestGener try { r3 = queryTiDBViaJDBC(qSpark) } catch { - case e: Throwable => logger.warn(s"TiDB failed when executing:$qSpark", e) // TiDB failed + case e: Throwable => logger.warn(s"TiDB failed when executing: $qSpark", e) // TiDB failed } } if (skipTiDB || !compSqlResult(qSpark, r1, r3, checkLimit)) { diff --git a/core/src/test/scala/org/apache/spark/sql/MysqlDataTypeSuite.scala b/core/src/test/scala/org/apache/spark/sql/MysqlDataTypeSuite.scala index 0997b299e2..8cb8f7246d 100644 --- a/core/src/test/scala/org/apache/spark/sql/MysqlDataTypeSuite.scala +++ b/core/src/test/scala/org/apache/spark/sql/MysqlDataTypeSuite.scala @@ -1,4 +1,5 @@ /* + * * Copyright 2019 PingCAP, Inc. 
* * Licensed under the Apache License, Version 2.0 (the "License"); @@ -11,6 +12,7 @@ * distributed under the License is distributed on an "AS IS" BASIS, * See the License for the specific language governing permissions and * limitations under the License. + * */ package org.apache.spark.sql diff --git a/core/src/test/scala/org/apache/spark/sql/TiSparkTestSpec.scala b/core/src/test/scala/org/apache/spark/sql/TiSparkTestSpec.scala new file mode 100644 index 0000000000..cf41dff643 --- /dev/null +++ b/core/src/test/scala/org/apache/spark/sql/TiSparkTestSpec.scala @@ -0,0 +1,29 @@ +/* + * + * Copyright 2019 PingCAP, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * See the License for the specific language governing permissions and + * limitations under the License. + * + */ + +package org.apache.spark.sql + +import scala.util.Random + +trait TiSparkTestSpec { + val database: String + val testDesc: String + // Randomizer for tests + val r: Random = new Random(1234) + + def test(): Unit +} diff --git a/core/src/test/scala/org/apache/spark/sql/test/SharedSQLContext.scala b/core/src/test/scala/org/apache/spark/sql/test/SharedSQLContext.scala index facd28ea84..d74f67cf0a 100644 --- a/core/src/test/scala/org/apache/spark/sql/test/SharedSQLContext.scala +++ b/core/src/test/scala/org/apache/spark/sql/test/SharedSQLContext.scala @@ -26,7 +26,6 @@ import com.pingcap.tispark.TiDBUtils import com.pingcap.tispark.statistics.StatisticsManager import org.apache.spark.internal.Logging import org.apache.spark.sql._ -import org.apache.spark.sql.catalyst.util.resourceToString import org.apache.spark.sql.test.TestConstants._ import org.apache.spark.sql.test.Utils._ import org.apache.spark.{SparkConf, SparkContext, SparkFunSuite} diff --git a/core/src/test/scala/org/apache/spark/sql/test/generator/Data.scala b/core/src/test/scala/org/apache/spark/sql/test/generator/Data.scala index df3779a679..a91974790c 100644 --- a/core/src/test/scala/org/apache/spark/sql/test/generator/Data.scala +++ b/core/src/test/scala/org/apache/spark/sql/test/generator/Data.scala @@ -1,4 +1,5 @@ /* + * * Copyright 2019 PingCAP, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); @@ -11,6 +12,7 @@ * distributed under the License is distributed on an "AS IS" BASIS, * See the License for the specific language governing permissions and * limitations under the License. + * */ package org.apache.spark.sql.test.generator diff --git a/core/src/test/scala/org/apache/spark/sql/test/generator/DataType.scala b/core/src/test/scala/org/apache/spark/sql/test/generator/DataType.scala index d118f17f46..b78d47fe7f 100644 --- a/core/src/test/scala/org/apache/spark/sql/test/generator/DataType.scala +++ b/core/src/test/scala/org/apache/spark/sql/test/generator/DataType.scala @@ -1,4 +1,5 @@ /* + * * Copyright 2019 PingCAP, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); @@ -11,6 +12,7 @@ * distributed under the License is distributed on an "AS IS" BASIS, * See the License for the specific language governing permissions and * limitations under the License. 
+ * */ package org.apache.spark.sql.test.generator diff --git a/core/src/test/scala/org/apache/spark/sql/test/generator/Index.scala b/core/src/test/scala/org/apache/spark/sql/test/generator/Index.scala index e8287c9810..81a7c265ae 100644 --- a/core/src/test/scala/org/apache/spark/sql/test/generator/Index.scala +++ b/core/src/test/scala/org/apache/spark/sql/test/generator/Index.scala @@ -1,4 +1,5 @@ /* + * * Copyright 2019 PingCAP, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); @@ -11,6 +12,7 @@ * distributed under the License is distributed on an "AS IS" BASIS, * See the License for the specific language governing permissions and * limitations under the License. + * */ package org.apache.spark.sql.test.generator diff --git a/core/src/test/scala/org/apache/spark/sql/test/generator/IndexColumn.scala b/core/src/test/scala/org/apache/spark/sql/test/generator/IndexColumn.scala index a1f414b794..815f246244 100644 --- a/core/src/test/scala/org/apache/spark/sql/test/generator/IndexColumn.scala +++ b/core/src/test/scala/org/apache/spark/sql/test/generator/IndexColumn.scala @@ -1,4 +1,5 @@ /* + * * Copyright 2019 PingCAP, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); @@ -11,6 +12,7 @@ * distributed under the License is distributed on an "AS IS" BASIS, * See the License for the specific language governing permissions and * limitations under the License. + * */ package org.apache.spark.sql.test.generator @@ -54,14 +56,15 @@ case class ColumnInfo(columnName: String, } } - private val baseType = getBaseType(dataType) - - private val (len, decimal): (Long, Int) = if (length._1 == null) { - (getLength(baseType), getDecimal(baseType)) - } else if (length._2 == null) { - (length._1.toLong, getDecimal(baseType)) - } else { - (length._1.toLong, length._2) + val (len, decimal): (Long, Int) = { + val baseType = getBaseType(dataType) + if (length._1 == null) { + (getLength(baseType), getDecimal(baseType)) + } else if (length._2 == null) { + (length._1.toLong, getDecimal(baseType)) + } else { + (length._1.toLong, length._2) + } } { diff --git a/core/src/test/scala/org/apache/spark/sql/test/generator/Schema.scala b/core/src/test/scala/org/apache/spark/sql/test/generator/Schema.scala index 33219056e2..f267444b0e 100644 --- a/core/src/test/scala/org/apache/spark/sql/test/generator/Schema.scala +++ b/core/src/test/scala/org/apache/spark/sql/test/generator/Schema.scala @@ -1,4 +1,5 @@ /* + * * Copyright 2019 PingCAP, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); @@ -11,6 +12,7 @@ * distributed under the License is distributed on an "AS IS" BASIS, * See the License for the specific language governing permissions and * limitations under the License. + * */ package org.apache.spark.sql.test.generator diff --git a/core/src/test/scala/org/apache/spark/sql/test/generator/TestDataGenerator.scala b/core/src/test/scala/org/apache/spark/sql/test/generator/TestDataGenerator.scala index 6c543444f0..e21c17a9fb 100644 --- a/core/src/test/scala/org/apache/spark/sql/test/generator/TestDataGenerator.scala +++ b/core/src/test/scala/org/apache/spark/sql/test/generator/TestDataGenerator.scala @@ -1,4 +1,5 @@ /* + * * Copyright 2019 PingCAP, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); @@ -11,6 +12,7 @@ * distributed under the License is distributed on an "AS IS" BASIS, * See the License for the specific language governing permissions and * limitations under the License. 
+ * */ package org.apache.spark.sql.test.generator @@ -268,29 +270,82 @@ object TestDataGenerator { offset: Int, r: Random, valueGenerator: ValueGenerator): Unit = { - if (valueGenerator.isPrimaryKey) { - assert(!valueGenerator.nullable, "Generate fails: Value cannot be null for primary key") - val value = valueGenerator.randomUniqueValue(r) - row.set(offset, valueGenerator.tiDataType, value) + val value = valueGenerator.next(r) + if (value == null) { + row.setNull(offset) } else { - if (valueGenerator.randomNull(r)) { - row.setNull(offset) - } else { - val value = valueGenerator.randomValue(r) - row.set(offset, valueGenerator.tiDataType, value) + row.set(offset, valueGenerator.tiDataType, value) + } + } + + def hash(value: Any): String = value match { + case null => "null" + case b: Array[Byte] => b.mkString("[", ",", "]") + case list: List[Any] => + val ret = StringBuilder.newBuilder + ret ++= "(" + for (i <- list.indices) { + if (i > 0) ret ++= "," + ret ++= hash(list(i)) } + ret ++= ")" + ret.toString + case x => x.toString + } + + def checkUnique(value: Any, set: mutable.Set[Any]): Boolean = { + val hashedValue = hash(value) + if (!set.apply(hashedValue)) { + set += hashedValue + true + } else { + false } } - private def generateRandomRows(schema: Schema, n: Long, r: Random): List[TiRow] = { - (1.toLong to n).map { _ => - val length = schema.columnInfo.length - val row: TiRow = ObjectRowImpl.create(length) + private def generateRandomRow(schema: Schema, + r: Random, + pkOffset: List[Int], + set: mutable.Set[Any]): TiRow = { + val length = schema.columnInfo.length + val row: TiRow = ObjectRowImpl.create(length) + while (true) { for (i <- schema.columnInfo.indices) { val columnInfo = schema.columnInfo(i) generateRandomValue(row, i, r, columnInfo.generator) } - row + if (pkOffset.nonEmpty) { + val value = pkOffset.map { i => + row.get(i, schema.columnInfo(i).generator.tiDataType) + } + if (checkUnique(value, set)) { + return row + } + } else { + return row + } + } + throw new RuntimeException("Inaccessible") + } + + private def generateRandomRows(schema: Schema, n: Long, r: Random): List[TiRow] = { + val set: mutable.Set[Any] = mutable.HashSet.empty[Any] + // offset of pk columns + val pkOffset: List[Int] = { + val primary = schema.indexInfo.filter(_.isPrimary) + if (primary.nonEmpty && primary.size == 1) { + primary.head.indexColumns.map(x => schema.columnNames.indexOf(x.column)) + } else { + List.empty[Int] + } + } + schema.columnInfo.foreach { col => + col.generator.reset() + col.generator.preGenerateRandomValues(r, n) + } + + (1.toLong to n).map { _ => + generateRandomRow(schema, r, pkOffset, set) }.toList } diff --git a/core/src/test/scala/org/apache/spark/sql/test/generator/ValueGenerator.scala b/core/src/test/scala/org/apache/spark/sql/test/generator/ValueGenerator.scala index f7a290f1fc..7af9bd6935 100644 --- a/core/src/test/scala/org/apache/spark/sql/test/generator/ValueGenerator.scala +++ b/core/src/test/scala/org/apache/spark/sql/test/generator/ValueGenerator.scala @@ -1,4 +1,5 @@ /* + * * Copyright 2019 PingCAP, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); @@ -11,12 +12,13 @@ * distributed under the License is distributed on an "AS IS" BASIS, * See the License for the specific language governing permissions and * limitations under the License. 
+ * */ package org.apache.spark.sql.test.generator import org.apache.spark.sql.test.generator.DataType._ -import org.apache.spark.sql.test.generator.TestDataGenerator.{getLength, isNumeric} +import org.apache.spark.sql.test.generator.TestDataGenerator.{checkUnique, getLength, isNumeric, isStringType} import scala.collection.mutable import scala.util.Random @@ -52,23 +54,76 @@ case class ValueGenerator(dataType: ReflectedDataType, import com.pingcap.tikv.meta.Collation._ val tiDataType: TiDataType = getType(dataType, flag, M, D, "", DEF_COLLATION_CODE) + val rangeSize: Long = dataType match { + case BIT => 1 << tiDataType.getLength.toInt + case BOOLEAN => 1 << 1 + case TINYINT => 1 << 8 + case SMALLINT => 1 << 16 + case MEDIUMINT => 1 << 24 + case INT => 1L << 32 + // just treat the range size as infinity, the value is meaningless + case _ => Long.MaxValue + } + + private var generatedRandomValues: List[Any] = List.empty[Any] + private var curPos = 0 + + ////////////////// Calculate Type Bound ////////////////// + private val lowerBound: Any = { + if (tiDataType.isUnsigned) { + dataType match { + case TINYINT | SMALLINT | MEDIUMINT | INT | BIGINT => 0L + case _ => null + } + } else { + dataType match { + case TINYINT | SMALLINT | MEDIUMINT | INT | BIGINT => tiDataType.signedLowerBound() + case _ => null + } + } + } + + private val upperBound: Any = { + if (tiDataType.isUnsigned) { + dataType match { + case TINYINT | SMALLINT | MEDIUMINT | INT => tiDataType.unsignedUpperBound() + case BIGINT => toUnsignedBigInt(tiDataType.unsignedUpperBound()) + case _ => null + } + } else { + dataType match { + case TINYINT | SMALLINT | MEDIUMINT | INT | BIGINT => tiDataType.signedUpperBound() + case _ => null + } + } + } + + private val specialBound: List[String] = { + val list: List[String] = dataType match { + case BIT => List("b\'\'", "\'\'") + case TINYINT | SMALLINT | MEDIUMINT | INT | BIGINT if !tiDataType.isUnsigned => List("-1") + case _ if isStringType(dataType) => List("") + case _ => List.empty[String] + } + if (lowerBound != null && upperBound != null) { + list ::: List(lowerBound.toString, upperBound.toString) + } else { + list + } + } + + def toUnsignedBigInt(l: Long): BigInt = BigInt.long2bigInt(l) - BigInt.long2bigInt(Long.MinValue) + + ////////////////// Generate Random Value ////////////////// def randomNull(r: Random): Boolean = { // 5% of non-null data be null !tiDataType.isNotNull && r.nextInt(20) == 0 } - val set: mutable.Set[Any] = mutable.HashSet.empty[Any] - - def randomUniqueValue(r: Random): Any = { + def randomUniqueValue(r: Random, set: mutable.Set[Any]): Any = { while (true) { val value = randomValue(r) - val hashedValue = value match { - case null => "null" - case b: Array[Byte] => b.mkString("[", ",", "]") - case x => x.toString - } - if (!set.apply(hashedValue)) { - set += hashedValue + if (checkUnique(value, set)) { return value } } @@ -88,7 +143,7 @@ case class ValueGenerator(dataType: ReflectedDataType, case SMALLINT => r.nextInt(1 << 16) case MEDIUMINT => r.nextInt(1 << 24) case INT => r.nextInt() + (1L << 31) - case BIGINT => BigInt.long2bigInt(r.nextLong()) - BigInt.long2bigInt(Long.MinValue) + case BIGINT => toUnsignedBigInt(r.nextLong()) case FLOAT => Math.abs(r.nextFloat()) case DOUBLE => Math.abs(r.nextDouble()) case DECIMAL => @@ -151,6 +206,62 @@ case class ValueGenerator(dataType: ReflectedDataType, b } + // pre-generate n random values + def preGenerateRandomValues(r: Random, n: Long): Unit = { + if (n <= 1e6) { + generatedRandomValues = if (isPrimaryKey) { + 
val set: mutable.Set[Any] = mutable.HashSet.empty[Any] + set += specialBound + (0L until n - specialBound.size).map { _ => + randomUniqueValue(r, set) + }.toList ++ specialBound + } else { + (0L until n - specialBound.size).map { _ => + randomValue(r) + }.toList ++ specialBound + } + assert(generatedRandomValues.size == n) + curPos = 0 + } + } + + ////////////////// Iterator ////////////////// + def next(r: Random): Any = { + if (randomNull(r)) { + null + } else { + if (generatedRandomValues.isEmpty) { + if (isPrimaryKey) { + val set: mutable.Set[Any] = mutable.HashSet.empty[Any] + randomUniqueValue(r, set) + } else { + randomValue(r) + } + } else { + next + } + } + } + + def hasNext: Boolean = curPos < generatedRandomValues.size + + def next: Any = { + assert( + generatedRandomValues.nonEmpty, + "Values not pre-generated, please generate values first to use next()" + ) + assert( + hasNext, + s"Generated random values(${generatedRandomValues.size}) is less than needed(${curPos + 1})." + ) + curPos += 1 + generatedRandomValues(curPos - 1) + } + + def reset(): Unit = { + curPos = 0 + } + ////////////////// To Description String ////////////////// private val typeDescString: String = dataType match { case BOOLEAN => "" diff --git a/core/src/test/scala/org/apache/spark/sql/types/BaseGenerateDataType.scala b/core/src/test/scala/org/apache/spark/sql/types/BaseDataTypeGenerator.scala similarity index 71% rename from core/src/test/scala/org/apache/spark/sql/types/BaseGenerateDataType.scala rename to core/src/test/scala/org/apache/spark/sql/types/BaseDataTypeGenerator.scala index b528691435..aa1a46e61b 100644 --- a/core/src/test/scala/org/apache/spark/sql/types/BaseGenerateDataType.scala +++ b/core/src/test/scala/org/apache/spark/sql/types/BaseDataTypeGenerator.scala @@ -1,6 +1,6 @@ /* * - * Copyright 2017 PingCAP, Inc. + * Copyright 2019 PingCAP, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -20,11 +20,11 @@ package org.apache.spark.sql.types import org.apache.spark.sql.BaseTiSparkTest import org.apache.spark.sql.test.generator.DataType.ReflectedDataType -case class BaseGenerateDataType(dataTypes: List[ReflectedDataType], - unsignedDataTypes: List[ReflectedDataType], - dataTypeTestDir: String, - database: String, - testDesc: String) +case class BaseDataTypeGenerator(dataTypes: List[ReflectedDataType], + unsignedDataTypes: List[ReflectedDataType], + dataTypeTestDir: String, + database: String, + testDesc: String) extends BaseTiSparkTest with GenerateUnitDataTypeTestAction { def loadTestData(typeName: String): Unit = { @@ -34,6 +34,6 @@ case class BaseGenerateDataType(dataTypes: List[ReflectedDataType], def loadUnsignedTestData(typeName: String): Unit = { logger.info(s"${preDescription}Test $extraDesc $typeName - $testDesc") - loadSQLFile(dataTypeTestDir, getTableName(typeName, extraDesc)) + loadSQLFile(dataTypeTestDir, getTableNameWithDesc(extraDesc, typeName)) } } diff --git a/core/src/test/scala/org/apache/spark/sql/types/BaseDataTypeTest.scala b/core/src/test/scala/org/apache/spark/sql/types/BaseDataTypeTest.scala index c0aee58f25..cca945b84f 100644 --- a/core/src/test/scala/org/apache/spark/sql/types/BaseDataTypeTest.scala +++ b/core/src/test/scala/org/apache/spark/sql/types/BaseDataTypeTest.scala @@ -1,19 +1,44 @@ +/* + * + * Copyright 2019 PingCAP, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * See the License for the specific language governing permissions and + * limitations under the License. + * + */ + package org.apache.spark.sql.types -import org.apache.spark.sql.BaseTiSparkTest +import org.apache.spark.sql.{BaseTestGenerationSpec, BaseTiSparkTest} +import org.apache.spark.sql.test.SharedSQLContext trait BaseDataTypeTest extends BaseTiSparkTest { + + protected val generator: BaseTestGenerationSpec + def simpleSelect(dbName: String, dataType: String): Unit = { setCurrentDatabase(dbName) - val tblName = getTableName(dataType) - val query = s"select ${getColumnName(dataType)} from $tblName" + val tblName = generator.getTableName(dataType) + val query = s"select ${generator.getColumnName(dataType)} from $tblName" runTest(query) } def simpleSelect(dbName: String, dataType: String, desc: String): Unit = { setCurrentDatabase(dbName) - val tblName = getTableName(dataType, desc) - val query = s"select ${getColumnName(dataType)} from $tblName" + val tblName = generator.getTableNameWithDesc(desc, dataType) + val query = s"select ${generator.getColumnName(dataType)} from $tblName" + println(query) runTest(query) } + + // initialize test framework + SharedSQLContext.init() } diff --git a/core/src/test/scala/org/apache/spark/sql/types/BaseMultiColumnDataTypeGenerator.scala b/core/src/test/scala/org/apache/spark/sql/types/BaseMultiColumnDataTypeGenerator.scala new file mode 100644 index 0000000000..ef638906f3 --- /dev/null +++ b/core/src/test/scala/org/apache/spark/sql/types/BaseMultiColumnDataTypeGenerator.scala @@ -0,0 +1,35 @@ +/* + * + * Copyright 2019 PingCAP, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * See the License for the specific language governing permissions and + * limitations under the License. + * + */ + +package org.apache.spark.sql.types + +import org.apache.spark.sql.BaseTiSparkTest +import org.apache.spark.sql.test.generator.DataType.{getTypeName, ReflectedDataType} + +case class BaseMultiColumnDataTypeGenerator(dataTypes: List[ReflectedDataType], + unsignedDataTypes: List[ReflectedDataType], + dataTypeTestDir: String, + database: String, + testDesc: String) + extends BaseTiSparkTest + with GenerateMultiColumnDataTypeTestAction { + def loadTestData(dataTypes: List[ReflectedDataType]): Unit = { + val tableName = getTableName(dataTypes.map(getTypeName): _*) + logger.info(s"${preDescription}Test $tableName - $testDesc") + loadSQLFile(dataTypeTestDir, tableName) + } +} diff --git a/core/src/test/scala/org/apache/spark/sql/types/DataTypeExampleTest.scala b/core/src/test/scala/org/apache/spark/sql/types/DataTypeExampleTest.scala index 155b89eca5..a8606061e9 100644 --- a/core/src/test/scala/org/apache/spark/sql/types/DataTypeExampleTest.scala +++ b/core/src/test/scala/org/apache/spark/sql/types/DataTypeExampleTest.scala @@ -1,6 +1,6 @@ /* * - * Copyright 2017 PingCAP, Inc. + * Copyright 2019 PingCAP, Inc. 
* * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -17,7 +17,6 @@ package org.apache.spark.sql.types -import org.apache.spark.sql.test.SharedSQLContext import org.apache.spark.sql.test.generator.DataType.{ReflectedDataType, TINYINT} class DataTypeExampleTest extends BaseDataTypeTest with RunUnitDataTypeTestAction { @@ -27,22 +26,24 @@ class DataTypeExampleTest extends BaseDataTypeTest with RunUnitDataTypeTestActio val database: String = "data_type_test_example" val testDesc: String = "Base test for data types" + override lazy protected val generator = + BaseDataTypeGenerator(dataTypes, unsignedDataTypes, dataTypeTestDir, database, testDesc) + def startTest(typeName: String): Unit = { - test(s"${preDescription}Test $typeName - $testDesc") { + test(s"Test $typeName - $testDesc") { simpleSelect(database, typeName) } } def startUnsignedTest(typeName: String): Unit = { - test(s"${preDescription}Test $extraDesc $typeName - $testDesc") { + test(s"Test $extraDesc $typeName - $testDesc") { simpleSelect(database, typeName, extraDesc) } } def check(): Unit = { - SharedSQLContext.init() if (generateData) { - BaseGenerateDataType(dataTypes, unsignedDataTypes, dataTypeTestDir, database, testDesc).test() + generator.test() } } diff --git a/core/src/test/scala/org/apache/spark/sql/types/DataTypeNormalSuite.scala b/core/src/test/scala/org/apache/spark/sql/types/DataTypeNormalSuite.scala index 799bcf3e25..396c50744d 100644 --- a/core/src/test/scala/org/apache/spark/sql/types/DataTypeNormalSuite.scala +++ b/core/src/test/scala/org/apache/spark/sql/types/DataTypeNormalSuite.scala @@ -1,4 +1,5 @@ /* + * * Copyright 2019 PingCAP, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); @@ -11,11 +12,11 @@ * distributed under the License is distributed on an "AS IS" BASIS, * See the License for the specific language governing permissions and * limitations under the License. + * */ package org.apache.spark.sql.types -import org.apache.spark.sql.test.SharedSQLContext import org.apache.spark.sql.test.generator.DataType._ import org.apache.spark.sql.test.generator.TestDataGenerator._ @@ -26,22 +27,24 @@ class DataTypeNormalSuite extends BaseDataTypeTest with RunUnitDataTypeTestActio override val database = "data_type_test" override val testDesc = "Test for single column data types (and unsigned types)" + override lazy protected val generator = + BaseDataTypeGenerator(dataTypes, unsignedDataTypes, dataTypeTestDir, database, testDesc) + def startTest(typeName: String): Unit = { - test(s"${preDescription}Test $typeName - $testDesc") { + test(s"Test $typeName - $testDesc") { simpleSelect(database, typeName) } } def startUnsignedTest(typeName: String): Unit = { - test(s"${preDescription}Test $extraDesc $typeName - $testDesc") { + test(s"Test $extraDesc $typeName - $testDesc") { simpleSelect(database, typeName, extraDesc) } } def check(): Unit = { - SharedSQLContext.init() if (generateData) { - BaseGenerateDataType(dataTypes, unsignedDataTypes, dataTypeTestDir, database, testDesc).test() + generator.test() } } diff --git a/core/src/test/scala/org/apache/spark/sql/types/GenerateMultiColumnDataTypeTestAction.scala b/core/src/test/scala/org/apache/spark/sql/types/GenerateMultiColumnDataTypeTestAction.scala new file mode 100644 index 0000000000..6ccb3f79fa --- /dev/null +++ b/core/src/test/scala/org/apache/spark/sql/types/GenerateMultiColumnDataTypeTestAction.scala @@ -0,0 +1,79 @@ +/* + * + * Copyright 2019 PingCAP, Inc. 
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * See the License for the specific language governing permissions and + * limitations under the License. + * + */ + +package org.apache.spark.sql.types + +import org.apache.spark.sql.BaseTestGenerationSpec +import org.apache.spark.sql.test.generator.DataType.{getBaseType, getTypeName, DECIMAL, ReflectedDataType} +import org.apache.spark.sql.test.generator.{Data, Index, Schema} +import org.apache.spark.sql.test.generator.TestDataGenerator.{getDecimal, getLength, isCharOrBinary, isVarString, randomDataGenerator, schemaGenerator} + +trait GenerateMultiColumnDataTypeTestAction + extends MultiColumnDataTypeTestSpec + with BaseTestGenerationSpec { + + override val rowCount = 50 + + private def toString(dataTypes: Seq[String]): String = dataTypes.hashCode().toString + + override def getTableName(dataTypes: String*): String = s"test_${toString(dataTypes)}" + + override def getTableNameWithDesc(desc: String, dataTypes: String*): String = + s"test_${desc}_${toString(dataTypes)}" + + def genSchema(tableName: String, + dataTypesWithDescription: List[(ReflectedDataType, String, String)]): Schema = { + schemaGenerator( + database, + tableName, + r, + dataTypesWithDescription, + List.empty[Index] + ) + } + + def genData(schema: Schema): Data = randomDataGenerator(schema, rowCount, dataTypeTestDir, r) + + def genLen(dataType: ReflectedDataType): String = { + val baseType = getBaseType(dataType) + val length = getLength(baseType) + dataType match { + case DECIMAL => s"$length,${getDecimal(baseType)}" + case _ if isVarString(dataType) => s"$length" + case _ if isCharOrBinary(dataType) => "10" + case _ => "" + } + } + + def init(): Unit = { + val tableName = getTableName(dataTypes.map(getTypeName): _*) + val dataTypesWithDescription = dataTypes.map { dataType => + val len = genLen(dataType) + (dataType, len, "") + } + val schema = genSchema(tableName, dataTypesWithDescription) + val data = genData(schema) + data.save() + } + + def loadTestData(dataTypes: List[ReflectedDataType]): Unit + + def test(): Unit = { + init() + loadTestData(dataTypes) + } +} diff --git a/core/src/test/scala/org/apache/spark/sql/types/GenerateUnitDataTypeTestAction.scala b/core/src/test/scala/org/apache/spark/sql/types/GenerateUnitDataTypeTestAction.scala index 67f235e8f4..e6ca39a58b 100644 --- a/core/src/test/scala/org/apache/spark/sql/types/GenerateUnitDataTypeTestAction.scala +++ b/core/src/test/scala/org/apache/spark/sql/types/GenerateUnitDataTypeTestAction.scala @@ -1,6 +1,6 @@ /* * - * Copyright 2017 PingCAP, Inc. + * Copyright 2019 PingCAP, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -22,9 +22,21 @@ import org.apache.spark.sql.test.generator.{Data, Index, Schema} import org.apache.spark.sql.test.generator.DataType.{getBaseType, getTypeName, DECIMAL, ReflectedDataType} import org.apache.spark.sql.test.generator.TestDataGenerator.{getDecimal, getLength, isCharOrBinary, isVarString, randomDataGenerator, schemaGenerator} -trait GenerateUnitDataTypeTestAction extends UnitDataTypeTestAction with BaseTestGenerationSpec { +trait GenerateUnitDataTypeTestAction extends UnitDataTypeTestSpec with BaseTestGenerationSpec { - override val preDescription: String = "Generating Data for " + override val rowCount = 50 + + private def toString(dataTypes: Seq[String]) = { + assert(dataTypes.size == 1, "Unit data type tests can not manage multiple columns") + dataTypes.mkString("_") + } + + override def getTableName(dataTypes: String*): String = s"test_${toString(dataTypes)}" + + override def getTableNameWithDesc(desc: String, dataTypes: String*): String = + s"test_${desc}_${toString(dataTypes)}" + + override def getIndexName(dataTypes: String*): String = s"idx_${toString(dataTypes)}" def genSchema(dataType: ReflectedDataType, tableName: String, @@ -66,7 +78,7 @@ trait GenerateUnitDataTypeTestAction extends UnitDataTypeTestAction with BaseTes for (dataType <- unsignedDataTypes) { val typeName = getTypeName(dataType) val len = genLen(dataType) - val tableName = getTableName(typeName, extraDesc) + val tableName = getTableNameWithDesc(extraDesc, typeName) val schema = genSchema(dataType, tableName, len, extraDesc) val data = genData(schema) data.save() diff --git a/core/src/test/scala/org/apache/spark/sql/types/MultiColumnDataTypeSuite.scala b/core/src/test/scala/org/apache/spark/sql/types/MultiColumnDataTypeSuite.scala new file mode 100644 index 0000000000..c24e5220aa --- /dev/null +++ b/core/src/test/scala/org/apache/spark/sql/types/MultiColumnDataTypeSuite.scala @@ -0,0 +1,52 @@ +/* + * + * Copyright 2019 PingCAP, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * See the License for the specific language governing permissions and + * limitations under the License. 
+ * + */ + +package org.apache.spark.sql.types + +import org.apache.spark.sql.test.generator.DataType._ +import org.apache.spark.sql.test.generator.TestDataGenerator._ + +class MultiColumnDataTypeSuite + extends MultiColumnDataTypeTest + with RunMultiColumnDataTypeTestAction { + val dataTypes: List[ReflectedDataType] = numeric ::: stringType + val unsignedDataTypes: List[ReflectedDataType] = numeric + val dataTypeTestDir: String = "multi-column-dataType-test" + val database: String = "multi_column_data_type_test" + val testDesc: String = "Base test for multi-column data types" + + override val generator = BaseMultiColumnDataTypeGenerator( + dataTypes, + unsignedDataTypes, + dataTypeTestDir, + database, + testDesc + ) + + def startTest(dataTypes: List[ReflectedDataType]): Unit = { + simpleSelect(database, dataTypes: _*) + } + + def check(): Unit = { + if (generateData) { + generator.test() + } + } + + check() + test() +} diff --git a/core/src/test/scala/org/apache/spark/sql/types/MultiColumnDataTypeTest.scala b/core/src/test/scala/org/apache/spark/sql/types/MultiColumnDataTypeTest.scala new file mode 100644 index 0000000000..af4cef6e6f --- /dev/null +++ b/core/src/test/scala/org/apache/spark/sql/types/MultiColumnDataTypeTest.scala @@ -0,0 +1,76 @@ +/* + * + * Copyright 2019 PingCAP, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * See the License for the specific language governing permissions and + * limitations under the License. 
+ * + */ + +package org.apache.spark.sql.types + +import org.apache.spark.sql.{BaseTestGenerationSpec, BaseTiSparkTest} +import org.apache.spark.sql.test.SharedSQLContext +import org.apache.spark.sql.test.generator.DataType._ +import org.apache.spark.sql.test.generator.TestDataGenerator._ + +trait MultiColumnDataTypeTest extends BaseTiSparkTest { + + protected val generator: BaseTestGenerationSpec + + private val cmps: List[String] = List(">", "<") + private val eqs: List[String] = List("=", "<>") + + implicit class C[X](xs: Traversable[X]) { + def cross[Y](ys: Traversable[Y]): Traversable[(X, Y)] = for { x <- xs; y <- ys } yield (x, y) + } + + def getOperations(dataType: ReflectedDataType): List[(String, String)] = + List(("is", "null")) ++ { + (cmps ++ eqs) cross { + dataType match { + case TINYINT => List("1", "0") + case _ if isNumeric(dataType) => List("1", "2333") + case _ if isStringType(dataType) => List("\'PingCAP\'", "\'\'") + case _ => List.empty[String] + } + } + } ++ { + eqs cross { + dataType match { + case BOOLEAN => List("false", "true") + case _ => List.empty[String] + } + } + } + + def simpleSelect(dbName: String, dataTypes: ReflectedDataType*): Unit = { + val typeNames = dataTypes.map(getTypeName) + val tblName = generator.getTableName(typeNames: _*) + val columnNames = typeNames.map(generator.getColumnName) + for (i <- columnNames.indices) { + for (j <- i + 1 until columnNames.size) { + val col = columnNames(j) + val types = dataTypes(j) + for ((op, value) <- getOperations(types)) { + val query = s"select ${columnNames(i)} from $tblName where $col $op $value" + test(query) { + setCurrentDatabase(dbName) + runTest(query) + } + } + } + } + } + + // initialize test framework + SharedSQLContext.init() +} diff --git a/core/src/test/scala/org/apache/spark/sql/types/UnitDataTypeTestAction.scala b/core/src/test/scala/org/apache/spark/sql/types/MultiColumnDataTypeTestSpec.scala similarity index 67% rename from core/src/test/scala/org/apache/spark/sql/types/UnitDataTypeTestAction.scala rename to core/src/test/scala/org/apache/spark/sql/types/MultiColumnDataTypeTestSpec.scala index 894445d36b..033f88bc97 100644 --- a/core/src/test/scala/org/apache/spark/sql/types/UnitDataTypeTestAction.scala +++ b/core/src/test/scala/org/apache/spark/sql/types/MultiColumnDataTypeTestSpec.scala @@ -1,6 +1,6 @@ /* * - * Copyright 2017 PingCAP, Inc. + * Copyright 2019 PingCAP, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -17,20 +17,13 @@ package org.apache.spark.sql.types -import org.apache.spark.sql.test.generator.DataType.{ReflectedDataType, TINYINT} +import org.apache.spark.sql.TiSparkTestSpec +import org.apache.spark.sql.test.generator.DataType.ReflectedDataType -import scala.util.Random - -trait UnitDataTypeTestAction { +trait MultiColumnDataTypeTestSpec extends TiSparkTestSpec { val dataTypes: List[ReflectedDataType] val unsignedDataTypes: List[ReflectedDataType] val dataTypeTestDir: String - val database: String - val testDesc: String - // Randomizer for tests - val r: Random = new Random(1234) - val extraDesc = "unsigned" - val preDescription: String = "" - def test(): Unit + val extraDesc = "unsigned" } diff --git a/core/src/test/scala/org/apache/spark/sql/types/RunMultiColumnDataTypeTestAction.scala b/core/src/test/scala/org/apache/spark/sql/types/RunMultiColumnDataTypeTestAction.scala new file mode 100644 index 0000000000..1debe17094 --- /dev/null +++ b/core/src/test/scala/org/apache/spark/sql/types/RunMultiColumnDataTypeTestAction.scala @@ -0,0 +1,29 @@ +/* + * + * Copyright 2019 PingCAP, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * See the License for the specific language governing permissions and + * limitations under the License. + * + */ + +package org.apache.spark.sql.types + +import org.apache.spark.sql.test.generator.DataType.ReflectedDataType + +trait RunMultiColumnDataTypeTestAction extends MultiColumnDataTypeTestSpec { + + def startTest(dataTypes: List[ReflectedDataType]): Unit + + def test(): Unit = { + startTest(dataTypes) + } +} diff --git a/core/src/test/scala/org/apache/spark/sql/types/RunUnitDataTypeTestAction.scala b/core/src/test/scala/org/apache/spark/sql/types/RunUnitDataTypeTestAction.scala index ee67a4bde5..308cead24f 100644 --- a/core/src/test/scala/org/apache/spark/sql/types/RunUnitDataTypeTestAction.scala +++ b/core/src/test/scala/org/apache/spark/sql/types/RunUnitDataTypeTestAction.scala @@ -1,6 +1,6 @@ /* * - * Copyright 2017 PingCAP, Inc. + * Copyright 2019 PingCAP, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -19,7 +19,7 @@ package org.apache.spark.sql.types import org.apache.spark.sql.test.generator.DataType.getTypeName -trait RunUnitDataTypeTestAction extends UnitDataTypeTestAction { +trait RunUnitDataTypeTestAction extends UnitDataTypeTestSpec { def startTest(typeName: String): Unit diff --git a/core/src/test/scala/org/apache/spark/sql/types/SpecialTiDBTypeTestSuite.scala b/core/src/test/scala/org/apache/spark/sql/types/SpecialTiDBTypeTestSuite.scala index 87cb623650..c682960656 100644 --- a/core/src/test/scala/org/apache/spark/sql/types/SpecialTiDBTypeTestSuite.scala +++ b/core/src/test/scala/org/apache/spark/sql/types/SpecialTiDBTypeTestSuite.scala @@ -1,6 +1,6 @@ /* * - * Copyright 2017 PingCAP, Inc. + * Copyright 2019 PingCAP, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
diff --git a/core/src/test/scala/org/apache/spark/sql/types/UnitDataTypeTestSpec.scala b/core/src/test/scala/org/apache/spark/sql/types/UnitDataTypeTestSpec.scala new file mode 100644 index 0000000000..5d87907b70 --- /dev/null +++ b/core/src/test/scala/org/apache/spark/sql/types/UnitDataTypeTestSpec.scala @@ -0,0 +1,29 @@ +/* + * + * Copyright 2019 PingCAP, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * See the License for the specific language governing permissions and + * limitations under the License. + * + */ + +package org.apache.spark.sql.types + +import org.apache.spark.sql.TiSparkTestSpec +import org.apache.spark.sql.test.generator.DataType.ReflectedDataType + +trait UnitDataTypeTestSpec extends TiSparkTestSpec { + val dataTypes: List[ReflectedDataType] + val unsignedDataTypes: List[ReflectedDataType] + val dataTypeTestDir: String + + val extraDesc = "unsigned" +} diff --git a/core/src/test/scala/org/apache/spark/sql/types/DataTypePKGenerator.scala b/core/src/test/scala/org/apache/spark/sql/types/pk/DataTypePKGenerator.scala similarity index 83% rename from core/src/test/scala/org/apache/spark/sql/types/DataTypePKGenerator.scala rename to core/src/test/scala/org/apache/spark/sql/types/pk/DataTypePKGenerator.scala index 9630ef510a..01a0fc2a09 100644 --- a/core/src/test/scala/org/apache/spark/sql/types/DataTypePKGenerator.scala +++ b/core/src/test/scala/org/apache/spark/sql/types/pk/DataTypePKGenerator.scala @@ -1,6 +1,6 @@ /* * - * Copyright 2017 PingCAP, Inc. + * Copyright 2019 PingCAP, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -15,10 +15,11 @@ * */ -package org.apache.spark.sql.types +package org.apache.spark.sql.types.pk import org.apache.spark.sql.BaseTiSparkTest import org.apache.spark.sql.test.generator.DataType.ReflectedDataType +import org.apache.spark.sql.types.UnitDataTypeTestSpec case class DataTypePKGenerator(dataTypes: List[ReflectedDataType], unsignedDataTypes: List[ReflectedDataType], @@ -34,12 +35,12 @@ case class DataTypePKGenerator(dataTypes: List[ReflectedDataType], def loadUnsignedTestData(typeName: String): Unit = { logger.info(s"${preDescription}Test $extraDesc $typeName - $testDesc") - loadSQLFile(dataTypeTestDir, getTableName(typeName, extraDesc)) + loadSQLFile(dataTypeTestDir, getTableNameWithDesc(extraDesc, typeName)) } } object DataTypePKGenerator { - def apply(u: UnitDataTypeTestAction): GeneratePKDataTypeTestAction = { + def apply(u: UnitDataTypeTestSpec): GeneratePKDataTypeTestAction = { DataTypePKGenerator(u.dataTypes, u.unsignedDataTypes, u.dataTypeTestDir, u.database, u.testDesc) } } diff --git a/core/src/test/scala/org/apache/spark/sql/types/pk/DataTypePKSuite.scala b/core/src/test/scala/org/apache/spark/sql/types/pk/DataTypePKSuite.scala index cce1ddd265..fa992da135 100644 --- a/core/src/test/scala/org/apache/spark/sql/types/pk/DataTypePKSuite.scala +++ b/core/src/test/scala/org/apache/spark/sql/types/pk/DataTypePKSuite.scala @@ -1,4 +1,5 @@ /* + * * Copyright 2019 PingCAP, Inc. 
* * Licensed under the Apache License, Version 2.0 (the "License"); @@ -11,14 +12,14 @@ * distributed under the License is distributed on an "AS IS" BASIS, * See the License for the specific language governing permissions and * limitations under the License. + * */ package org.apache.spark.sql.types.pk -import org.apache.spark.sql.test.SharedSQLContext import org.apache.spark.sql.test.generator.DataType._ import org.apache.spark.sql.test.generator.TestDataGenerator._ -import org.apache.spark.sql.types.{BaseDataTypeTest, DataTypePKGenerator, RunUnitDataTypeTestAction} +import org.apache.spark.sql.types.{BaseDataTypeTest, RunUnitDataTypeTestAction} class DataTypePKSuite extends BaseDataTypeTest with RunUnitDataTypeTestAction { override val dataTypes: List[ReflectedDataType] = integers ::: decimals ::: doubles ::: stringType @@ -27,22 +28,24 @@ class DataTypePKSuite extends BaseDataTypeTest with RunUnitDataTypeTestAction { override val database = "data_type_test_pk" override val testDesc = "Test for single PK column data types (and unsigned types)" + override lazy protected val generator = + DataTypePKGenerator(dataTypes, unsignedDataTypes, dataTypeTestDir, database, testDesc) + def startTest(typeName: String): Unit = { - test(s"${preDescription}Test $typeName - $testDesc") { + test(s"Test $typeName - $testDesc") { simpleSelect(database, typeName) } } def startUnsignedTest(typeName: String): Unit = { - test(s"${preDescription}Test $extraDesc $typeName - $testDesc") { + test(s"Test $extraDesc $typeName - $testDesc") { simpleSelect(database, typeName, extraDesc) } } def check(): Unit = { - SharedSQLContext.init() if (generateData) { - DataTypePKGenerator.apply(this).test() + generator.test() } } diff --git a/core/src/test/scala/org/apache/spark/sql/types/GeneratePKDataTypeTestAction.scala b/core/src/test/scala/org/apache/spark/sql/types/pk/GeneratePKDataTypeTestAction.scala similarity index 88% rename from core/src/test/scala/org/apache/spark/sql/types/GeneratePKDataTypeTestAction.scala rename to core/src/test/scala/org/apache/spark/sql/types/pk/GeneratePKDataTypeTestAction.scala index 443d177612..8ba2968f60 100644 --- a/core/src/test/scala/org/apache/spark/sql/types/GeneratePKDataTypeTestAction.scala +++ b/core/src/test/scala/org/apache/spark/sql/types/pk/GeneratePKDataTypeTestAction.scala @@ -1,6 +1,6 @@ /* * - * Copyright 2017 PingCAP, Inc. + * Copyright 2019 PingCAP, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -15,11 +15,12 @@ * */ -package org.apache.spark.sql.types +package org.apache.spark.sql.types.pk import org.apache.spark.sql.test.generator.DataType.ReflectedDataType import org.apache.spark.sql.test.generator.TestDataGenerator.{isStringType, schemaGenerator} -import org.apache.spark.sql.test.generator.{DefaultColumn, Index, PrefixColumn, PrimaryKey, Schema} +import org.apache.spark.sql.test.generator._ +import org.apache.spark.sql.types.GenerateUnitDataTypeTestAction import scala.util.Random From 49ac6c52b1b6bb63c6a85845237ff66021f9da8a Mon Sep 17 00:00:00 2001 From: Liangliang Gu Date: Tue, 30 Jul 2019 16:06:27 +0800 Subject: [PATCH 05/62] fix range partition throw UnsupportedSyntaxException error (#960) --- .../spark/sql/PartitionTableSuite.scala | 28 +++++++++++++++++++ .../visitor/PrunedPartitionBuilder.java | 3 +- 2 files changed, 30 insertions(+), 1 deletion(-) diff --git a/core/src/test/scala/org/apache/spark/sql/PartitionTableSuite.scala b/core/src/test/scala/org/apache/spark/sql/PartitionTableSuite.scala index 60085cea21..1f758adb5b 100644 --- a/core/src/test/scala/org/apache/spark/sql/PartitionTableSuite.scala +++ b/core/src/test/scala/org/apache/spark/sql/PartitionTableSuite.scala @@ -21,6 +21,34 @@ import org.apache.spark.sql.execution.{CoprocessorRDD, RegionTaskExec} class PartitionTableSuite extends BaseTiSparkTest { def enablePartitionForTiDB(): Boolean = tidbStmt.execute("set @@tidb_enable_table_partition = 1") + test("test read from range partition and partition function (mod) is not supported by tispark") { + enablePartitionForTiDB() + tidbStmt.execute("DROP TABLE IF EXISTS `pt`") + tidbStmt.execute(""" + |CREATE TABLE `pt` ( + | `id` int(11) DEFAULT NULL, + | `name` varchar(50) DEFAULT NULL, + | `purchased` date DEFAULT NULL, + | index `idx_id`(`id`) + |) ENGINE=InnoDB DEFAULT CHARSET=utf8 COLLATE=utf8_bin + |PARTITION BY RANGE (mod(year(purchased), 4)) ( + | PARTITION p0 VALUES LESS THAN (1), + | PARTITION p1 VALUES LESS THAN (2), + | PARTITION p2 VALUES LESS THAN (3), + | PARTITION p3 VALUES LESS THAN (MAXVALUE) + |) + """.stripMargin) + + tidbStmt.execute("insert into `pt` values(1, 'name', '1995-10-10')") + refreshConnections() + + judge("select * from pt") + judge("select * from pt where name = 'name'") + judge("select * from pt where name != 'name'") + judge("select * from pt where purchased = date'1995-10-10'") + judge("select * from pt where purchased != date'1995-10-10'") + } + test("constant folding does not apply case") { enablePartitionForTiDB() tidbStmt.execute( diff --git a/tikv-client/src/main/java/com/pingcap/tikv/expression/visitor/PrunedPartitionBuilder.java b/tikv-client/src/main/java/com/pingcap/tikv/expression/visitor/PrunedPartitionBuilder.java index 7b390b834b..870afd8058 100644 --- a/tikv-client/src/main/java/com/pingcap/tikv/expression/visitor/PrunedPartitionBuilder.java +++ b/tikv-client/src/main/java/com/pingcap/tikv/expression/visitor/PrunedPartitionBuilder.java @@ -18,6 +18,7 @@ import com.google.common.collect.RangeSet; import com.google.common.collect.TreeRangeSet; import com.pingcap.tikv.exception.UnsupportedPartitionExprException; +import com.pingcap.tikv.exception.UnsupportedSyntaxException; import com.pingcap.tikv.expression.*; import com.pingcap.tikv.expression.ComparisonBinaryExpression.NormalizedPredicate; import com.pingcap.tikv.meta.TiPartitionDef; @@ -71,7 +72,7 @@ private static boolean canBePruned(TiTableInfo tblInfo, Expression filter) { try { partExprs = generateRangePartExprs(tblInfo); - } catch 
(UnsupportedPartitionExprException e) { + } catch (UnsupportedSyntaxException | UnsupportedPartitionExprException e) { return false; } From 0fb1b6fe5de47441653dcbfeecf7ba1325ce8770 Mon Sep 17 00:00:00 2001 From: Zhexuan Yang Date: Tue, 30 Jul 2019 19:47:47 +0800 Subject: [PATCH 06/62] fix view parsing problem (#953) --- .../org/apache/spark/sql/ViewTestSuite.scala | 24 ++++++++++++ .../com/pingcap/tikv/catalog/Catalog.java | 10 ++++- .../com/pingcap/tikv/meta/TiTableInfo.java | 13 ++++++- .../com/pingcap/tikv/meta/TiUserIdentity.java | 28 ++++++++++++++ .../com/pingcap/tikv/meta/TiViewInfo.java | 38 +++++++++++++++++++ .../com/pingcap/tikv/types/DecimalType.java | 3 +- .../java/com/pingcap/tikv/meta/MetaUtils.java | 15 +++++++- 7 files changed, 125 insertions(+), 6 deletions(-) create mode 100644 core/src/test/scala/org/apache/spark/sql/ViewTestSuite.scala create mode 100644 tikv-client/src/main/java/com/pingcap/tikv/meta/TiUserIdentity.java create mode 100644 tikv-client/src/main/java/com/pingcap/tikv/meta/TiViewInfo.java diff --git a/core/src/test/scala/org/apache/spark/sql/ViewTestSuite.scala b/core/src/test/scala/org/apache/spark/sql/ViewTestSuite.scala new file mode 100644 index 0000000000..b116534711 --- /dev/null +++ b/core/src/test/scala/org/apache/spark/sql/ViewTestSuite.scala @@ -0,0 +1,24 @@ +package org.apache.spark.sql + +class ViewTestSuite extends BaseTiSparkTest { + private val table = "test_view" + + test("Test View") { + tidbStmt.execute(s"drop table if exists $table") + try { + tidbStmt.execute("drop view if exists v") + } catch { + case _: Exception => cancel + } + + tidbStmt.execute(s"create table $table(qty INT, price INT);") + + tidbStmt.execute(s"INSERT INTO $table VALUES(3, 50);") + tidbStmt.execute(s"CREATE VIEW v AS SELECT qty, price, qty*price AS value FROM $table;") + + refreshConnections() + + judge(s"select * from $table") + intercept[AnalysisException](spark.sql("select * from v")) + } +} \ No newline at end of file diff --git a/tikv-client/src/main/java/com/pingcap/tikv/catalog/Catalog.java b/tikv-client/src/main/java/com/pingcap/tikv/catalog/Catalog.java index 30a6b7a56f..05179ab609 100644 --- a/tikv-client/src/main/java/com/pingcap/tikv/catalog/Catalog.java +++ b/tikv-client/src/main/java/com/pingcap/tikv/catalog/Catalog.java @@ -85,7 +85,9 @@ public List listTables(TiDBInfo db) { if (tableMap == null) { tableMap = loadTables(db); } - return ImmutableList.copyOf(tableMap.values()); + Collection tables = tableMap.values(); + tables.removeIf(TiTableInfo::isView); + return ImmutableList.copyOf(tables); } public TiTableInfo getTable(TiDBInfo db, String tableName) { @@ -93,7 +95,11 @@ public TiTableInfo getTable(TiDBInfo db, String tableName) { if (tableMap == null) { tableMap = loadTables(db); } - return tableMap.get(tableName.toLowerCase()); + TiTableInfo tbl = tableMap.get(tableName.toLowerCase()); + // https://github.com/pingcap/tispark/issues/961 + // TODO: support reading from view table in the future. 
+ if (tbl != null && tbl.isView()) return null; + return tbl; } private Map loadTables(TiDBInfo db) { diff --git a/tikv-client/src/main/java/com/pingcap/tikv/meta/TiTableInfo.java b/tikv-client/src/main/java/com/pingcap/tikv/meta/TiTableInfo.java index 792ae976a7..7e863b088f 100644 --- a/tikv-client/src/main/java/com/pingcap/tikv/meta/TiTableInfo.java +++ b/tikv-client/src/main/java/com/pingcap/tikv/meta/TiTableInfo.java @@ -47,8 +47,10 @@ public class TiTableInfo implements Serializable { private final long rowSize; // estimated row size private final TiPartitionInfo partitionInfo; private final TiColumnInfo primaryKeyColumn; + private final TiViewInfo viewInfo; @JsonCreator + @JsonIgnoreProperties(ignoreUnknown = true) public TiTableInfo( @JsonProperty("id") long id, @JsonProperty("name") CIStr name, @@ -62,7 +64,8 @@ public TiTableInfo( @JsonProperty("max_col_id") long maxColumnId, @JsonProperty("max_idx_id") long maxIndexId, @JsonProperty("old_schema_id") long oldSchemaId, - @JsonProperty("partition") TiPartitionInfo partitionInfo) { + @JsonProperty("partition") TiPartitionInfo partitionInfo, + @JsonProperty("view") TiViewInfo viewInfo) { this.id = id; this.name = name.getL(); this.charset = charset; @@ -79,6 +82,7 @@ public TiTableInfo( this.maxIndexId = maxIndexId; this.oldSchemaId = oldSchemaId; this.partitionInfo = partitionInfo; + this.viewInfo = viewInfo; TiColumnInfo primaryKey = null; for (TiColumnInfo col : this.columns) { @@ -90,6 +94,10 @@ public TiTableInfo( primaryKeyColumn = primaryKey; } + public boolean isView() { + return this.viewInfo != null; + } + // auto increment column must be a primary key column public boolean hasAutoIncrementColumn() { if (primaryKeyColumn != null) { @@ -239,7 +247,8 @@ public TiTableInfo copyTableWithRowId() { getMaxColumnId(), getMaxIndexId(), getOldSchemaId(), - partitionInfo); + partitionInfo, + null); } else { return this; } diff --git a/tikv-client/src/main/java/com/pingcap/tikv/meta/TiUserIdentity.java b/tikv-client/src/main/java/com/pingcap/tikv/meta/TiUserIdentity.java new file mode 100644 index 0000000000..73b65c149c --- /dev/null +++ b/tikv-client/src/main/java/com/pingcap/tikv/meta/TiUserIdentity.java @@ -0,0 +1,28 @@ +package com.pingcap.tikv.meta; + +import com.fasterxml.jackson.annotation.JsonCreator; +import com.fasterxml.jackson.annotation.JsonProperty; +import java.io.Serializable; + +// TiUserIdentity represents username and hostname. 
+public class TiUserIdentity implements Serializable { + private String username; + private String hostname; + private boolean currentUser; + private String authUsername; + private String authHostname; + + @JsonCreator + public TiUserIdentity( + @JsonProperty("Username") String userName, + @JsonProperty("Hostname") String hostName, + @JsonProperty("CurrentUser") boolean currentUser, + @JsonProperty("AuthUsername") String authUserName, + @JsonProperty("AuthHostname") String authHostName) { + this.authHostname = authHostName; + this.authUsername = authUserName; + this.hostname = hostName; + this.username = userName; + this.currentUser = currentUser; + } +} diff --git a/tikv-client/src/main/java/com/pingcap/tikv/meta/TiViewInfo.java b/tikv-client/src/main/java/com/pingcap/tikv/meta/TiViewInfo.java new file mode 100644 index 0000000000..d0aaad4079 --- /dev/null +++ b/tikv-client/src/main/java/com/pingcap/tikv/meta/TiViewInfo.java @@ -0,0 +1,38 @@ +package com.pingcap.tikv.meta; + +import com.fasterxml.jackson.annotation.JsonCreator; +import com.fasterxml.jackson.annotation.JsonProperty; +import java.io.Serializable; +import java.util.List; +import java.util.stream.Collectors; + +public class TiViewInfo implements Serializable { + // ViewAlgorithm is VIEW's SQL AlGORITHM characteristic. + // See https://dev.mysql.com/doc/refman/5.7/en/view-algorithms.html + private final long viewAlgorithm; + private final TiUserIdentity userIdentity; + // ViewSecurity is VIEW's SQL SECURITY characteristic. + // See https://dev.mysql.com/doc/refman/5.7/en/create-view.html + private final long viewSecurity; + private final String viewSelect; + // ViewCheckOption is VIEW's WITH CHECK OPTION clause part. + // See https://dev.mysql.com/doc/refman/5.7/en/view-check-option.html + private final long viewCheckOpt; + private final List viewCols; + + @JsonCreator + public TiViewInfo( + @JsonProperty("view_algorithm") long viewAlgorithm, + @JsonProperty("view_definer") TiUserIdentity userIdentity, + @JsonProperty("view_security") long viewSecurity, + @JsonProperty("view_select") String viewSelect, + @JsonProperty("view_checkoption") long viewCheckOpt, + @JsonProperty("view_cols") List viewCols) { + this.viewAlgorithm = viewAlgorithm; + this.userIdentity = userIdentity; + this.viewSecurity = viewSecurity; + this.viewSelect = viewSelect; + this.viewCheckOpt = viewCheckOpt; + this.viewCols = viewCols.stream().map(CIStr::getO).collect(Collectors.toList()); + } +} diff --git a/tikv-client/src/main/java/com/pingcap/tikv/types/DecimalType.java b/tikv-client/src/main/java/com/pingcap/tikv/types/DecimalType.java index 1734ad6214..e1ce8b94d2 100644 --- a/tikv-client/src/main/java/com/pingcap/tikv/types/DecimalType.java +++ b/tikv-client/src/main/java/com/pingcap/tikv/types/DecimalType.java @@ -32,7 +32,8 @@ public class DecimalType extends DataType { public static final DecimalType DECIMAL = new DecimalType(MySQLType.TypeNewDecimal); - public static final MySQLType[] subTypes = new MySQLType[] {MySQLType.TypeNewDecimal}; + public static final MySQLType[] subTypes = + new MySQLType[] {MySQLType.TypeNewDecimal, MySQLType.TypeDecimal}; private DecimalType(MySQLType tp) { super(tp); diff --git a/tikv-client/src/test/java/com/pingcap/tikv/meta/MetaUtils.java b/tikv-client/src/test/java/com/pingcap/tikv/meta/MetaUtils.java index f51e97dfdf..3057b6038e 100644 --- a/tikv-client/src/test/java/com/pingcap/tikv/meta/MetaUtils.java +++ b/tikv-client/src/test/java/com/pingcap/tikv/meta/MetaUtils.java @@ -130,7 +130,20 @@ public TiTableInfo 
build() { name = "Table" + tid; } return new TiTableInfo( - tid, CIStr.newCIStr(name), "", "", pkHandle, columns, indices, "", 0, 0, 0, 0, partInfo); + tid, + CIStr.newCIStr(name), + "", + "", + pkHandle, + columns, + indices, + "", + 0, + 0, + 0, + 0, + partInfo, + null); } } From 35f8781cd064e74bf45fce6adb7836dc3711aa3c Mon Sep 17 00:00:00 2001 From: Zhexuan Yang Date: Wed, 31 Jul 2019 13:52:15 +0800 Subject: [PATCH 07/62] make tispark can read from a hash partition table (#966) --- .../apache/spark/sql/PartitionTableSuite.scala | 17 +++++++++++++++++ .../com/pingcap/tikv/meta/TiPartitionDef.java | 7 ++++++- 2 files changed, 23 insertions(+), 1 deletion(-) diff --git a/core/src/test/scala/org/apache/spark/sql/PartitionTableSuite.scala b/core/src/test/scala/org/apache/spark/sql/PartitionTableSuite.scala index 1f758adb5b..4d28e2b13b 100644 --- a/core/src/test/scala/org/apache/spark/sql/PartitionTableSuite.scala +++ b/core/src/test/scala/org/apache/spark/sql/PartitionTableSuite.scala @@ -21,6 +21,23 @@ import org.apache.spark.sql.execution.{CoprocessorRDD, RegionTaskExec} class PartitionTableSuite extends BaseTiSparkTest { def enablePartitionForTiDB(): Boolean = tidbStmt.execute("set @@tidb_enable_table_partition = 1") + test("reading from hash partition") { + enablePartitionForTiDB() + tidbStmt.execute("drop table if exists t") + tidbStmt.execute( + """create table t (id int) partition by hash(id) PARTITIONS 4 + |""".stripMargin + ) + tidbStmt.execute("insert into `t` values(5)") + tidbStmt.execute("insert into `t` values(15)") + tidbStmt.execute("insert into `t` values(25)") + tidbStmt.execute("insert into `t` values(35)") + refreshConnections() + + judge("select * from t") + judge("select * from t where id < 10") + } + test("test read from range partition and partition function (mod) is not supported by tispark") { enablePartitionForTiDB() tidbStmt.execute("DROP TABLE IF EXISTS `pt`") diff --git a/tikv-client/src/main/java/com/pingcap/tikv/meta/TiPartitionDef.java b/tikv-client/src/main/java/com/pingcap/tikv/meta/TiPartitionDef.java index ead360ddd3..88350a4267 100644 --- a/tikv-client/src/main/java/com/pingcap/tikv/meta/TiPartitionDef.java +++ b/tikv-client/src/main/java/com/pingcap/tikv/meta/TiPartitionDef.java @@ -21,6 +21,7 @@ import com.google.common.annotations.VisibleForTesting; import com.google.common.collect.ImmutableList; import java.io.Serializable; +import java.util.ArrayList; import java.util.List; @JsonIgnoreProperties(ignoreUnknown = true) @@ -39,7 +40,11 @@ public TiPartitionDef( @JsonProperty("comment") String comment) { this.id = id; this.name = name.getL(); - this.lessThan = ImmutableList.copyOf(lessThan); + if (lessThan == null || lessThan.isEmpty()) { + this.lessThan = new ArrayList<>(); + } else { + this.lessThan = ImmutableList.copyOf(lessThan); + } this.comment = comment; } From 290d07cb8816eb0062c06cb71ca8b64e7ab6eb71 Mon Sep 17 00:00:00 2001 From: Liangliang Gu Date: Wed, 31 Jul 2019 14:06:52 +0800 Subject: [PATCH 08/62] increase ci worker number (#965) --- .ci/integration_test.groovy | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) diff --git a/.ci/integration_test.groovy b/.ci/integration_test.groovy index 51f08c55f8..6eaaa42916 100644 --- a/.ci/integration_test.groovy +++ b/.ci/integration_test.groovy @@ -7,7 +7,7 @@ def call(ghprbActualCommit, ghprbCommentBody, ghprbPullId, ghprbPullTitle, ghprb def TIKV_BRANCH = "master" def PD_BRANCH = "master" def MVN_PROFILE = "" - def PARALLEL_NUMBER = 9 + def PARALLEL_NUMBER = 18 // parse tidb 
branch def m1 = ghprbCommentBody =~ /tidb\s*=\s*([^\s\\]+)(\s|\\|$)/ @@ -96,11 +96,16 @@ def call(ghprbActualCommit, ghprbCommentBody, ghprbPullId, ghprbPullTitle, ghprb sed -i 's/\\.scala//g' test shuf test -o test2 mv test2 test - split test -n r/$PARALLEL_NUMBER test_unit_ -a 1 --numeric-suffixes=1 + split test -n r/$PARALLEL_NUMBER test_unit_ -a 2 --numeric-suffixes=1 """ for (int i = 1; i <= PARALLEL_NUMBER; i++) { - sh """cat test_unit_$i""" + if(i < 10) { + sh """cat test_unit_0$i""" + } else { + sh """cat test_unit_$i""" + } + } sh """ @@ -120,7 +125,12 @@ def call(ghprbActualCommit, ghprbCommentBody, ghprbPullId, ghprbPullTitle, ghprb def run_tispark_test = { chunk_suffix -> dir("go/src/github.com/pingcap/tispark") { - run_chunks = readfile("test_unit_${chunk_suffix}") + if(chunk_suffix < 10) { + run_chunks = readfile("test_unit_0${chunk_suffix}") + } else { + run_chunks = readfile("test_unit_${chunk_suffix}") + } + print run_chunks def mvnStr = get_mvn_str(run_chunks) sh """ From f9e4be963902336114a49a1b1ab3db5f9860e2c5 Mon Sep 17 00:00:00 2001 From: Liangliang Gu Date: Wed, 31 Jul 2019 15:50:58 +0800 Subject: [PATCH 09/62] update readme for tispark-2.1.2 release (#968) --- README.md | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index 4a2b6805cc..46164c9b6f 100755 --- a/README.md +++ b/README.md @@ -17,9 +17,9 @@ The latest stable version compatible with **Spark 2.1.0+** is **TiSpark 1.2.1** **When using TiSpark 1.2.1, please follow the [document for Spark 2.1](./docs/userguide_spark2.1.md)** -**When using TiSpark 2.1.1 with Spark 2.3.0+, please use version `2.1.1-spark_2.3` and follow the [document for Spark 2.3+](./docs/userguide.md)** +**When using TiSpark 2.1.2 with Spark 2.3.0+, please use version `2.1.2-spark_2.3` and follow the [document for Spark 2.3+](./docs/userguide.md)** -**When using TiSpark 2.1.1 with Spark 2.4.0+, please use version `2.1.1-spark_2.4` and follow the [document for Spark 2.3+](./docs/userguide.md)** +**When using TiSpark 2.1.2 with Spark 2.4.0+, please use version `2.1.2-spark_2.4` and follow the [document for Spark 2.3+](./docs/userguide.md)** You may also [build from sources](#how-to-build-from-sources) to try the new features on TiSpark master branch. 
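
As an illustrative sketch only (assuming the `spark.sql.extensions` and `spark.tispark.pd.addresses` settings described in the user guide are already present in `spark-defaults.conf`, that the matching `2.1.2-spark_2.x` artifact or assembly jar is on the Spark classpath, and that a `tpch_test.customer` table exists — all placeholder assumptions), a minimal TiSpark job needs nothing beyond ordinary Spark SQL:

```scala
// Illustrative sketch, not part of any patch above: assumes TiSpark 2.1.2 is on the
// classpath and spark.sql.extensions / spark.tispark.pd.addresses are configured in
// spark-defaults.conf; tpch_test.customer is a placeholder table name.
import org.apache.spark.sql.SparkSession

object TiSparkSmokeTest {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().appName("TiSparkSmokeTest").getOrCreate()
    // With the TiExtensions catalog enabled, TiDB databases and tables are visible
    // to plain Spark SQL without any extra registration step.
    spark.sql("show databases").show(false)
    spark.sql("select count(*) from tpch_test.customer").show()
    spark.stop()
  }
}
```
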
@@ -29,7 +29,7 @@ If you are using maven(recommended), add the following to your pom.xml: com.pingcap.tispark tispark-core - 2.1.1-spark_${spark.version} + 2.1.2-spark_${spark.version} ``` @@ -55,8 +55,8 @@ Remember to add `-Dmaven.test.skip=true` to skip all the tests if you don't need | Spark Version | Stable TiSpark Version | | ------------- | ---------------------- | -| Spark-2.4.x | TiSpark-2.1.1 | -| Spark-2.3.x | TiSpark-2.1.1 | +| Spark-2.4.x | TiSpark-2.1.2 | +| Spark-2.3.x | TiSpark-2.1.2 | | Spark-2.2.x | TiSpark-1.2.1 | | Spark-2.1.x | TiSpark-1.2.1 | From 10592d44c3bba37f68e47668a0c8ab22d0cf84be Mon Sep 17 00:00:00 2001 From: Liangliang Gu Date: Thu, 1 Aug 2019 15:15:39 +0800 Subject: [PATCH 10/62] update document for pyspark (#975) --- docs/userguide.md | 4 ++ docs/userguide_spark2.1.md | 4 ++ python/README.md | 17 ++++++-- python/pytispark/__init__.py | 18 -------- python/pytispark/pytispark.py | 46 --------------------- python/resources/{ => spark-2.3}/session.py | 0 python/setup.cfg | 2 - python/setup.py | 27 ------------ 8 files changed, 21 insertions(+), 97 deletions(-) delete mode 100644 python/pytispark/__init__.py delete mode 100644 python/pytispark/pytispark.py rename python/resources/{ => spark-2.3}/session.py (100%) delete mode 100644 python/setup.cfg delete mode 100644 python/setup.py diff --git a/docs/userguide.md b/docs/userguide.md index 648ca4e86d..cb548cdc57 100644 --- a/docs/userguide.md +++ b/docs/userguide.md @@ -307,3 +307,7 @@ A: You can use the existing Spark cluster without a separate deployment, but if Q: Can I mix Spark with TiKV? A: If TiDB and TiKV are overloaded and run critical online tasks, consider deploying TiSpark separately. You also need to consider using different NICs to ensure that OLTP's network resources are not compromised and affect online business. If the online business requirements are not high or the loading is not large enough, you can consider mixing TiSpark with TiKV deployment. + +Q: How to use PySpark with TiSpark? + +A: Please follow [TiSpark on PySpark](../python/README.md). \ No newline at end of file diff --git a/docs/userguide_spark2.1.md b/docs/userguide_spark2.1.md index 271524dfd3..0423396313 100644 --- a/docs/userguide_spark2.1.md +++ b/docs/userguide_spark2.1.md @@ -338,3 +338,7 @@ A: You can use the existing Spark cluster without a separate deployment, but if Q: Can I mix Spark with TiKV? A: If TiDB and TiKV are overloaded and run critical online tasks, consider deploying TiSpark separately. You also need to consider using different NICs to ensure that OLTP's network resources are not compromised and affect online business. If the online business requirements are not high or the loading is not large enough, you can consider mixing TiSpark with TiKV deployment. + +Q: How to use PySpark with TiSpark? + +A: Please follow [TiSpark on PySpark](../python/README_spark2.1.md). \ No newline at end of file diff --git a/python/README.md b/python/README.md index 10b8b0f901..b211d16fe6 100644 --- a/python/README.md +++ b/python/README.md @@ -1,16 +1,25 @@ ## TiSpark (version >= 2.0) on PySpark: **Note: If you are using TiSpark version less than 2.0, please read [this document](./README_spark2.1.md) instead** -pytispark will not be necessary since TiSpark version >= 2.0. ### Usage There are currently two ways to use TiSpark on Python: + #### Directly via pyspark This is the simplest way, just a decent Spark environment should be enough. 1. 
Make sure you have the latest version of [TiSpark](https://github.com/pingcap/tispark) and a `jar` with all TiSpark's dependencies. 2. Remember to add needed configurations listed in [README](../README.md) into your `$SPARK_HOME/conf/spark-defaults.conf` -3. Copy `./resources/session.py` to `$SPARK_HOME/python/pyspark/sql/session.py` +3. For spark-2.3.x please copy `./resources/spark-2.3/session.py` to `$SPARK_HOME/python/pyspark/sql/session.py`. For other Spark version please edit the file `$SPARK_HOME/python/pyspark/sql/session.py` and change it from +```python +jsparkSession = self._jvm.SparkSession(self._jsc.sc()) +``` + +to + +```python +jsparkSession = self._jvm.SparkSession.builder().getOrCreate() +``` 4. Run this command in your `$SPARK_HOME` directory: ``` @@ -36,7 +45,7 @@ spark.sql("select count(*) from customer").show() #### Via spark-submit This way is useful when you want to execute your own Python scripts. -Because of an open issue **[SPARK-25003]** in Spark 2.3, using spark-submit for python files will only support following api +Because of an open issue **[SPARK-25003]** in Spark-2.3.x and Spark-2.4.x, using spark-submit for python files will only support following api 1. Use ```pip install pytispark``` in your console to install `pytispark` @@ -46,7 +55,7 @@ Note that you may need reinstall `pytispark` if you meet `No plan for reation` e ```python import pytispark.pytispark as pti from pyspark.sql import SparkSession -spark = SparkSession.getOrCreate() +spark = SparkSession.builder.getOrCreate() ti = pti.TiContext(spark) ti.tidbMapDatabase("tpch_test") diff --git a/python/pytispark/__init__.py b/python/pytispark/__init__.py deleted file mode 100644 index 9e05cfe020..0000000000 --- a/python/pytispark/__init__.py +++ /dev/null @@ -1,18 +0,0 @@ -# -# Copyright 2017 PingCAP, Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# See the License for the specific language governing permissions and -# limitations under the License. -# - -def main(): - """Entry point for the application script""" - print("Call your main application code here") diff --git a/python/pytispark/pytispark.py b/python/pytispark/pytispark.py deleted file mode 100644 index e978594efc..0000000000 --- a/python/pytispark/pytispark.py +++ /dev/null @@ -1,46 +0,0 @@ -# -# Copyright 2017 PingCAP, Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# See the License for the specific language governing permissions and -# limitations under the License. 
-# - -from py4j.java_gateway import java_import -from pyspark.context import SparkContext - - -# TiContext -# Used for TiSpark -class TiContext: - """ - Create a new TiContext - :param sparkSession The spark session used for creating TiContext - """ - def __init__(self, sparkSession): - SparkContext._ensure_initialized() - gw = SparkContext._gateway - java_import(gw.jvm, "org.apache.spark.sql.TiExtensions") - self.ti = gw.jvm.TiExtensions.getInstance(sparkSession._jsparkSession).getOrCreateTiContext(sparkSession._jsparkSession) - - """ - Get the TiContext java representation - """ - def getContext(self): - return self.ti - - """ - Change TiContext designated database - :param dbName Database to map(switch to) - :param isPrefix Whether to use dbName As Prefix - :param loadStatistics Whether to use statistics information from TiDB - """ - def tidbMapDatabase(self, dbName, isPrefix=False, loadStatistics=True): - self.ti.tidbMapDatabase(dbName, isPrefix, loadStatistics) \ No newline at end of file diff --git a/python/resources/session.py b/python/resources/spark-2.3/session.py similarity index 100% rename from python/resources/session.py rename to python/resources/spark-2.3/session.py diff --git a/python/setup.cfg b/python/setup.cfg deleted file mode 100644 index 224a77957f..0000000000 --- a/python/setup.cfg +++ /dev/null @@ -1,2 +0,0 @@ -[metadata] -description-file = README.md \ No newline at end of file diff --git a/python/setup.py b/python/setup.py deleted file mode 100644 index 5c1eba7b50..0000000000 --- a/python/setup.py +++ /dev/null @@ -1,27 +0,0 @@ -from setuptools import setup -setup( - name='pytispark', - packages=['pytispark'], - version='2.0', - description='TiSpark support for python', - author='PingCAP', - author_email='novemser@gmail.com', - url='https://github.com/pingcap/tispark', - keywords=['tispark', 'spark', 'tidb', 'olap'], - license='Apache 2.0', - classifiers=[ - # How mature is this project? 
Common values are - # 3 - Alpha - # 4 - Beta - # 5 - Production/Stable - 'Development Status :: 3 - Alpha', - 'Intended Audience :: Developers', - 'Programming Language :: Python :: 2', - 'Programming Language :: Python :: 2.7', - 'Programming Language :: Python :: 3', - 'Programming Language :: Python :: 3.3', - 'Programming Language :: Python :: 3.4', - 'Programming Language :: Python :: 3.5', - ], - install_requires=['pyspark==2.3.3', 'py4j==0.10.7'] -) From 22ee66e2c267eb3de740291c8510ee4643470433 Mon Sep 17 00:00:00 2001 From: Liangliang Gu Date: Thu, 1 Aug 2019 17:40:07 +0800 Subject: [PATCH 11/62] fix one jar bug (#972) --- .../tispark/utils/ReflectionUtil.scala | 76 ++++++++++++++++--- .../org/apache/spark/sql/TiContext.scala | 7 +- .../org/apache/spark/sql/TiStrategy.scala | 5 +- .../catalog/TiConcreteSessionCatalog.scala | 10 ++- .../expressions/aggregate/SpecialSum.scala | 3 +- .../spark/sql/execution/CoprocessorRDD.scala | 10 +-- .../spark/sql/execution/command/tables.scala | 19 ++--- .../apache/spark/sql/extensions/rules.scala | 5 +- .../com/pingcap/tispark/SparkWrapper.scala | 19 +++++ .../com/pingcap/tispark/SparkWrapper.scala | 19 +++++ 10 files changed, 138 insertions(+), 35 deletions(-) diff --git a/core/src/main/scala/com/pingcap/tispark/utils/ReflectionUtil.scala b/core/src/main/scala/com/pingcap/tispark/utils/ReflectionUtil.scala index 9606d3485a..1f839a372a 100644 --- a/core/src/main/scala/com/pingcap/tispark/utils/ReflectionUtil.scala +++ b/core/src/main/scala/com/pingcap/tispark/utils/ReflectionUtil.scala @@ -25,8 +25,9 @@ import org.apache.spark.sql.TiContext import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.catalog.{ExternalCatalog, TiSessionCatalog} import org.apache.spark.sql.catalyst.expressions.aggregate.AggregateExpression -import org.apache.spark.sql.catalyst.expressions.{NamedExpression, UnsafeRow} -import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan +import org.apache.spark.sql.catalyst.expressions.{Alias, AttributeReference, Expression, NamedExpression, UnsafeRow} +import org.apache.spark.sql.catalyst.plans.logical.{LogicalPlan, SubqueryAlias} +import org.apache.spark.sql.types.{DataType, Metadata} import org.slf4j.LoggerFactory import scala.reflect.ClassTag @@ -39,6 +40,13 @@ import scala.reflect.ClassTag object ReflectionUtil { private val logger = LoggerFactory.getLogger(getClass.getName) + private val SPARK_WRAPPER_CLASS = "com.pingcap.tispark.SparkWrapper" + private val TI_AGGREGATION_IMPL_CLASS = "org.apache.spark.sql.TiAggregationImpl" + private val TI_DIRECT_EXTERNAL_CATALOG_CLASS = + "org.apache.spark.sql.catalyst.catalog.TiDirectExternalCatalog" + private val TI_COMPOSITE_SESSION_CATALOG_CLASS = + "org.apache.spark.sql.catalyst.catalog.TiCompositeSessionCatalog" + // In Spark 2.3.0 and 2.3.1 the method declaration is: // private[spark] def mapPartitionsWithIndexInternal[U: ClassTag]( // f: (Int, Iterator[T]) => Iterator[U], @@ -133,18 +141,32 @@ object ReflectionUtil { } def newTiDirectExternalCatalog(tiContext: TiContext): ExternalCatalog = { - val clazz = - classLoader.loadClass("org.apache.spark.sql.catalyst.catalog.TiDirectExternalCatalog") - clazz + classLoader + .loadClass(TI_DIRECT_EXTERNAL_CATALOG_CLASS) .getDeclaredConstructor(classOf[TiContext]) .newInstance(tiContext) .asInstanceOf[ExternalCatalog] } + def callTiDirectExternalCatalogDatabaseExists(obj: Object, db: String): Boolean = { + classLoader + .loadClass(TI_DIRECT_EXTERNAL_CATALOG_CLASS) + .getDeclaredMethod("databaseExists", 
classOf[String]) + .invoke(obj, db) + .asInstanceOf[Boolean] + } + + def callTiDirectExternalCatalogTableExists(obj: Object, db: String, table: String): Boolean = { + classLoader + .loadClass(TI_DIRECT_EXTERNAL_CATALOG_CLASS) + .getDeclaredMethod("tableExists", classOf[String], classOf[String]) + .invoke(obj, db, table) + .asInstanceOf[Boolean] + } + def newTiCompositeSessionCatalog(tiContext: TiContext): TiSessionCatalog = { - val clazz = - classLoader.loadClass("org.apache.spark.sql.catalyst.catalog.TiCompositeSessionCatalog") - clazz + classLoader + .loadClass(TI_COMPOSITE_SESSION_CATALOG_CLASS) .getDeclaredConstructor(classOf[TiContext]) .newInstance(tiContext) .asInstanceOf[TiSessionCatalog] @@ -153,13 +175,45 @@ object ReflectionUtil { def callTiAggregationImplUnapply( plan: LogicalPlan ): Option[(Seq[NamedExpression], Seq[AggregateExpression], Seq[NamedExpression], LogicalPlan)] = { - val clazz = - classLoader.loadClass("org.apache.spark.sql.TiAggregationImpl") - clazz + classLoader + .loadClass(TI_AGGREGATION_IMPL_CLASS) .getDeclaredMethod("unapply", classOf[LogicalPlan]) .invoke(null, plan) .asInstanceOf[Option[ (Seq[NamedExpression], Seq[AggregateExpression], Seq[NamedExpression], LogicalPlan) ]] } + + def newSubqueryAlias(identifier: String, child: LogicalPlan): SubqueryAlias = { + classLoader + .loadClass(SPARK_WRAPPER_CLASS) + .getDeclaredMethod("newSubqueryAlias", classOf[String], classOf[LogicalPlan]) + .invoke(null, identifier, child) + .asInstanceOf[SubqueryAlias] + } + + def newAlias(child: Expression, name: String): Alias = { + classLoader + .loadClass(SPARK_WRAPPER_CLASS) + .getDeclaredMethod("newAlias", classOf[Expression], classOf[String]) + .invoke(null, child, name) + .asInstanceOf[Alias] + } + + def newAttributeReference(name: String, + dataType: DataType, + nullable: java.lang.Boolean = false, + metadata: Metadata = Metadata.empty): AttributeReference = { + classLoader + .loadClass(SPARK_WRAPPER_CLASS) + .getDeclaredMethod( + "newAttributeReference", + classOf[String], + classOf[DataType], + classOf[Boolean], + classOf[Metadata] + ) + .invoke(null, name, dataType, nullable, metadata) + .asInstanceOf[AttributeReference] + } } diff --git a/core/src/main/scala/org/apache/spark/sql/TiContext.scala b/core/src/main/scala/org/apache/spark/sql/TiContext.scala index ff85589e44..539531d56f 100644 --- a/core/src/main/scala/org/apache/spark/sql/TiContext.scala +++ b/core/src/main/scala/org/apache/spark/sql/TiContext.scala @@ -23,7 +23,8 @@ import com.pingcap.tikv.{TiConfiguration, TiSession} import com.pingcap.tispark._ import com.pingcap.tispark.listener.CacheInvalidateListener import com.pingcap.tispark.statistics.StatisticsManager -import com.pingcap.tispark.utils.{ReflectionUtil, TiUtil} +import com.pingcap.tispark.utils.ReflectionUtil._ +import com.pingcap.tispark.utils.TiUtil import org.apache.spark.SparkConf import org.apache.spark.internal.Logging import org.apache.spark.sql.catalyst.catalog._ @@ -59,11 +60,11 @@ class TiContext(val sparkSession: SparkSession, options: Option[TiDBOptions] = N tiSession.injectCallBackFunc(CacheInvalidateListener.getInstance()) lazy val tiConcreteCatalog: TiSessionCatalog = - new TiConcreteSessionCatalog(this)(ReflectionUtil.newTiDirectExternalCatalog(this)) + new TiConcreteSessionCatalog(this)(newTiDirectExternalCatalog(this)) lazy val sessionCatalog: SessionCatalog = sqlContext.sessionState.catalog - lazy val tiCatalog: TiSessionCatalog = ReflectionUtil.newTiCompositeSessionCatalog(this) + lazy val tiCatalog: TiSessionCatalog = 
newTiCompositeSessionCatalog(this) val debug: DebugTool = new DebugTool diff --git a/core/src/main/scala/org/apache/spark/sql/TiStrategy.scala b/core/src/main/scala/org/apache/spark/sql/TiStrategy.scala index e57bb7c10c..7afcb81f08 100644 --- a/core/src/main/scala/org/apache/spark/sql/TiStrategy.scala +++ b/core/src/main/scala/org/apache/spark/sql/TiStrategy.scala @@ -28,6 +28,7 @@ import com.pingcap.tikv.statistics.TableStatistics import com.pingcap.tispark.statistics.StatisticsManager import com.pingcap.tispark.utils.TiConverter._ import com.pingcap.tispark.utils.TiUtil +import com.pingcap.tispark.utils.ReflectionUtil._ import com.pingcap.tispark.{BasicExpression, TiConfigConst, TiDBRelation} import org.apache.spark.internal.Logging import org.apache.spark.sql.catalyst.expressions.aggregate.{AggregateExpression, _} @@ -389,11 +390,11 @@ case class TiStrategy(getOrCreateTiContext: SparkSession => TiContext)(sparkSess dagReq: TiDAGRequest ): Seq[SparkPlan] = { val deterministicAggAliases = aggregateExpressions.collect { - case e if e.deterministic => e.canonicalized -> Alias(e, e.toString())() + case e if e.deterministic => e.canonicalized -> newAlias(e, e.toString()) }.toMap def aliasPushedPartialResult(e: AggregateExpression): Alias = - deterministicAggAliases.getOrElse(e.canonicalized, Alias(e, e.toString())()) + deterministicAggAliases.getOrElse(e.canonicalized, newAlias(e, e.toString())) val residualAggregateExpressions = aggregateExpressions.map { aggExpr => // As `aggExpr` is being pushing down to TiKV, we need to replace the original Catalyst diff --git a/core/src/main/scala/org/apache/spark/sql/catalyst/catalog/TiConcreteSessionCatalog.scala b/core/src/main/scala/org/apache/spark/sql/catalyst/catalog/TiConcreteSessionCatalog.scala index fff7cd02d2..959fc7d103 100644 --- a/core/src/main/scala/org/apache/spark/sql/catalyst/catalog/TiConcreteSessionCatalog.scala +++ b/core/src/main/scala/org/apache/spark/sql/catalyst/catalog/TiConcreteSessionCatalog.scala @@ -15,6 +15,7 @@ package org.apache.spark.sql.catalyst.catalog +import com.pingcap.tispark.utils.ReflectionUtil._ import org.apache.spark.sql.TiContext import org.apache.spark.sql.catalyst.TableIdentifier import org.apache.spark.sql.catalyst.analysis.EmptyFunctionRegistry @@ -35,8 +36,13 @@ class TiConcreteSessionCatalog(val tiContext: TiContext)(tiExternalCatalog: Exte None } - override def databaseExists(db: String): Boolean = tiExternalCatalog.databaseExists(db) + override def databaseExists(db: String): Boolean = + callTiDirectExternalCatalogDatabaseExists(tiExternalCatalog, db) override def tableExists(name: TableIdentifier): Boolean = - tiExternalCatalog.tableExists(name.database.getOrElse(getCurrentDatabase), name.table) + callTiDirectExternalCatalogTableExists( + tiExternalCatalog, + name.database.getOrElse(getCurrentDatabase), + name.table + ) } diff --git a/core/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/SpecialSum.scala b/core/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/SpecialSum.scala index cd753b2baf..ddeb36bc3d 100644 --- a/core/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/SpecialSum.scala +++ b/core/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/SpecialSum.scala @@ -15,6 +15,7 @@ package org.apache.spark.sql.catalyst.expressions.aggregate +import com.pingcap.tispark.utils.ReflectionUtil._ import org.apache.spark.sql.catalyst.analysis.TypeCheckResult import org.apache.spark.sql.catalyst.expressions.{Add, AttributeReference, Cast, 
Coalesce, Expression, ExpressionDescription, Literal} import org.apache.spark.sql.catalyst.util.TypeUtils @@ -76,7 +77,7 @@ case class SpecialSum(child: Expression, retType: DataType, initVal: Any) private lazy val sumDataType = resultType - private lazy val sum = AttributeReference("rewriteSum", sumDataType)() + private lazy val sum = newAttributeReference("rewriteSum", sumDataType) private lazy val zero = Cast(Literal(0), sumDataType) diff --git a/core/src/main/scala/org/apache/spark/sql/execution/CoprocessorRDD.scala b/core/src/main/scala/org/apache/spark/sql/execution/CoprocessorRDD.scala index 33ecb1e5b9..19b99e0481 100644 --- a/core/src/main/scala/org/apache/spark/sql/execution/CoprocessorRDD.scala +++ b/core/src/main/scala/org/apache/spark/sql/execution/CoprocessorRDD.scala @@ -26,13 +26,13 @@ import com.pingcap.tikv.util.RangeSplitter.RegionTask import com.pingcap.tikv.util.{KeyRangeUtils, RangeSplitter} import com.pingcap.tikv.{TiConfiguration, TiSession, TiSessionCache} import com.pingcap.tispark.listener.CacheInvalidateListener -import com.pingcap.tispark.utils.ReflectionUtil.ReflectionMapPartitionWithIndexInternal +import com.pingcap.tispark.utils.ReflectionUtil._ import com.pingcap.tispark.utils.{TiConverter, TiUtil} import gnu.trove.list.array import gnu.trove.list.array.TLongArrayList import org.apache.log4j.Logger import org.apache.spark.rdd.RDD -import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeReference, GenericInternalRow, SortOrder, UnsafeProjection, UnsafeRow} +import org.apache.spark.sql.catalyst.expressions.{Attribute, GenericInternalRow, SortOrder, UnsafeProjection, UnsafeRow} import org.apache.spark.sql.catalyst.plans.physical.{Partitioning, UnknownPartitioning} import org.apache.spark.sql.catalyst.{CatalystTypeConverters, InternalRow} import org.apache.spark.sql.execution.metric.{SQLMetric, SQLMetrics} @@ -149,13 +149,13 @@ case class HandleRDDExec(tiHandleRDDs: List[TiHandleRDD]) extends LeafExecNode { } final lazy val attributeRef = Seq( - AttributeReference("RegionId", LongType, nullable = false, Metadata.empty)(), - AttributeReference( + newAttributeReference("RegionId", LongType, nullable = false, Metadata.empty), + newAttributeReference( "Handles", ArrayType(LongType, containsNull = false), nullable = false, Metadata.empty - )() + ) ) override def output: Seq[Attribute] = attributeRef diff --git a/core/src/main/scala/org/apache/spark/sql/execution/command/tables.scala b/core/src/main/scala/org/apache/spark/sql/execution/command/tables.scala index d2743e688a..756a7f8b02 100644 --- a/core/src/main/scala/org/apache/spark/sql/execution/command/tables.scala +++ b/core/src/main/scala/org/apache/spark/sql/execution/command/tables.scala @@ -14,10 +14,11 @@ */ package org.apache.spark.sql.execution.command +import com.pingcap.tispark.utils.ReflectionUtil._ import org.apache.spark.sql.catalyst.TableIdentifier import org.apache.spark.sql.catalyst.analysis.NoSuchDatabaseException import org.apache.spark.sql.catalyst.catalog._ -import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeReference} +import org.apache.spark.sql.catalyst.expressions.Attribute import org.apache.spark.sql.types.{MetadataBuilder, StringType, StructType} import org.apache.spark.sql.{AnalysisException, Row, SparkSession, TiContext} @@ -62,30 +63,30 @@ case class TiDescribeTablesCommand(tiContext: TiContext, delegate: DescribeTable extends TiCommand(delegate) { override val output: Seq[Attribute] = Seq( // Column names are based on Hive. 
- AttributeReference( + newAttributeReference( "col_name", StringType, nullable = false, new MetadataBuilder().putString("comment", "name of the column").build() - )(), - AttributeReference( + ), + newAttributeReference( "data_type", StringType, nullable = false, new MetadataBuilder().putString("comment", "data type of the column").build() - )(), - AttributeReference( + ), + newAttributeReference( "nullable", StringType, nullable = false, new MetadataBuilder().putString("comment", "whether the column is nullable").build() - )(), - AttributeReference( + ), + newAttributeReference( "comment", StringType, nullable = true, new MetadataBuilder().putString("comment", "comment of the column").build() - )() + ) ) override def run(sparkSession: SparkSession): Seq[Row] = diff --git a/core/src/main/scala/org/apache/spark/sql/extensions/rules.scala b/core/src/main/scala/org/apache/spark/sql/extensions/rules.scala index d9eeb87038..3e5cfd6c95 100644 --- a/core/src/main/scala/org/apache/spark/sql/extensions/rules.scala +++ b/core/src/main/scala/org/apache/spark/sql/extensions/rules.scala @@ -15,11 +15,12 @@ package org.apache.spark.sql.extensions import com.pingcap.tispark.statistics.StatisticsManager +import com.pingcap.tispark.utils.ReflectionUtil._ import com.pingcap.tispark.{MetaManager, TiDBRelation, TiTableReference} import org.apache.spark.sql.catalyst.TableIdentifier import org.apache.spark.sql.catalyst.analysis.{EliminateSubqueryAliases, UnresolvedRelation} import org.apache.spark.sql.catalyst.catalog.TiSessionCatalog -import org.apache.spark.sql.catalyst.plans.logical.{InsertIntoTable, LogicalPlan, SubqueryAlias} +import org.apache.spark.sql.catalyst.plans.logical.{InsertIntoTable, LogicalPlan} import org.apache.spark.sql.catalyst.rules.Rule import org.apache.spark.sql.execution.command._ import org.apache.spark.sql.execution.datasources.LogicalRelation @@ -57,7 +58,7 @@ case class TiResolutionRule(getOrCreateTiContext: SparkSession => TiContext)( )(sqlContext) // Use SubqueryAlias so that projects and joins can correctly resolve // UnresolvedAttributes in JoinConditions, Projects, Filters, etc. 
- SubqueryAlias(tableName, LogicalRelation(tiDBRelation)) + newSubqueryAlias(tableName, LogicalRelation(tiDBRelation)) } protected def resolveTiDBRelations: PartialFunction[LogicalPlan, LogicalPlan] = { diff --git a/spark-wrapper/spark-2.3/src/main/scala/com/pingcap/tispark/SparkWrapper.scala b/spark-wrapper/spark-2.3/src/main/scala/com/pingcap/tispark/SparkWrapper.scala index 43eb696ce1..c64c825d02 100644 --- a/spark-wrapper/spark-2.3/src/main/scala/com/pingcap/tispark/SparkWrapper.scala +++ b/spark-wrapper/spark-2.3/src/main/scala/com/pingcap/tispark/SparkWrapper.scala @@ -14,8 +14,27 @@ */ package com.pingcap.tispark +import org.apache.spark.sql.catalyst.expressions.{Alias, AttributeReference, Expression} +import org.apache.spark.sql.catalyst.plans.logical.{LogicalPlan, SubqueryAlias} +import org.apache.spark.sql.types.{DataType, Metadata} + object SparkWrapper { def getVersion: String = { "SparkWrapper-2.3" } + + def newSubqueryAlias(identifier: String, child: LogicalPlan): SubqueryAlias = { + SubqueryAlias(identifier, child) + } + + def newAlias(child: Expression, name: String): Alias = { + Alias(child, name)() + } + + def newAttributeReference(name: String, + dataType: DataType, + nullable: Boolean, + metadata: Metadata): AttributeReference = { + AttributeReference(name, dataType, nullable, metadata)() + } } diff --git a/spark-wrapper/spark-2.4/src/main/scala/com/pingcap/tispark/SparkWrapper.scala b/spark-wrapper/spark-2.4/src/main/scala/com/pingcap/tispark/SparkWrapper.scala index e89c9b6fa1..044213090a 100644 --- a/spark-wrapper/spark-2.4/src/main/scala/com/pingcap/tispark/SparkWrapper.scala +++ b/spark-wrapper/spark-2.4/src/main/scala/com/pingcap/tispark/SparkWrapper.scala @@ -14,8 +14,27 @@ */ package com.pingcap.tispark +import org.apache.spark.sql.catalyst.expressions.{Alias, AttributeReference, Expression} +import org.apache.spark.sql.catalyst.plans.logical.{LogicalPlan, SubqueryAlias} +import org.apache.spark.sql.types.{DataType, Metadata} + object SparkWrapper { def getVersion: String = { "SparkWrapper-2.4" } + + def newSubqueryAlias(identifier: String, child: LogicalPlan): SubqueryAlias = { + SubqueryAlias(identifier, child) + } + + def newAlias(child: Expression, name: String): Alias = { + Alias(child, name)() + } + + def newAttributeReference(name: String, + dataType: DataType, + nullable: Boolean, + metadata: Metadata): AttributeReference = { + AttributeReference(name, dataType, nullable, metadata)() + } } From 86348d194b00646f1756fd1408eca959f34fa263 Mon Sep 17 00:00:00 2001 From: Zhexuan Yang Date: Thu, 1 Aug 2019 18:44:38 +0800 Subject: [PATCH 12/62] adding common port number used by spark cluster (#973) --- docs/userguide.md | 13 +++++++++++++ docs/userguide_spark2.1.md | 13 +++++++++++++ 2 files changed, 26 insertions(+) diff --git a/docs/userguide.md b/docs/userguide.md index cb548cdc57..405d177b3b 100644 --- a/docs/userguide.md +++ b/docs/userguide.md @@ -298,6 +298,19 @@ Currently you could adjust these configs in your spark.conf file. Currently, only range partition table is limited supported. If partition expression having function expression rather than `year` then partition pruning will not be applied. Such scan can be considered full table scan if there is no index in the schema. 
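For example, pruning can only help when the TiDB table is range-partitioned on a `year(...)` expression and the query's filter can be matched against that expression. The sketch below is illustrative only: the `sales` table and `purchase_ts` column are hypothetical names, and the table is assumed to be created in TiDB with `PARTITION BY RANGE (year(purchase_ts))`.

```scala
// Hypothetical example: assumes a TiDB table created roughly as
//   CREATE TABLE sales (id bigint, purchase_ts datetime, ...)
//   PARTITION BY RANGE (year(purchase_ts)) (...);
// and a spark-shell session with TiSpark already on the classpath.

// The filter references the year-based partition expression, so TiSpark can
// restrict the scan to the matching partitions.
spark.sql("select count(*) from sales where year(purchase_ts) >= 2019").show()

// With any other partition expression (or a filter that cannot be matched
// against it), the read degrades to scanning every partition.
```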
+## Common Port numbers used by Spark Cluster
+|Port Name| Default Port Number | Configuration Property | Notes|
+|---------------| ------------- |-----|-----|
+|Master web UI | 8080 | spark.master.ui.port or SPARK_MASTER_WEBUI_PORT| The value set by the spark.master.ui.port property takes precedence. |
+|Worker web UI | 8081| spark.worker.ui.port or SPARK_WORKER_WEBUI_PORT | The value set by the spark.worker.ui.port takes precedence.|
+|History server web UI |18080 | spark.history.ui.port |Optional; only applies if you use the history server. |
+|Master port | 7077 | SPARK_MASTER_PORT | |
+|Master REST port | 6066 | spark.master.rest.port | Not needed if you disable the REST service. |
+|Worker port | (random) | SPARK_WORKER_PORT | |
+|Block manager port |(random) | spark.blockManager.port | |
+|Shuffle server |7337 | spark.shuffle.service.port | Optional; only applies if you use the external shuffle service. |
+|Application web UI |4040| spark.ui.port | If 4040 is in use, the next available port (4041, 4042, ...) is used. |
+
 ## FAQ

 Q: What are the pros/cons of independent deployment as opposed to a shared resource with an existing Spark / Hadoop cluster?
diff --git a/docs/userguide_spark2.1.md b/docs/userguide_spark2.1.md
index 0423396313..fbfb882572 100644
--- a/docs/userguide_spark2.1.md
+++ b/docs/userguide_spark2.1.md
@@ -328,6 +328,19 @@ Currently you could adjust these configs in your spark.conf file.
 | Property Name | Default | Description
 | -------- | -----: | :----: |
 | spark.tispark.statistics.auto_load | true | Whether to load statistics info automatically during database mapping. |
+
+## Common Port numbers used by Spark Cluster
+|Port Name| Default Port Number | Configuration Property | Notes|
+|---------------| ------------- |-----|-----|
+|Master web UI | 8080 | spark.master.ui.port or SPARK_MASTER_WEBUI_PORT| The value set by the spark.master.ui.port property takes precedence. |
+|Worker web UI | 8081| spark.worker.ui.port or SPARK_WORKER_WEBUI_PORT | The value set by the spark.worker.ui.port takes precedence.|
+|History server web UI |18080 | spark.history.ui.port |Optional; only applies if you use the history server. |
+|Master port | 7077 | SPARK_MASTER_PORT | |
+|Master REST port | 6066 | spark.master.rest.port | Not needed if you disable the REST service. |
+|Worker port | (random) | SPARK_WORKER_PORT | |
+|Block manager port |(random) | spark.blockManager.port | |
+|Shuffle server |7337 | spark.shuffle.service.port | Optional; only applies if you use the external shuffle service. |
+|Application web UI |4040| spark.ui.port | If 4040 is in use, the next available port (4041, 4042, ...) is used. |
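Most of the application-level ports above can also be overridden per job when the SparkSession is created; the property names are standard Spark configuration keys, and the port values below are arbitrary examples:

```scala
// Minimal sketch: overriding two application-level ports from the table above.
// Daemon-level ports (SPARK_MASTER_PORT, SPARK_WORKER_PORT, ...) are environment
// variables and belong in conf/spark-env.sh rather than in application code.
import org.apache.spark.sql.SparkSession

val spark = SparkSession
  .builder()
  .appName("tispark-port-example")            // hypothetical application name
  .config("spark.ui.port", "4050")            // application web UI (default 4040)
  .config("spark.blockManager.port", "45001") // block manager (random by default)
  .getOrCreate()
```

Any port that is listed with a `spark.*` property can equally be set once for all jobs in `spark-defaults.conf`.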
 ## FAQ

From 12d53b3a8207b13d4d9d4ab7e1e17b7dcd347582 Mon Sep 17 00:00:00 2001
From: Zhexuan Yang
Date: Thu, 1 Aug 2019 22:24:58 +0800
Subject: [PATCH 13/62] fix cost model in table scan (#977)

---
 .../java/com/pingcap/tikv/predicates/TiKVScanAnalyzer.java | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/tikv-client/src/main/java/com/pingcap/tikv/predicates/TiKVScanAnalyzer.java b/tikv-client/src/main/java/com/pingcap/tikv/predicates/TiKVScanAnalyzer.java
index bd86b00f47..820b79ab30 100644
--- a/tikv-client/src/main/java/com/pingcap/tikv/predicates/TiKVScanAnalyzer.java
+++ b/tikv-client/src/main/java/com/pingcap/tikv/predicates/TiKVScanAnalyzer.java
@@ -106,7 +106,6 @@ public TiKVScanPlan build() {
     // TODO: Fine-grained statistics usage
     Builder calculateCostAndEstimateCount(long tableColSize) {
-      cost = 100.0;
       cost *= tableColSize * TABLE_SCAN_COST_FACTOR;
       return this;
     }
@@ -266,6 +265,8 @@ TiKVScanPlan buildIndexScan(
     TiKVScanPlan.Builder planBuilder = TiKVScanPlan.Builder.newBuilder();
     ScanSpec result = extractConditions(conditions, table, index);
+    // this is calculated for downgrade if there is no statistics info we can
+    // retrieve from TiKV.
     double cost = SelectivityCalculator.calcPseudoSelectivity(result);
     planBuilder.setCost(cost);

From 9b7bb49505d8624b27782de3c4ef17c825c5e260 Mon Sep 17 00:00:00 2001
From: Zhexuan Yang
Date: Fri, 2 Aug 2019 13:28:42 +0800
Subject: [PATCH 14/62] create an UninitializedType for TypeDecimal (#979)

---
 docs/userguide.md                             |  2 +-
 docs/userguide_spark2.1.md                    |  2 +-
 .../pingcap/tikv/types/DataTypeFactory.java   |  1 +
 .../com/pingcap/tikv/types/DecimalType.java   |  3 +-
 .../pingcap/tikv/types/UninitializedType.java | 85 +++++++++++++++++++
 5 files changed, 89 insertions(+), 4 deletions(-)
 create mode 100644 tikv-client/src/main/java/com/pingcap/tikv/types/UninitializedType.java

diff --git a/docs/userguide.md b/docs/userguide.md
index 405d177b3b..6e359b4735 100644
--- a/docs/userguide.md
+++ b/docs/userguide.md
@@ -323,4 +323,4 @@ A: If TiDB and TiKV are overloaded and run critical online tasks, consider deplo

 Q: How to use PySpark with TiSpark?

-A: Please follow [TiSpark on PySpark](../python/README.md).
\ No newline at end of file
+A: Please follow [TiSpark on PySpark](../python/README.md).
diff --git a/docs/userguide_spark2.1.md b/docs/userguide_spark2.1.md
index fbfb882572..8c9f96f3e2 100644
--- a/docs/userguide_spark2.1.md
+++ b/docs/userguide_spark2.1.md
@@ -354,4 +354,4 @@ A: If TiDB and TiKV are overloaded and run critical online tasks, consider deplo

 Q: How to use PySpark with TiSpark?

-A: Please follow [TiSpark on PySpark](../python/README_spark2.1.md).
\ No newline at end of file
+A: Please follow [TiSpark on PySpark](../python/README_spark2.1.md).
diff --git a/tikv-client/src/main/java/com/pingcap/tikv/types/DataTypeFactory.java b/tikv-client/src/main/java/com/pingcap/tikv/types/DataTypeFactory.java index 9b331fc396..5c674c08a1 100644 --- a/tikv-client/src/main/java/com/pingcap/tikv/types/DataTypeFactory.java +++ b/tikv-client/src/main/java/com/pingcap/tikv/types/DataTypeFactory.java @@ -45,6 +45,7 @@ public class DataTypeFactory { extractTypeMap(SetType.subTypes, SetType.class, builder, instBuilder); extractTypeMap(JsonType.subTypes, JsonType.class, builder, instBuilder); extractTypeMap(TimeType.subTypes, TimeType.class, builder, instBuilder); + extractTypeMap(UninitializedType.subTypes, UninitializedType.class, builder, instBuilder); dataTypeCreatorMap = builder.build(); dataTypeInstanceMap = instBuilder.build(); } diff --git a/tikv-client/src/main/java/com/pingcap/tikv/types/DecimalType.java b/tikv-client/src/main/java/com/pingcap/tikv/types/DecimalType.java index e1ce8b94d2..1734ad6214 100644 --- a/tikv-client/src/main/java/com/pingcap/tikv/types/DecimalType.java +++ b/tikv-client/src/main/java/com/pingcap/tikv/types/DecimalType.java @@ -32,8 +32,7 @@ public class DecimalType extends DataType { public static final DecimalType DECIMAL = new DecimalType(MySQLType.TypeNewDecimal); - public static final MySQLType[] subTypes = - new MySQLType[] {MySQLType.TypeNewDecimal, MySQLType.TypeDecimal}; + public static final MySQLType[] subTypes = new MySQLType[] {MySQLType.TypeNewDecimal}; private DecimalType(MySQLType tp) { super(tp); diff --git a/tikv-client/src/main/java/com/pingcap/tikv/types/UninitializedType.java b/tikv-client/src/main/java/com/pingcap/tikv/types/UninitializedType.java new file mode 100644 index 0000000000..4ae5eaee0f --- /dev/null +++ b/tikv-client/src/main/java/com/pingcap/tikv/types/UninitializedType.java @@ -0,0 +1,85 @@ +/* + * + * Copyright 2019 PingCAP, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * See the License for the specific language governing permissions and + * limitations under the License. + * + */ + +package com.pingcap.tikv.types; + +import com.pingcap.tidb.tipb.ExprType; +import com.pingcap.tikv.codec.CodecDataInput; +import com.pingcap.tikv.codec.CodecDataOutput; +import com.pingcap.tikv.exception.ConvertNotSupportException; +import com.pingcap.tikv.exception.ConvertOverflowException; +import com.pingcap.tikv.meta.TiColumnInfo; + +/** + * UninitializedType is created to deal with MySQLType being 0. In TiDB, when type is 0, it + * indicates the type is not initialized and will not be applied during calculation process. 
+ */ +public class UninitializedType extends DataType { + public static final UninitializedType DECIMAL = new UninitializedType(MySQLType.TypeDecimal); + public static final MySQLType[] subTypes = new MySQLType[] {MySQLType.TypeDecimal}; + + private UninitializedType(MySQLType tp) { + super(tp); + } + + UninitializedType(TiColumnInfo.InternalTypeHolder holder) { + super(holder); + } + + @Override + protected Object decodeNotNull(int flag, CodecDataInput cdi) { + throw new UnsupportedOperationException( + "UninitializedType cannot be applied in calculation process."); + } + + @Override + protected Object doConvertToTiDBType(Object value) + throws ConvertNotSupportException, ConvertOverflowException { + throw new UnsupportedOperationException( + "UninitializedType cannot be applied in calculation process."); + } + + @Override + protected void encodeKey(CodecDataOutput cdo, Object value) { + throw new UnsupportedOperationException( + "UninitializedType cannot be applied in calculation process."); + } + + @Override + protected void encodeValue(CodecDataOutput cdo, Object value) { + throw new UnsupportedOperationException( + "UninitializedType cannot be applied in calculation process."); + } + + @Override + protected void encodeProto(CodecDataOutput cdo, Object value) { + throw new UnsupportedOperationException( + "UninitializedType cannot be applied in calculation process."); + } + + @Override + public ExprType getProtoExprType() { + throw new UnsupportedOperationException( + "UninitializedType cannot be applied in calculation process."); + } + + @Override + public Object getOriginDefaultValueNonNull(String value, long version) { + throw new UnsupportedOperationException( + "UninitializedType cannot be applied in calculation process."); + } +} From b5e6e53cd5ba309d9a6ce90d12787c41b038afc4 Mon Sep 17 00:00:00 2001 From: Liangliang Gu Date: Fri, 2 Aug 2019 13:35:40 +0800 Subject: [PATCH 15/62] update sparkr doc (#976) --- R/.gitignore | 10 ----- R/DESCRIPTION | 11 ------ R/NAMESPACE | 1 - R/R/tisparkR.R | 41 -------------------- R/README.md | 76 ++++++++++++++++++++------------------ README.md | 2 +- docs/userguide.md | 8 +++- docs/userguide_spark2.1.md | 8 +++- python/README.md | 4 +- python/README_spark2.1.md | 6 +-- 10 files changed, 59 insertions(+), 108 deletions(-) delete mode 100644 R/.gitignore delete mode 100644 R/DESCRIPTION delete mode 100644 R/NAMESPACE delete mode 100644 R/R/tisparkR.R diff --git a/R/.gitignore b/R/.gitignore deleted file mode 100644 index 4470bb4eee..0000000000 --- a/R/.gitignore +++ /dev/null @@ -1,10 +0,0 @@ -*.o -*.so -*.Rd -lib -pkg/man -pkg/html -TiSparkR.Rproj -.Rproj.user -.Rhistory -.Rbuildignore diff --git a/R/DESCRIPTION b/R/DESCRIPTION deleted file mode 100644 index fbfbe62d5b..0000000000 --- a/R/DESCRIPTION +++ /dev/null @@ -1,11 +0,0 @@ -Package: TiSparkR -Type: Package -Title: TiSpark for R -Version: 1.1 -Author: PingCAP -Maintainer: Novemser -Description: A shabby thin layer to support TiSpark in R language. -License: Apache 2.0 -Copyright: 2017 PingCAP, Inc. -Encoding: UTF-8 -LazyData: true diff --git a/R/NAMESPACE b/R/NAMESPACE deleted file mode 100644 index d75f824ec6..0000000000 --- a/R/NAMESPACE +++ /dev/null @@ -1 +0,0 @@ -exportPattern("^[[:alpha:]]+") diff --git a/R/R/tisparkR.R b/R/R/tisparkR.R deleted file mode 100644 index ac7a272f51..0000000000 --- a/R/R/tisparkR.R +++ /dev/null @@ -1,41 +0,0 @@ -# -# Copyright 2017 PingCAP, Inc. 
-# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# See the License for the specific language governing permissions and -# limitations under the License. -# -# - -# Title : TiSparkR -# Objective : TiSpark entry for R -# Created by: novemser -# Created on: 17-11-1 - -# Function:createTiContext -# Create a new TiContext via the spark session passed in -# -# @return A new TiContext created on session -# @param session A Spark Session for TiContext creation -createTiContext <- function(session) { - sparkR.newJObject("org.apache.spark.sql.TiContext", session) -} - -# Function:tidbMapDatabase -# Mapping TiContext designated database to `dbName`. -# -# @param tiContext TiSpark context -# @param dbName Database name to map -# @param isPrefix Whether to use dbName As Prefix -# @param loadStatistics Whether to use statistics information from TiDB -tidbMapDatabase <- function(tiContext, dbName, isPrefix=FALSE, loadStatistics=TRUE) { - sparkR.callJMethod(tiContext, "tidbMapDatabase", dbName, isPrefix, loadStatistics) - paste("Mapping to database:", dbName) -} diff --git a/R/README.md b/R/README.md index bfffdee152..47f97a5ca2 100644 --- a/R/README.md +++ b/R/README.md @@ -1,42 +1,48 @@ ## TiSparkR -A thin layer build for supporting R language with TiSpark ### Usage -1. Download TiSparkR source code and build a binary package(run `R CMD build R` in TiSpark root directory). Install it to your local R library(e.g. via `R CMD INSTALL TiSparkR_1.0.0.tar.gz`) -2. Build or download TiSpark dependency jar `tispark-core-1.0-RC1-jar-with-dependencies.jar` [here](https://github.com/pingcap/tispark). -3. `cd` to your Spark home directory, and run +There are currently two ways to use TiSpark on SparkR: + +#### Directly via sparkR +This is the simplest way, just a decent Spark environment should be enough. +1. Make sure you have the latest version of [TiSpark](https://github.com/pingcap/tispark) and a `jar` with all TiSpark's dependencies. + +2. Remember to add needed configurations listed in [README](../README.md) into your `$SPARK_HOME/conf/spark-defaults.conf` + +3. Run this command in your `$SPARK_HOME` directory: ``` -./bin/sparkR --jars /where-ever-it-is/tispark-core-${version}-jar-with-dependencies.jar +./bin/sparkR --jars /where-ever-it-is/tispark-${name_with_version}.jar ``` -Note that you should replace the `TiSpark` jar path with your own. - -4. Use as below in your R console: + +4. 
To use TiSpark, run these commands: ```R -# import tisparkR library -> library(TiSparkR) -# create a TiContext instance -> ti <- createTiContext(spark) -# Map TiContext to database:tpch_test -> tidbMapDatabase(ti, "tpch_test") - -# Run a sql query -> customers <- sql("select * from customer") -# Print schema -> printSchema(customers) -root - |-- c_custkey: long (nullable = true) - |-- c_name: string (nullable = true) - |-- c_address: string (nullable = true) - |-- c_nationkey: long (nullable = true) - |-- c_phone: string (nullable = true) - |-- c_acctbal: decimal(15,2) (nullable = true) - |-- c_mktsegment: string (nullable = true) - |-- c_comment: string (nullable = true) - -# Run a count query -> count <- sql("select count(*) from customer") -# Print count result -> head(count) - count(1) -1 150 +sql("use tpch_test") +count <- sql("select count(*) from customer") +head(count) +``` + +#### Via spark-submit +This way is useful when you want to execute your own R scripts. + +1. Create a R file named `test.R` as below: +```R +library(SparkR) +sparkR.session() +sql("use tpch_test") +count <- sql("select count(*) from customer") +head(count) +``` + +2. Prepare your TiSpark environment as above and execute +```bash +./bin/spark-submit --jars /where-ever-it-is/tispark-${name_with_version}.jar test.R +``` + +3. Result: +``` ++--------+ +|count(1)| ++--------+ +| 150| ++--------+ ``` \ No newline at end of file diff --git a/README.md b/README.md index 46164c9b6f..f4e7b485ef 100755 --- a/README.md +++ b/README.md @@ -110,7 +110,7 @@ https://github.com/pingcap/tispark/tree/master/tikv-client From Spark-shell: ``` -./bin/spark-shell --jars /wherever-it-is/tispark-core-${version}-jar-with-dependencies.jar +./bin/spark-shell --jars /wherever-it-is/tispark-${name_with_version}.jar ``` For TiSpark version >= 2.0: ``` diff --git a/docs/userguide.md b/docs/userguide.md index 6e359b4735..4ce2f7f241 100644 --- a/docs/userguide.md +++ b/docs/userguide.md @@ -112,13 +112,13 @@ Download the TiSpark's jar package [here](http://download.pingcap.org/tispark-la Running TiSpark on an existing Spark cluster does not require a reboot of the cluster. You can use Spark's `--jars` parameter to introduce TiSpark as a dependency: ``` -spark-shell --jars $your_path_to/tispark-core-${version}-jar-with-dependencies.jar +spark-shell --jars $your_path_to/tispark-${name_with_version}.jar ``` If you want to deploy TiSpark as a default component, simply place the TiSpark jar package into the jars path for each node of the Spark cluster and restart the Spark cluster: ``` -cp $your_path_to/tispark-core-${version}-jar-with-dependencies.jar $SPARK_HOME/jars +cp $your_path_to/tispark-${name_with_version}.jar $SPARK_HOME/jars ``` In this way, you can use either `Spark-Submit` or `Spark-Shell` to use TiSpark directly. @@ -324,3 +324,7 @@ A: If TiDB and TiKV are overloaded and run critical online tasks, consider deplo Q: How to use PySpark with TiSpark? A: Please follow [TiSpark on PySpark](../python/README.md). + +Q: How to use SparkR with TiSpark? + +A: Please follow [TiSpark on SparkR](../R/README.md). diff --git a/docs/userguide_spark2.1.md b/docs/userguide_spark2.1.md index 8c9f96f3e2..79d6ec3241 100644 --- a/docs/userguide_spark2.1.md +++ b/docs/userguide_spark2.1.md @@ -96,14 +96,14 @@ For the hybrid deployment of TiSpark and TiKV, add the TiSpark required resourc ## Deploy TiSpark -Download the TiSpark's jar package [here](http://download.pingcap.org/tispark-core-${version}-jar-with-dependencies.jar). 
+Download the TiSpark's jar package [here](https://github.com/pingcap/tispark/releases). ### Deploy TiSpark on the existing Spark cluster Running TiSpark on an existing Spark cluster does not require a reboot of the cluster. You can use Spark's `--jars` parameter to introduce TiSpark as a dependency: ``` -Spark-shell --jars $ PATH / tispark-core-${version}-jar-with-dependencies.jar +Spark-shell --jars $ PATH / tispark-${name_with_version}.jar ``` If you want to deploy TiSpark as a default component, simply place the TiSpark jar package into the jars path for each node of the Spark cluster and restart the Spark cluster: @@ -355,3 +355,7 @@ A: If TiDB and TiKV are overloaded and run critical online tasks, consider deplo Q: How to use PySpark with TiSpark? A: Please follow [TiSpark on PySpark](../python/README_spark2.1.md). + +Q: How to use SparkR with TiSpark? + +A: Please follow [TiSpark on SparkR](../R/README.md). diff --git a/python/README.md b/python/README.md index b211d16fe6..bf5f0c3864 100644 --- a/python/README.md +++ b/python/README.md @@ -23,7 +23,7 @@ jsparkSession = self._jvm.SparkSession.builder().getOrCreate() 4. Run this command in your `$SPARK_HOME` directory: ``` -./bin/pyspark --jars /where-ever-it-is/tispark-core-${version}-jar-with-dependencies.jar +./bin/pyspark --jars /where-ever-it-is/tispark-${name_with_version}.jar ``` 5. To use TiSpark, run these commands: @@ -72,7 +72,7 @@ spark.sql("select count(*) from customer").show() 3. Prepare your TiSpark environment as above and execute ```bash -./bin/spark-submit --jars /where-ever-it-is/tispark-core-${version}-jar-with-dependencies.jar test.py +./bin/spark-submit --jars /where-ever-it-is/tispark-${name_with_version}.jar test.py ``` 4. Result: diff --git a/python/README_spark2.1.md b/python/README_spark2.1.md index 4fc1448bf7..2ec73823a3 100644 --- a/python/README_spark2.1.md +++ b/python/README_spark2.1.md @@ -7,7 +7,7 @@ This is the simplest way, just a decent Spark environment should be enough. 2. Run this command in your `SPARK_HOME` directory: ``` -./bin/pyspark --jars /where-ever-it-is/tispark-core-${version}-jar-with-dependencies.jar +./bin/pyspark --jars /where-ever-it-is/tispark-${name_with_version}.jar ``` 3. To use TiSpark, run these commands: @@ -46,7 +46,7 @@ This way is generally the same as the first way, but more readable. 3. Run this command in your `SPARK_HOME` directory: ``` -./bin/pyspark --jars /where-ever-it-is/tispark-core-${version}-jar-with-dependencies.jar +./bin/pyspark --jars /where-ever-it-is/tispark-${name_with_version}.jar ``` 4. Use as below: @@ -86,7 +86,7 @@ spark.sql("select count(*) from customer").show() 2. Prepare your TiSpark environment as above and execute ```bash -./bin/spark-submit --jars /where-ever-it-is/tispark-core-${version}-jar-with-dependencies.jar test.py +./bin/spark-submit --jars /where-ever-it-is/tispark-${name_with_version}.jar test.py ``` 3. 
Result: From 0586fa6b23f6d5530fd298dd764d83dbb43f8752 Mon Sep 17 00:00:00 2001 From: Liangliang Gu Date: Fri, 2 Aug 2019 15:58:11 +0800 Subject: [PATCH 16/62] use spark-2.4.3 to run ut (#978) * use spark-2.4.3 to run ut * fix ci --- .ci/integration_test.groovy | 20 +- .ci/log4j-ci.properties | 1 + core-test/pom.xml | 254 ++++++++++++++++++ core-test/src/KEEPME | 0 core/pom.xml | 15 +- core/scripts/fetch-test-data.sh | 8 +- core/scripts/version.sh | 5 +- .../tispark/utils/ReflectionUtil.scala | 16 +- .../spark/sql/execution/command/tables.scala | 2 +- pom.xml | 31 ++- .../com/pingcap/tispark/SparkWrapper.scala | 7 + .../com/pingcap/tispark/SparkWrapper.scala | 7 + tikv-client/pom.xml | 1 - tikv-client/scripts/proto.sh | 6 + 14 files changed, 336 insertions(+), 37 deletions(-) create mode 100644 core-test/pom.xml create mode 100644 core-test/src/KEEPME diff --git a/.ci/integration_test.groovy b/.ci/integration_test.groovy index 6eaaa42916..ca0362af51 100644 --- a/.ci/integration_test.groovy +++ b/.ci/integration_test.groovy @@ -6,7 +6,7 @@ def call(ghprbActualCommit, ghprbCommentBody, ghprbPullId, ghprbPullTitle, ghprb def TIDB_BRANCH = "master" def TIKV_BRANCH = "master" def PD_BRANCH = "master" - def MVN_PROFILE = "" + def MVN_PROFILE = "-Pjenkins" def PARALLEL_NUMBER = 18 // parse tidb branch @@ -33,7 +33,7 @@ def call(ghprbActualCommit, ghprbCommentBody, ghprbPullId, ghprbPullTitle, ghprb // parse mvn profile def m4 = ghprbCommentBody =~ /profile\s*=\s*([^\s\\]+)(\s|\\|$)/ if (m4) { - MVN_PROFILE = "-P${m4[0][1]}" + MVN_PROFILE = MVN_PROFILE + " -P${m4[0][1]}" } def readfile = { filename -> @@ -105,13 +105,14 @@ def call(ghprbActualCommit, ghprbCommentBody, ghprbPullId, ghprbPullTitle, ghprb } else { sh """cat test_unit_$i""" } - } sh """ - cd tikv-client - ./scripts/proto.sh - cd .. + cp .ci/log4j-ci.properties core/src/test/resources/log4j.properties + bash core/scripts/version.sh + bash core/scripts/fetch-test-data.sh + mv core/src/test core-test/src/ + bash tikv-client/scripts/proto.sh """ } @@ -138,10 +139,9 @@ def call(ghprbActualCommit, ghprbCommentBody, ghprbPullId, ghprbPullTitle, ghprb if [ ! 
"\$(ls -A /maven/.m2/repository)" ]; then curl -sL \$archive_url | tar -zx -C /maven || true; fi """ sh """ - cp .ci/log4j-ci.properties core/src/test/resources/log4j.properties export MAVEN_OPTS="-Xmx6G -XX:MaxPermSize=512M -XX:ReservedCodeCacheSize=51M" - mvn compile ${MVN_PROFILE} -DskipCloneProtoFiles=true - mvn test ${MVN_PROFILE} -Dtest=moo ${mvnStr} -DskipCloneProtoFiles=true + mvn compile ${MVN_PROFILE} + mvn test ${MVN_PROFILE} -Dtest=moo ${mvnStr} """ } } @@ -154,7 +154,7 @@ def call(ghprbActualCommit, ghprbCommentBody, ghprbPullId, ghprbPullTitle, ghprb """ sh """ export MAVEN_OPTS="-Xmx6G -XX:MaxPermSize=512M -XX:ReservedCodeCacheSize=512M" - mvn test ${MVN_PROFILE} -am -pl tikv-client -DskipCloneProtoFiles=true + mvn test ${MVN_PROFILE} -am -pl tikv-client """ unstash "CODECOV_TOKEN" sh 'curl -s https://codecov.io/bash | bash -s - -t @CODECOV_TOKEN' diff --git a/.ci/log4j-ci.properties b/.ci/log4j-ci.properties index 53e3e6bb05..cbf0f4a305 100644 --- a/.ci/log4j-ci.properties +++ b/.ci/log4j-ci.properties @@ -24,3 +24,4 @@ log4j.logger.org.apache.hadoop.hive.ql.exec.FunctionRegistry=ERROR # tispark log4j.logger.com.pingcap=ERROR +log4j.logger.com.pingcap.tispark.utils.ReflectionUtil=DEBUG diff --git a/core-test/pom.xml b/core-test/pom.xml new file mode 100644 index 0000000000..f963b81a5c --- /dev/null +++ b/core-test/pom.xml @@ -0,0 +1,254 @@ + + + 4.0.0 + + com.pingcap.tispark + tispark-parent + 2.3.0-SNAPSHOT + ../pom.xml + + + tispark-core-test + jar + TiSpark Project Core Test + http://github.copm/pingcap/tispark + + + 2.3.0 + + + + + com.pingcap.tispark + tispark-core + ${project.parent.version} + + + org.apache.spark + spark-core_${scala.binary.version} + + + org.apache.spark + spark-catalyst_${scala.binary.version} + + + org.apache.spark + spark-sql_${scala.binary.version} + + + org.apache.spark + spark-hive_${scala.binary.version} + + + org.apache.spark + spark-hive-thriftserver_${scala.binary.version} + + + org.apache.spark + spark-unsafe_${scala.binary.version} + + + + + com.pingcap.tikv + tikv-client + ${project.parent.version} + + + org.scalaj + scalaj-http_${scala.binary.version} + ${scalaj.version} + + + org.scala-lang + scala-library + + + + + org.apache.logging.log4j + log4j-api + 2.8.1 + + + org.apache.logging.log4j + log4j-core + 2.8.1 + + + org.apache.spark + spark-core_${scala.binary.version} + ${spark.version.test} + + + org.apache.spark + spark-catalyst_${scala.binary.version} + ${spark.version.test} + + + org.apache.spark + spark-sql_${scala.binary.version} + ${spark.version.test} + + + org.apache.spark + spark-hive_${scala.binary.version} + ${spark.version.test} + + + org.apache.hadoop + hadoop-client + + + + + org.apache.spark + spark-hive-thriftserver_${scala.binary.version} + ${spark.version.test} + + + org.apache.spark + spark-unsafe_${scala.binary.version} + ${spark.version.test} + + + + org.scalatest + scalatest_${scala.binary.version} + ${scalatest.version} + test + + + mysql + mysql-connector-java + ${mysql.connector.version} + + + + com.pingcap.tidb + tidb-datanucleus-adapter + 1.0 + runtime + + + + + + + net.alchim31.maven + scala-maven-plugin + 3.2.0 + + + compile-scala + compile + + add-source + compile + + + + test-compile-scala + test-compile + + add-source + testCompile + + + + attach-javadocs + + doc-jar + + + + + ${scala.version} + + + + + org.codehaus.mojo + exec-maven-plugin + 1.6.0 + + ${skipFetchTestData} + + + + version imprint + + ${basedir}/scripts/version.sh + + validate + + exec + + + + test data fetch + validate + + 
${basedir}/scripts/fetch-test-data.sh + + + exec + + + + + + org.apache.maven.plugins + maven-compiler-plugin + 2.3.2 + + 1.8 + 1.8 + UTF-8 + true + true + + + + + org.apache.maven.plugins + maven-clean-plugin + 2.4.1 + + + org.scalatest + scalatest-maven-plugin + 2.0.0 + + ${project.build.directory}/surefire-reports + . + WDF TestSuite.txt + -Dfile.encoding=UTF-8 -Duser.timezone=GMT+8 -Dio.netty.leakDetection.level=paranoid + + + + test + + test + + + + + + org.apache.maven.plugins + maven-deploy-plugin + + true + + + + org.apache.maven.plugins + maven-install-plugin + + true + + + + + diff --git a/core-test/src/KEEPME b/core-test/src/KEEPME new file mode 100644 index 0000000000..e69de29bb2 diff --git a/core/pom.xml b/core/pom.xml index 3e6928308e..d384f2d06c 100644 --- a/core/pom.xml +++ b/core/pom.xml @@ -48,22 +48,22 @@ org.apache.spark spark-core_${scala.binary.version} - ${spark.version} + ${spark.version.compile} org.apache.spark spark-catalyst_${scala.binary.version} - ${spark.version} + ${spark.version.compile} org.apache.spark spark-sql_${scala.binary.version} - ${spark.version} + ${spark.version.compile} org.apache.spark spark-hive_${scala.binary.version} - ${spark.version} + ${spark.version.compile} org.apache.hadoop @@ -74,12 +74,12 @@ org.apache.spark spark-hive-thriftserver_${scala.binary.version} - ${spark.version} + ${spark.version.compile} org.apache.spark spark-unsafe_${scala.binary.version} - ${spark.version} + ${spark.version.compile} org.apache.hadoop @@ -169,6 +169,9 @@ org.codehaus.mojo exec-maven-plugin 1.6.0 + + ${skipFetchTestData} + version imprint diff --git a/core/scripts/fetch-test-data.sh b/core/scripts/fetch-test-data.sh index 86312ddc1a..1b6ebd23f3 100755 --- a/core/scripts/fetch-test-data.sh +++ b/core/scripts/fetch-test-data.sh @@ -1,3 +1,7 @@ #!/usr/bin/env bash -cd .. -git submodule update --init --recursive \ No newline at end of file + +CURRENT_DIR=`pwd` +TISPARK_HOME="$(cd "`dirname "$0"`"/../..; pwd)" +cd $TISPARK_HOME/core +git submodule update --init --recursive +cd $CURRENT_DIR \ No newline at end of file diff --git a/core/scripts/version.sh b/core/scripts/version.sh index 9c2ff18d51..9e3adc7082 100755 --- a/core/scripts/version.sh +++ b/core/scripts/version.sh @@ -14,7 +14,8 @@ # limitations under the License. # -cd .. 
+TISPARK_HOME="$(cd "`dirname "$0"`"/../..; pwd)" + TiSparkReleaseVersion=2.3.0-SNAPSHOT TiSparkBuildTS=`date -u '+%Y-%m-%d %I:%M:%S'` TiSparkGitHash=`git rev-parse HEAD` @@ -23,4 +24,4 @@ echo ' package com.pingcap.tispark object TiSparkVersion { - val version: String = "Release Version: '${TiSparkReleaseVersion}'\\nGit Commit Hash: '${TiSparkGitHash}'\\nGit Branch: '${TiSparkGitBranch}'\\nUTC Build Time: '${TiSparkBuildTS}'" }' > core/src/main/scala/com/pingcap/tispark/TiSparkVersion.scala + val version: String = "Release Version: '${TiSparkReleaseVersion}'\\nGit Commit Hash: '${TiSparkGitHash}'\\nGit Branch: '${TiSparkGitBranch}'\\nUTC Build Time: '${TiSparkBuildTS}'" }' > ${TISPARK_HOME}/core/src/main/scala/com/pingcap/tispark/TiSparkVersion.scala diff --git a/core/src/main/scala/com/pingcap/tispark/utils/ReflectionUtil.scala b/core/src/main/scala/com/pingcap/tispark/utils/ReflectionUtil.scala index 1f839a372a..0a3a0dd6c6 100644 --- a/core/src/main/scala/com/pingcap/tispark/utils/ReflectionUtil.scala +++ b/core/src/main/scala/com/pingcap/tispark/utils/ReflectionUtil.scala @@ -23,7 +23,7 @@ import com.pingcap.tispark.TiSparkInfo import org.apache.spark.rdd.RDD import org.apache.spark.sql.TiContext import org.apache.spark.sql.catalyst.InternalRow -import org.apache.spark.sql.catalyst.catalog.{ExternalCatalog, TiSessionCatalog} +import org.apache.spark.sql.catalyst.catalog.{CatalogTable, ExternalCatalog, SessionCatalog, TiSessionCatalog} import org.apache.spark.sql.catalyst.expressions.aggregate.AggregateExpression import org.apache.spark.sql.catalyst.expressions.{Alias, AttributeReference, Expression, NamedExpression, UnsafeRow} import org.apache.spark.sql.catalyst.plans.logical.{LogicalPlan, SubqueryAlias} @@ -216,4 +216,18 @@ object ReflectionUtil { .invoke(null, name, dataType, nullable, metadata) .asInstanceOf[AttributeReference] } + + def callSessionCatalogCreateTable(obj: SessionCatalog, + tableDefinition: CatalogTable, + ignoreIfExists: java.lang.Boolean): Unit = { + classLoader + .loadClass(SPARK_WRAPPER_CLASS) + .getDeclaredMethod( + "callSessionCatalogCreateTable", + classOf[SessionCatalog], + classOf[CatalogTable], + classOf[Boolean] + ) + .invoke(null, obj, tableDefinition, ignoreIfExists) + } } diff --git a/core/src/main/scala/org/apache/spark/sql/execution/command/tables.scala b/core/src/main/scala/org/apache/spark/sql/execution/command/tables.scala index 756a7f8b02..7bcec604e5 100644 --- a/core/src/main/scala/org/apache/spark/sql/execution/command/tables.scala +++ b/core/src/main/scala/org/apache/spark/sql/execution/command/tables.scala @@ -220,7 +220,7 @@ case class TiCreateTableLikeCommand(tiContext: TiContext, delegate: CreateTableL partitionColumnNames = sourceTableDesc.partitionColumnNames, bucketSpec = sourceTableDesc.bucketSpec ) - catalog.createTable(newTableDesc, delegate.ifNotExists) + callSessionCatalogCreateTable(catalog, newTableDesc, delegate.ifNotExists) Seq.empty[Row] } } diff --git a/pom.xml b/pom.xml index 1ce1a765af..b113c1d930 100644 --- a/pom.xml +++ b/pom.xml @@ -64,7 +64,8 @@ UTF-8 UTF-8 3.1.0 - 2.3.3 + 2.3.3 + 2.4.3 2.11 2.11 3.0.4 @@ -73,21 +74,10 @@ fake gpg keyname true true + false + false - - - spark-2.3 - - - - spark-2.4 - - 2.4.3 - - - - ossrh @@ -107,6 +97,19 @@ assembly + + + jenkins + + core-test + + + true + true + + + + diff --git a/spark-wrapper/spark-2.3/src/main/scala/com/pingcap/tispark/SparkWrapper.scala b/spark-wrapper/spark-2.3/src/main/scala/com/pingcap/tispark/SparkWrapper.scala index c64c825d02..f66f45fb9d 100644 --- 
a/spark-wrapper/spark-2.3/src/main/scala/com/pingcap/tispark/SparkWrapper.scala +++ b/spark-wrapper/spark-2.3/src/main/scala/com/pingcap/tispark/SparkWrapper.scala @@ -14,6 +14,7 @@ */ package com.pingcap.tispark +import org.apache.spark.sql.catalyst.catalog.{CatalogTable, SessionCatalog} import org.apache.spark.sql.catalyst.expressions.{Alias, AttributeReference, Expression} import org.apache.spark.sql.catalyst.plans.logical.{LogicalPlan, SubqueryAlias} import org.apache.spark.sql.types.{DataType, Metadata} @@ -37,4 +38,10 @@ object SparkWrapper { metadata: Metadata): AttributeReference = { AttributeReference(name, dataType, nullable, metadata)() } + + def callSessionCatalogCreateTable(obj: SessionCatalog, + tableDefinition: CatalogTable, + ignoreIfExists: Boolean): Unit = { + obj.createTable(tableDefinition, ignoreIfExists) + } } diff --git a/spark-wrapper/spark-2.4/src/main/scala/com/pingcap/tispark/SparkWrapper.scala b/spark-wrapper/spark-2.4/src/main/scala/com/pingcap/tispark/SparkWrapper.scala index 044213090a..f4c8bf59df 100644 --- a/spark-wrapper/spark-2.4/src/main/scala/com/pingcap/tispark/SparkWrapper.scala +++ b/spark-wrapper/spark-2.4/src/main/scala/com/pingcap/tispark/SparkWrapper.scala @@ -14,6 +14,7 @@ */ package com.pingcap.tispark +import org.apache.spark.sql.catalyst.catalog.{CatalogTable, SessionCatalog} import org.apache.spark.sql.catalyst.expressions.{Alias, AttributeReference, Expression} import org.apache.spark.sql.catalyst.plans.logical.{LogicalPlan, SubqueryAlias} import org.apache.spark.sql.types.{DataType, Metadata} @@ -37,4 +38,10 @@ object SparkWrapper { metadata: Metadata): AttributeReference = { AttributeReference(name, dataType, nullable, metadata)() } + + def callSessionCatalogCreateTable(obj: SessionCatalog, + tableDefinition: CatalogTable, + ignoreIfExists: Boolean): Unit = { + obj.createTable(tableDefinition, ignoreIfExists) + } } diff --git a/tikv-client/pom.xml b/tikv-client/pom.xml index 549c789ca6..d367155c35 100644 --- a/tikv-client/pom.xml +++ b/tikv-client/pom.xml @@ -28,7 +28,6 @@ 2.9.9 1.9.2 ${basedir}/proto - false diff --git a/tikv-client/scripts/proto.sh b/tikv-client/scripts/proto.sh index 22aaee1c3d..4c765321c2 100755 --- a/tikv-client/scripts/proto.sh +++ b/tikv-client/scripts/proto.sh @@ -14,6 +14,10 @@ # limitations under the License. # +CURRENT_DIR=`pwd` +TISPARK_HOME="$(cd "`dirname "$0"`"/../..; pwd)" +cd $TISPARK_HOME/tikv-client + kvproto_hash=a4759dfe3753ce136d252578340bb2b33633ccfa raft_rs_hash=14f007b443935aef51cb161c5b368b54fc8ed176 @@ -37,3 +41,5 @@ if [ -d "tipb" ]; then else git clone https://github.com/pingcap/tipb; cd tipb; git checkout ${tipb_hash}; cd .. 
fi + +cd $CURRENT_DIR \ No newline at end of file From 1973d53004b3d45132127cdb09c5af14c9b2c9b6 Mon Sep 17 00:00:00 2001 From: Zhexuan Yang Date: Fri, 2 Aug 2019 17:04:17 +0800 Subject: [PATCH 17/62] a better design for get auto table id (#980) --- .../com/pingcap/tispark/TiBatchWrite.scala | 2 +- .../datasource/RowIDAllocatorSuite.scala | 6 +- .../tikv/allocator/RowIDAllocator.java | 154 +++++++++++- .../com/pingcap/tikv/catalog/Catalog.java | 17 -- .../tikv/catalog/CatalogTransaction.java | 236 +----------------- .../com/pingcap/tikv/codec/MetaCodec.java | 116 +++++++++ 6 files changed, 275 insertions(+), 256 deletions(-) create mode 100644 tikv-client/src/main/java/com/pingcap/tikv/codec/MetaCodec.java diff --git a/core/src/main/scala/com/pingcap/tispark/TiBatchWrite.scala b/core/src/main/scala/com/pingcap/tispark/TiBatchWrite.scala index 82506e1b40..44ddcd846b 100644 --- a/core/src/main/scala/com/pingcap/tispark/TiBatchWrite.scala +++ b/core/src/main/scala/com/pingcap/tispark/TiBatchWrite.scala @@ -387,7 +387,7 @@ class TiBatchWrite(@transient val df: DataFrame, .create( tiDBInfo.getId, tiTableInfo.getId, - catalog, + tiSession.getConf, tiTableInfo.isAutoIncColUnsigned, step ) diff --git a/core/src/test/scala/com/pingcap/tispark/datasource/RowIDAllocatorSuite.scala b/core/src/test/scala/com/pingcap/tispark/datasource/RowIDAllocatorSuite.scala index ed4f0ac79b..defd518d0f 100644 --- a/core/src/test/scala/com/pingcap/tispark/datasource/RowIDAllocatorSuite.scala +++ b/core/src/test/scala/com/pingcap/tispark/datasource/RowIDAllocatorSuite.scala @@ -18,7 +18,7 @@ class RowIDAllocatorSuite extends BaseTiSparkTest { ti.tiSession.getCatalog.getTable(dbName, tableName) // corner case allocate unsigned long's max value. val allocator = - RowIDAllocator.create(tiDBInfo.getId, tiTableInfo.getId, ti.tiSession.getCatalog, true, -2L) + RowIDAllocator.create(tiDBInfo.getId, tiTableInfo.getId, ti.tiSession.getConf, true, -2L) assert(allocator.getEnd - allocator.getStart == -2L) } @@ -35,11 +35,11 @@ class RowIDAllocatorSuite extends BaseTiSparkTest { val tiTableInfo = ti.tiSession.getCatalog.getTable(dbName, tableName) var allocator = - RowIDAllocator.create(tiDBInfo.getId, tiTableInfo.getId, ti.tiSession.getCatalog, false, 1000) + RowIDAllocator.create(tiDBInfo.getId, tiTableInfo.getId, ti.tiSession.getConf, false, 1000) assert(allocator.getEnd - allocator.getStart == 1000) allocator = RowIDAllocator - .create(tiDBInfo.getId, tiTableInfo.getId, ti.tiSession.getCatalog, false, 10000) + .create(tiDBInfo.getId, tiTableInfo.getId, ti.tiSession.getConf, false, 10000) assert(allocator.getEnd - allocator.getStart == 10000) } diff --git a/tikv-client/src/main/java/com/pingcap/tikv/allocator/RowIDAllocator.java b/tikv-client/src/main/java/com/pingcap/tikv/allocator/RowIDAllocator.java index 2c08d0f95e..70e5cddb2f 100644 --- a/tikv-client/src/main/java/com/pingcap/tikv/allocator/RowIDAllocator.java +++ b/tikv-client/src/main/java/com/pingcap/tikv/allocator/RowIDAllocator.java @@ -15,8 +15,20 @@ package com.pingcap.tikv.allocator; import com.google.common.primitives.UnsignedLongs; -import com.pingcap.tikv.catalog.Catalog; +import com.google.protobuf.ByteString; +import com.pingcap.tikv.Snapshot; +import com.pingcap.tikv.TiConfiguration; +import com.pingcap.tikv.TiSession; +import com.pingcap.tikv.TiSessionCache; +import com.pingcap.tikv.TwoPhaseCommitter; +import com.pingcap.tikv.codec.CodecDataInput; +import com.pingcap.tikv.codec.CodecDataOutput; +import com.pingcap.tikv.codec.MetaCodec; import 
com.pingcap.tikv.exception.TiBatchWriteException; +import com.pingcap.tikv.util.BackOffer; +import com.pingcap.tikv.util.ConcreteBackOffer; +import java.util.Arrays; +import java.util.function.Function; /** * RowIDAllocator read current start from TiKV and write back 'start+step' back to TiKV. It designs @@ -26,19 +38,21 @@ public final class RowIDAllocator { private long end; private final long dbId; private long step; + private final TiConfiguration conf; - private RowIDAllocator(long dbId, long step) { + private RowIDAllocator(long dbId, long step, TiConfiguration conf) { this.dbId = dbId; this.step = step; + this.conf = conf; } public static RowIDAllocator create( - long dbId, long tableId, Catalog catalog, boolean unsigned, long step) { - RowIDAllocator allocator = new RowIDAllocator(dbId, step); + long dbId, long tableId, TiConfiguration conf, boolean unsigned, long step) { + RowIDAllocator allocator = new RowIDAllocator(dbId, step, conf); if (unsigned) { - allocator.initUnsigned(catalog, tableId); + allocator.initUnsigned(TiSession.create(conf).createSnapshot(), tableId); } else { - allocator.initSigned(catalog, tableId); + allocator.initSigned(TiSession.create(conf).createSnapshot(), tableId); } return allocator; } @@ -51,10 +65,126 @@ public long getEnd() { return end; } - private void initSigned(Catalog catalog, long tableId) { + // set key value pair to tikv via two phase committer protocol. + private void set(ByteString key, byte[] value) { + TiSession session = TiSessionCache.getSession(conf); + TwoPhaseCommitter twoPhaseCommitter = + new TwoPhaseCommitter(conf, session.getTimestamp().getVersion()); + + twoPhaseCommitter.prewritePrimaryKey( + ConcreteBackOffer.newCustomBackOff(BackOffer.PREWRITE_MAX_BACKOFF), + key.toByteArray(), + value); + + twoPhaseCommitter.commitPrimaryKey( + ConcreteBackOffer.newCustomBackOff(BackOffer.BATCH_COMMIT_BACKOFF), + key.toByteArray(), + session.getTimestamp().getVersion()); + } + + private void updateMeta(ByteString key, byte[] oldVal, Snapshot snapshot) { + // 1. encode hash meta key + // 2. load meta via hash meta key from TiKV + // 3. update meta's filed count and set it back to TiKV + CodecDataOutput cdo = new CodecDataOutput(); + ByteString metaKey = MetaCodec.encodeHashMetaKey(cdo, key.toByteArray()); + long fieldCount; + ByteString metaVal = snapshot.get(metaKey); + + // decode long from bytes + // big endian the 8 bytes + fieldCount = new CodecDataInput(metaVal.toByteArray()).readLong(); + + // update meta field count only oldVal is null + if (oldVal == null || oldVal.length == 0) { + fieldCount++; + cdo.reset(); + cdo.writeLong(fieldCount); + + set(metaKey, cdo.toBytes()); + } + } + + private long updateHash( + ByteString key, + ByteString field, + Function calculateNewVal, + Snapshot snapshot) { + // 1. encode hash data key + // 2. get value in byte from get operation + // 3. calculate new value via calculateNewVal + // 4. check old value equals to new value or not + // 5. set the new value back to TiKV via 2pc + // 6. encode a hash meta key + // 7. 
update a hash meta field count if needed + + CodecDataOutput cdo = new CodecDataOutput(); + MetaCodec.encodeHashDataKey(cdo, key.toByteArray(), field.toByteArray()); + ByteString dataKey = cdo.toByteString(); + byte[] oldVal = snapshot.get(dataKey.toByteArray()); + + byte[] newVal = calculateNewVal.apply(oldVal); + if (Arrays.equals(newVal, oldVal)) { + // not need to update + return 0L; + } + + set(dataKey, newVal); + updateMeta(key, oldVal, snapshot); + return Long.parseLong(new String(newVal)); + } + + private boolean isDBExisted(long dbId, Snapshot snapshot) { + ByteString dbKey = MetaCodec.encodeDatabaseID(dbId); + ByteString json = MetaCodec.hashGet(MetaCodec.KEY_DBs, dbKey, snapshot); + if (json == null || json.isEmpty()) { + return false; + } + return true; + } + + private boolean isTableExisted(long dbId, long tableId, Snapshot snapshot) { + ByteString dbKey = MetaCodec.encodeDatabaseID(dbId); + ByteString tableKey = MetaCodec.tableKey(tableId); + return !MetaCodec.hashGet(dbKey, tableKey, snapshot).isEmpty(); + } + /** + * read current row id from TiKV and write the calculated value back to TiKV. The calculation rule + * is start(read from TiKV) + step. + */ + public long getAutoTableId(long dbId, long tableId, long step, Snapshot snapshot) { + if (isDBExisted(dbId, snapshot) && isTableExisted(dbId, tableId, snapshot)) { + return updateHash( + MetaCodec.encodeDatabaseID(dbId), + MetaCodec.autoTableIDKey(tableId), + (oldVal) -> { + long base = 0; + if (oldVal != null && oldVal.length != 0) { + base = Long.parseLong(new String(oldVal)); + } + + base += step; + return String.valueOf(base).getBytes(); + }, + snapshot); + } + + throw new IllegalArgumentException("table or database is not existed"); + } + + /** read current row id from TiKV according to database id and table id. */ + public long getAutoTableId(long dbId, long tableId, Snapshot snapshot) { + ByteString dbKey = MetaCodec.encodeDatabaseID(dbId); + ByteString tblKey = MetaCodec.autoTableIDKey(tableId); + ByteString val = MetaCodec.hashGet(dbKey, tblKey, snapshot); + if (val.isEmpty()) return 0L; + return Long.parseLong(val.toStringUtf8()); + } + + private void initSigned(Snapshot snapshot, long tableId) { long newEnd; // get new start from TiKV, and calculate new end and set it back to TiKV. - long newStart = catalog.getAutoTableId(dbId, tableId); + long newStart = getAutoTableId(dbId, tableId, snapshot); long tmpStep = Math.min(Long.MAX_VALUE - newStart, step); if (tmpStep != step) { throw new TiBatchWriteException("cannot allocate ids for this write"); @@ -62,15 +192,15 @@ private void initSigned(Catalog catalog, long tableId) { if (newStart == Long.MAX_VALUE) { throw new TiBatchWriteException("cannot allocate more ids since it "); } - newEnd = catalog.getAutoTableId(dbId, tableId, tmpStep); + newEnd = getAutoTableId(dbId, tableId, tmpStep, snapshot); end = newEnd; } - private void initUnsigned(Catalog catalog, long tableId) { + private void initUnsigned(Snapshot snapshot, long tableId) { long newEnd; // get new start from TiKV, and calculate new end and set it back to TiKV. - long newStart = catalog.getAutoTableId(dbId, tableId); + long newStart = getAutoTableId(dbId, tableId, snapshot); // for unsigned long, -1L is max value. 
long tmpStep = UnsignedLongs.min(-1L - newStart, step); if (tmpStep != step) { @@ -81,7 +211,7 @@ private void initUnsigned(Catalog catalog, long tableId) { throw new TiBatchWriteException( "cannot allocate more ids since the start reaches " + "unsigned long's max value "); } - newEnd = catalog.getAutoTableId(dbId, tableId, tmpStep); + newEnd = getAutoTableId(dbId, tableId, tmpStep, snapshot); end = newEnd; } diff --git a/tikv-client/src/main/java/com/pingcap/tikv/catalog/Catalog.java b/tikv-client/src/main/java/com/pingcap/tikv/catalog/Catalog.java index 05179ab609..a0d093110e 100644 --- a/tikv-client/src/main/java/com/pingcap/tikv/catalog/Catalog.java +++ b/tikv-client/src/main/java/com/pingcap/tikv/catalog/Catalog.java @@ -156,23 +156,6 @@ public Catalog( periodUnit); } - /** - * read current row id from TiKV and write the calculated value back to TiKV. The calculation rule - * is start(read from TiKV) + step. - */ - public synchronized long getAutoTableId(long dbId, long tableId, long step) { - Snapshot snapshot = snapshotProvider.get(); - CatalogTransaction newTrx = new CatalogTransaction(snapshot); - return newTrx.getAutoTableId(dbId, tableId, step); - } - - /** read current row id from TiKV according to database id and table id. */ - public synchronized long getAutoTableId(long dbId, long tableId) { - Snapshot snapshot = snapshotProvider.get(); - CatalogTransaction newTrx = new CatalogTransaction(snapshot); - return newTrx.getAutoTableId(dbId, tableId); - } - public synchronized void reloadCache(boolean loadTables) { Snapshot snapshot = snapshotProvider.get(); CatalogTransaction newTrx = new CatalogTransaction(snapshot); diff --git a/tikv-client/src/main/java/com/pingcap/tikv/catalog/CatalogTransaction.java b/tikv-client/src/main/java/com/pingcap/tikv/catalog/CatalogTransaction.java index d0788710b1..24b372cbc8 100644 --- a/tikv-client/src/main/java/com/pingcap/tikv/catalog/CatalogTransaction.java +++ b/tikv-client/src/main/java/com/pingcap/tikv/catalog/CatalogTransaction.java @@ -15,253 +15,43 @@ package com.pingcap.tikv.catalog; -import static com.google.common.base.Preconditions.checkArgument; +import static com.pingcap.tikv.codec.MetaCodec.KEY_DBs; import com.fasterxml.jackson.core.JsonParseException; import com.fasterxml.jackson.databind.JsonMappingException; import com.fasterxml.jackson.databind.ObjectMapper; import com.google.common.collect.ImmutableList; import com.google.protobuf.ByteString; -import com.pingcap.tikv.*; -import com.pingcap.tikv.codec.Codec.BytesCodec; -import com.pingcap.tikv.codec.Codec.IntegerCodec; +import com.pingcap.tikv.Snapshot; import com.pingcap.tikv.codec.CodecDataInput; -import com.pingcap.tikv.codec.CodecDataOutput; import com.pingcap.tikv.codec.KeyUtils; +import com.pingcap.tikv.codec.MetaCodec; import com.pingcap.tikv.exception.TiClientInternalException; import com.pingcap.tikv.meta.TiDBInfo; import com.pingcap.tikv.meta.TiTableInfo; -import com.pingcap.tikv.util.BackOffer; -import com.pingcap.tikv.util.ConcreteBackOffer; import com.pingcap.tikv.util.Pair; import java.nio.charset.StandardCharsets; -import java.util.*; -import java.util.function.Function; +import java.util.List; +import java.util.Objects; import org.apache.log4j.Logger; -import org.tikv.kvproto.Kvrpcpb; public class CatalogTransaction { protected static final Logger logger = Logger.getLogger(CatalogTransaction.class); private final Snapshot snapshot; - private final byte[] prefix; - private final TiConfiguration conf; - - private static final byte[] META_PREFIX = new byte[] 
{'m'}; - - private static final byte HASH_DATA_FLAG = 'h'; - private static final byte HASH_META_FLAG = 'H'; - private static final byte STR_DATA_FLAG = 's'; - - private static ByteString KEY_DBs = ByteString.copyFromUtf8("DBs"); - private static String KEY_TABLE = "Table"; - private static ByteString KEY_SCHEMA_VERSION = ByteString.copyFromUtf8("SchemaVersionKey"); - - private static final String ENCODED_DB_PREFIX = "DB"; - private static final String KEY_TID = "TID"; CatalogTransaction(Snapshot snapshot) { this.snapshot = snapshot; - this.conf = snapshot.getConf(); - this.prefix = META_PREFIX; - } - - private void encodeStringDataKey(CodecDataOutput cdo, byte[] key) { - cdo.write(prefix); - BytesCodec.writeBytes(cdo, key); - IntegerCodec.writeULong(cdo, STR_DATA_FLAG); - } - - private void encodeHashDataKey(CodecDataOutput cdo, byte[] key, byte[] field) { - cdo.write(prefix); - BytesCodec.writeBytes(cdo, key); - IntegerCodec.writeULong(cdo, HASH_DATA_FLAG); - BytesCodec.writeBytes(cdo, field); - } - - private ByteString encodeHashMetaKey(CodecDataOutput cdo, byte[] key) { - cdo.write(prefix); - BytesCodec.writeBytes(cdo, key); - IntegerCodec.writeULong(cdo, HASH_META_FLAG); - return cdo.toByteString(); - } - - private void encodeHashDataKeyPrefix(CodecDataOutput cdo, byte[] key) { - cdo.write(prefix); - BytesCodec.writeBytes(cdo, key); - IntegerCodec.writeULong(cdo, HASH_DATA_FLAG); - } - - private Pair decodeHashDataKey(ByteString rawKey) { - checkArgument( - KeyUtils.hasPrefix(rawKey, ByteString.copyFrom(prefix)), - "invalid encoded hash data key prefix: " + new String(prefix)); - CodecDataInput cdi = new CodecDataInput(rawKey.toByteArray()); - cdi.skipBytes(prefix.length); - byte[] key = BytesCodec.readBytes(cdi); - long typeFlag = IntegerCodec.readULong(cdi); - if (typeFlag != HASH_DATA_FLAG) { - throw new TiClientInternalException("Invalid hash data flag: " + typeFlag); - } - byte[] field = BytesCodec.readBytes(cdi); - return Pair.create(ByteString.copyFrom(key), ByteString.copyFrom(field)); - } - - private static ByteString autoTableIDKey(long tableId) { - return ByteString.copyFrom(String.format("%s:%d", KEY_TID, tableId).getBytes()); - } - - private static ByteString tableKey(long tableId) { - return ByteString.copyFrom(String.format("%s:%d", KEY_TABLE, tableId).getBytes()); - } - - private static ByteString encodeDatabaseID(long id) { - return ByteString.copyFrom(String.format("%s:%d", ENCODED_DB_PREFIX, id).getBytes()); - } - - private boolean isDBExisted(long dbId) { - return getDatabase(dbId) != null; - } - - private boolean isTableExisted(long dbId, long tableId) { - ByteString dbKey = encodeDatabaseID(dbId); - ByteString tableKey = tableKey(tableId); - return !hashGet(dbKey, tableKey).isEmpty(); - } - - // set key value pair to tikv via two phase committer protocol. - private void set(ByteString key, byte[] value) { - TiSession session = TiSessionCache.getSession(conf); - TwoPhaseCommitter twoPhaseCommitter = - new TwoPhaseCommitter(conf, session.getTimestamp().getVersion()); - - twoPhaseCommitter.prewritePrimaryKey( - ConcreteBackOffer.newCustomBackOff(BackOffer.PREWRITE_MAX_BACKOFF), - key.toByteArray(), - value); - - twoPhaseCommitter.commitPrimaryKey( - ConcreteBackOffer.newCustomBackOff(BackOffer.BATCH_COMMIT_BACKOFF), - key.toByteArray(), - session.getTimestamp().getVersion()); - } - - private void updateMeta(ByteString key, byte[] oldVal) { - // 1. encode hash meta key - // 2. load meta via hash meta key from TiKV - // 3. 
update meta's filed count and set it back to TiKV - CodecDataOutput cdo = new CodecDataOutput(); - ByteString metaKey = encodeHashMetaKey(cdo, key.toByteArray()); - long fieldCount; - ByteString metaVal = snapshot.get(metaKey); - - // decode long from bytes - // big endian the 8 bytes - fieldCount = new CodecDataInput(metaVal.toByteArray()).readLong(); - - // update meta field count only oldVal is null - if (oldVal == null || oldVal.length == 0) { - fieldCount++; - cdo.reset(); - cdo.writeLong(fieldCount); - - set(metaKey, cdo.toBytes()); - } - } - - private long updateHash( - ByteString key, ByteString field, Function calculateNewVal) { - // 1. encode hash data key - // 2. get value in byte from get operation - // 3. calculate new value via calculateNewVal - // 4. check old value equals to new value or not - // 5. set the new value back to TiKV via 2pc - // 6. encode a hash meta key - // 7. update a hash meta field count if needed - - CodecDataOutput cdo = new CodecDataOutput(); - encodeHashDataKey(cdo, key.toByteArray(), field.toByteArray()); - ByteString dataKey = cdo.toByteString(); - byte[] oldVal = snapshot.get(dataKey.toByteArray()); - - byte[] newVal = calculateNewVal.apply(oldVal); - if (Arrays.equals(newVal, oldVal)) { - // not need to update - return 0L; - } - - set(dataKey, newVal); - updateMeta(key, oldVal); - return Long.parseLong(new String(newVal)); - } - - public long getAutoTableId(long dbId, long tableId, long step) { - if (isDBExisted(dbId) && isTableExisted(dbId, tableId)) { - return updateHash( - encodeDatabaseID(dbId), - autoTableIDKey(tableId), - (oldVal) -> { - long base = 0; - if (oldVal != null && oldVal.length != 0) { - base = Long.parseLong(new String(oldVal)); - } - - base += step; - return String.valueOf(base).getBytes(); - }); - } - - throw new IllegalArgumentException("table or database is not existed"); - } - - public long getAutoTableId(long dbId, long tableId) { - ByteString dbKey = encodeDatabaseID(dbId); - ByteString tblKey = autoTableIDKey(tableId); - ByteString val = hashGet(dbKey, tblKey); - if (val.isEmpty()) return 0L; - return Long.parseLong(val.toStringUtf8()); - } - - private ByteString hashGet(ByteString key, ByteString field) { - CodecDataOutput cdo = new CodecDataOutput(); - encodeHashDataKey(cdo, key.toByteArray(), field.toByteArray()); - return snapshot.get(cdo.toByteString()); - } - - private ByteString bytesGet(ByteString key) { - CodecDataOutput cdo = new CodecDataOutput(); - encodeStringDataKey(cdo, key.toByteArray()); - return snapshot.get(cdo.toByteString()); - } - - private List> hashGetFields(ByteString key) { - CodecDataOutput cdo = new CodecDataOutput(); - encodeHashDataKeyPrefix(cdo, key.toByteArray()); - ByteString encodedKey = cdo.toByteString(); - - Iterator iterator = snapshot.scan(encodedKey); - List> fields = new ArrayList<>(); - while (iterator.hasNext()) { - Kvrpcpb.KvPair kv = iterator.next(); - if (kv == null || kv.getKey() == null) { - continue; - } - if (!KeyUtils.hasPrefix(kv.getKey(), encodedKey)) { - break; - } - fields.add(Pair.create(decodeHashDataKey(kv.getKey()).second, kv.getValue())); - } - - return fields; } long getLatestSchemaVersion() { - ByteString versionBytes = bytesGet(KEY_SCHEMA_VERSION); + ByteString versionBytes = MetaCodec.bytesGet(MetaCodec.KEY_SCHEMA_VERSION, this.snapshot); CodecDataInput cdi = new CodecDataInput(versionBytes.toByteArray()); return Long.parseLong(new String(cdi.toByteArray(), StandardCharsets.UTF_8)); } public List getDatabases() { - List> fields = hashGetFields(KEY_DBs); + 
List> fields = + MetaCodec.hashGetFields(MetaCodec.KEY_DBs, this.snapshot); ImmutableList.Builder builder = ImmutableList.builder(); for (Pair pair : fields) { builder.add(parseFromJson(pair.second, TiDBInfo.class)); @@ -270,8 +60,8 @@ public List getDatabases() { } TiDBInfo getDatabase(long id) { - ByteString dbKey = encodeDatabaseID(id); - ByteString json = hashGet(KEY_DBs, dbKey); + ByteString dbKey = MetaCodec.encodeDatabaseID(id); + ByteString json = MetaCodec.hashGet(KEY_DBs, dbKey, this.snapshot); if (json == null || json.isEmpty()) { return null; } @@ -279,11 +69,11 @@ TiDBInfo getDatabase(long id) { } List getTables(long dbId) { - ByteString dbKey = encodeDatabaseID(dbId); - List> fields = hashGetFields(dbKey); + ByteString dbKey = MetaCodec.encodeDatabaseID(dbId); + List> fields = MetaCodec.hashGetFields(dbKey, this.snapshot); ImmutableList.Builder builder = ImmutableList.builder(); for (Pair pair : fields) { - if (KeyUtils.hasPrefix(pair.first, ByteString.copyFromUtf8(KEY_TABLE))) { + if (KeyUtils.hasPrefix(pair.first, ByteString.copyFromUtf8(MetaCodec.KEY_TABLE))) { builder.add(parseFromJson(pair.second, TiTableInfo.class)); } } diff --git a/tikv-client/src/main/java/com/pingcap/tikv/codec/MetaCodec.java b/tikv-client/src/main/java/com/pingcap/tikv/codec/MetaCodec.java new file mode 100644 index 0000000000..0d9f4c963f --- /dev/null +++ b/tikv-client/src/main/java/com/pingcap/tikv/codec/MetaCodec.java @@ -0,0 +1,116 @@ +package com.pingcap.tikv.codec; + +import static com.google.common.base.Preconditions.checkArgument; + +import com.google.protobuf.ByteString; +import com.pingcap.tikv.Snapshot; +import com.pingcap.tikv.codec.Codec.BytesCodec; +import com.pingcap.tikv.codec.Codec.IntegerCodec; +import com.pingcap.tikv.exception.TiClientInternalException; +import com.pingcap.tikv.util.Pair; +import java.util.ArrayList; +import java.util.Iterator; +import java.util.List; +import org.tikv.kvproto.Kvrpcpb; +import org.tikv.kvproto.Kvrpcpb.KvPair; + +public class MetaCodec { + private static final byte[] META_PREFIX = new byte[] {'m'}; + private static final byte HASH_DATA_FLAG = 'h'; + private static final byte HASH_META_FLAG = 'H'; + private static final byte STR_DATA_FLAG = 's'; + + public static ByteString KEY_DBs = ByteString.copyFromUtf8("DBs"); + public static String KEY_TABLE = "Table"; + public static ByteString KEY_SCHEMA_VERSION = ByteString.copyFromUtf8("SchemaVersionKey"); + + public static final String ENCODED_DB_PREFIX = "DB"; + public static final String KEY_TID = "TID"; + + public static void encodeStringDataKey(CodecDataOutput cdo, byte[] key) { + cdo.write(META_PREFIX); + BytesCodec.writeBytes(cdo, key); + IntegerCodec.writeULong(cdo, STR_DATA_FLAG); + } + + public static void encodeHashDataKey(CodecDataOutput cdo, byte[] key, byte[] field) { + cdo.write(META_PREFIX); + BytesCodec.writeBytes(cdo, key); + IntegerCodec.writeULong(cdo, HASH_DATA_FLAG); + BytesCodec.writeBytes(cdo, field); + } + + public static ByteString encodeHashMetaKey(CodecDataOutput cdo, byte[] key) { + cdo.write(META_PREFIX); + BytesCodec.writeBytes(cdo, key); + IntegerCodec.writeULong(cdo, HASH_META_FLAG); + return cdo.toByteString(); + } + + public static void encodeHashDataKeyPrefix(CodecDataOutput cdo, byte[] key) { + cdo.write(META_PREFIX); + BytesCodec.writeBytes(cdo, key); + IntegerCodec.writeULong(cdo, HASH_DATA_FLAG); + } + + public static Pair decodeHashDataKey(ByteString rawKey) { + checkArgument( + KeyUtils.hasPrefix(rawKey, ByteString.copyFrom(META_PREFIX)), + "invalid encoded hash 
data key prefix: " + new String(META_PREFIX)); + CodecDataInput cdi = new CodecDataInput(rawKey.toByteArray()); + cdi.skipBytes(META_PREFIX.length); + byte[] key = BytesCodec.readBytes(cdi); + long typeFlag = IntegerCodec.readULong(cdi); + if (typeFlag != HASH_DATA_FLAG) { + throw new TiClientInternalException("Invalid hash data flag: " + typeFlag); + } + byte[] field = BytesCodec.readBytes(cdi); + return Pair.create(ByteString.copyFrom(key), ByteString.copyFrom(field)); + } + + public static ByteString autoTableIDKey(long tableId) { + return ByteString.copyFrom(String.format("%s:%d", KEY_TID, tableId).getBytes()); + } + + public static ByteString tableKey(long tableId) { + return ByteString.copyFrom(String.format("%s:%d", KEY_TABLE, tableId).getBytes()); + } + + public static ByteString encodeDatabaseID(long id) { + return ByteString.copyFrom(String.format("%s:%d", ENCODED_DB_PREFIX, id).getBytes()); + } + + public static ByteString hashGet(ByteString key, ByteString field, Snapshot snapshot) { + CodecDataOutput cdo = new CodecDataOutput(); + MetaCodec.encodeHashDataKey(cdo, key.toByteArray(), field.toByteArray()); + return snapshot.get(cdo.toByteString()); + } + + public static ByteString bytesGet(ByteString key, Snapshot snapshot) { + CodecDataOutput cdo = new CodecDataOutput(); + MetaCodec.encodeStringDataKey(cdo, key.toByteArray()); + return snapshot.get(cdo.toByteString()); + } + + public static List> hashGetFields( + ByteString key, Snapshot snapshot) { + CodecDataOutput cdo = new CodecDataOutput(); + MetaCodec.encodeHashDataKeyPrefix(cdo, key.toByteArray()); + ByteString encodedKey = cdo.toByteString(); + + Iterator iterator = snapshot.scan(encodedKey); + List> fields = new ArrayList<>(); + while (iterator.hasNext()) { + Kvrpcpb.KvPair kv = iterator.next(); + if (kv == null || kv.getKey() == null) { + continue; + } + if (!KeyUtils.hasPrefix(kv.getKey(), encodedKey)) { + break; + } + fields.add(Pair.create(MetaCodec.decodeHashDataKey(kv.getKey()).second, kv.getValue())); + } + + return fields; + } +} From 1a95291c8c1ad97e4c64de7ab6c8feec7b0bb2b3 Mon Sep 17 00:00:00 2001 From: Liangliang Gu Date: Mon, 5 Aug 2019 15:56:13 +0800 Subject: [PATCH 18/62] fix bug: ci SpecialTiDBTypeTestSuite failed with tidb-3.0.1 (#984) --- .../sql/types/SpecialTiDBTypeTestSuite.scala | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/core/src/test/scala/org/apache/spark/sql/types/SpecialTiDBTypeTestSuite.scala b/core/src/test/scala/org/apache/spark/sql/types/SpecialTiDBTypeTestSuite.scala index c682960656..51bcbc2d72 100644 --- a/core/src/test/scala/org/apache/spark/sql/types/SpecialTiDBTypeTestSuite.scala +++ b/core/src/test/scala/org/apache/spark/sql/types/SpecialTiDBTypeTestSuite.scala @@ -22,17 +22,26 @@ import org.apache.spark.sql.BaseTiSparkTest class SpecialTiDBTypeTestSuite extends BaseTiSparkTest { test("adding time type index test") { + val query = "select * from t_t" + tidbStmt.execute("drop table if exists t_t") tidbStmt.execute("CREATE TABLE `t_t` (`t` time(3), index `idx_t`(t))") // NOTE: jdbc only allows time in day range whereas mysql time has much // larger range. 
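    // A worked example of the representation this test relies on (assuming, as the
    // equality predicate further down suggests, that TiSpark surfaces TIME values
    // as a count of nanoseconds):
    //   '12:59:59' -> (12 * 3600 + 59 * 60 + 59) s = 46799 s
    //              -> 46799 * 1,000,000,000 ns = 46799000000000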
tidbStmt.execute("INSERT INTO t_t (t) VALUES('18:59:59'),('17:59:59'),('12:59:59')") refreshConnections() - val df = spark.sql("select * from t_t") + val df = spark.sql(query) val data = dfData(df, df.schema.fields) - assert(data(0)(0) === Converter.convertStrToDuration("18:59:59")) - assert(data(1)(0) === Converter.convertStrToDuration("17:59:59")) - assert(data(2)(0) === Converter.convertStrToDuration("12:59:59")) + runTest( + query, + rSpark = data, + rTiDB = List( + List(Converter.convertStrToDuration("18:59:59")), + List(Converter.convertStrToDuration("17:59:59")), + List(Converter.convertStrToDuration("12:59:59")) + ), + skipJDBC = true + ) val where = spark.sql("select * from t_t where t = 46799000000000") val whereData = dfData(where, where.schema.fields) From 0c3bcfe193aa222eda48bbbb48ee68159d68aea6 Mon Sep 17 00:00:00 2001 From: Liangliang Gu Date: Mon, 5 Aug 2019 16:32:50 +0800 Subject: [PATCH 19/62] improve TiConfiguration getPdAddrsString function (#963) --- .../com/pingcap/tispark/TiBatchWrite.scala | 4 +- .../spark/sql/execution/CoprocessorRDD.scala | 4 +- .../spark/sql/tispark/TiHandleRDD.scala | 4 +- .../org/apache/spark/sql/tispark/TiRDD.scala | 2 +- .../main/java/com/pingcap/tikv/Snapshot.java | 2 +- .../com/pingcap/tikv/TiConfiguration.java | 15 +++++++- .../main/java/com/pingcap/tikv/TiSession.java | 19 +++++++++- .../java/com/pingcap/tikv/TiSessionCache.java | 25 ------------ .../com/pingcap/tikv/TwoPhaseCommitter.java | 4 +- .../tikv/allocator/RowIDAllocator.java | 3 +- .../com/pingcap/tikv/TiConfigurationTest.java | 38 +++++++++++++++++++ 11 files changed, 81 insertions(+), 39 deletions(-) delete mode 100644 tikv-client/src/main/java/com/pingcap/tikv/TiSessionCache.java create mode 100644 tikv-client/src/test/java/com/pingcap/tikv/TiConfigurationTest.java diff --git a/core/src/main/scala/com/pingcap/tispark/TiBatchWrite.scala b/core/src/main/scala/com/pingcap/tispark/TiBatchWrite.scala index 44ddcd846b..5a2b737cc2 100644 --- a/core/src/main/scala/com/pingcap/tispark/TiBatchWrite.scala +++ b/core/src/main/scala/com/pingcap/tispark/TiBatchWrite.scala @@ -128,7 +128,7 @@ class TiBatchWrite(@transient val df: DataFrame, tiTableRef = options.tiTableRef tiDBInfo = tiSession.getCatalog.getDatabase(tiTableRef.databaseName) tiTableInfo = tiSession.getCatalog.getTable(tiTableRef.databaseName, tiTableRef.tableName) - catalog = TiSessionCache.getSession(tiConf).getCatalog + catalog = TiSession.getInstance(tiConf).getCatalog if (tiTableInfo == null) { throw new NoSuchTableException(tiTableRef.databaseName, tiTableRef.tableName) @@ -433,7 +433,7 @@ class TiBatchWrite(@transient val df: DataFrame, private def generateDataToBeRemovedRdd(rdd: RDD[WrappedRow], startTs: TiTimestamp) = { rdd .mapPartitions { wrappedRows => - val snapshot = TiSessionCache.getSession(tiConf).createSnapshot(startTs) + val snapshot = TiSession.getInstance(tiConf).createSnapshot(startTs) wrappedRows.map { wrappedRow => val rowBuf = mutable.ListBuffer.empty[WrappedRow] // check handle key diff --git a/core/src/main/scala/org/apache/spark/sql/execution/CoprocessorRDD.scala b/core/src/main/scala/org/apache/spark/sql/execution/CoprocessorRDD.scala index 19b99e0481..bfb6cd8037 100644 --- a/core/src/main/scala/org/apache/spark/sql/execution/CoprocessorRDD.scala +++ b/core/src/main/scala/org/apache/spark/sql/execution/CoprocessorRDD.scala @@ -24,7 +24,7 @@ import com.pingcap.tikv.operation.iterator.CoprocessIterator import com.pingcap.tikv.operation.transformer.RowTransformer import 
com.pingcap.tikv.util.RangeSplitter.RegionTask import com.pingcap.tikv.util.{KeyRangeUtils, RangeSplitter} -import com.pingcap.tikv.{TiConfiguration, TiSession, TiSessionCache} +import com.pingcap.tikv.{TiConfiguration, TiSession} import com.pingcap.tispark.listener.CacheInvalidateListener import com.pingcap.tispark.utils.ReflectionUtil._ import com.pingcap.tispark.utils.{TiConverter, TiUtil} @@ -252,7 +252,7 @@ case class RegionTaskExec(child: SparkPlan, // For each partition, we do some initialization work val logger = Logger.getLogger(getClass.getName) logger.debug(s"In partition No.$index") - val session = TiSessionCache.getSession(tiConf) + val session = TiSession.getInstance(tiConf) session.injectCallBackFunc(callBackFunc) val batchSize = tiConf.getIndexScanBatchSize diff --git a/core/src/main/scala/org/apache/spark/sql/tispark/TiHandleRDD.scala b/core/src/main/scala/org/apache/spark/sql/tispark/TiHandleRDD.scala index 3e6d0ddf04..baddbd3c20 100644 --- a/core/src/main/scala/org/apache/spark/sql/tispark/TiHandleRDD.scala +++ b/core/src/main/scala/org/apache/spark/sql/tispark/TiHandleRDD.scala @@ -18,7 +18,7 @@ package org.apache.spark.sql.tispark import com.pingcap.tikv.meta.{TiDAGRequest, TiTimestamp} import com.pingcap.tikv.util.RangeSplitter import com.pingcap.tikv.util.RangeSplitter.RegionTask -import com.pingcap.tikv.{TiConfiguration, TiSession, TiSessionCache} +import com.pingcap.tikv.{TiConfiguration, TiSession} import com.pingcap.tispark.{TiPartition, TiTableReference} import gnu.trove.list.array.TLongArrayList import org.apache.spark.rdd.RDD @@ -52,7 +52,7 @@ class TiHandleRDD(val dagRequest: TiDAGRequest, new Iterator[Row] { dagRequest.resolve() private val tiPartition = split.asInstanceOf[TiPartition] - private val session = TiSessionCache.getSession(tiConf) + private val session = TiSession.getInstance(tiConf) private val snapshot = session.createSnapshot(ts) private[this] val tasks = tiPartition.tasks diff --git a/core/src/main/scala/org/apache/spark/sql/tispark/TiRDD.scala b/core/src/main/scala/org/apache/spark/sql/tispark/TiRDD.scala index dec0ad6c10..a5b4ab769a 100644 --- a/core/src/main/scala/org/apache/spark/sql/tispark/TiRDD.scala +++ b/core/src/main/scala/org/apache/spark/sql/tispark/TiRDD.scala @@ -61,7 +61,7 @@ class TiRDD(val dagRequest: TiDAGRequest, // bypass, sum return a long type private val tiPartition = split.asInstanceOf[TiPartition] - private val session = TiSessionCache.getSession(tiConf) + private val session = TiSession.getInstance(tiConf) session.injectCallBackFunc(callBackFunc) private val snapshot = session.createSnapshot(dagRequest.getStartTs) private[this] val tasks = tiPartition.tasks diff --git a/tikv-client/src/main/java/com/pingcap/tikv/Snapshot.java b/tikv-client/src/main/java/com/pingcap/tikv/Snapshot.java index d77b1d2de1..c2d5934413 100644 --- a/tikv-client/src/main/java/com/pingcap/tikv/Snapshot.java +++ b/tikv-client/src/main/java/com/pingcap/tikv/Snapshot.java @@ -41,7 +41,7 @@ public class Snapshot { public Snapshot(@Nonnull TiTimestamp timestamp, TiConfiguration conf) { this.timestamp = timestamp; this.conf = conf; - this.session = TiSessionCache.getSession(conf); + this.session = TiSession.getInstance(conf); } public TiSession getSession() { diff --git a/tikv-client/src/main/java/com/pingcap/tikv/TiConfiguration.java b/tikv-client/src/main/java/com/pingcap/tikv/TiConfiguration.java index 1d69099c95..db1e6f2c46 100644 --- a/tikv-client/src/main/java/com/pingcap/tikv/TiConfiguration.java +++ 
b/tikv-client/src/main/java/com/pingcap/tikv/TiConfiguration.java @@ -125,7 +125,20 @@ public List getPdAddrs() { } public String getPdAddrsString() { - return pdAddrs.toString(); + return listToString(pdAddrs); + } + + public static String listToString(List list) { + StringBuilder sb = new StringBuilder(); + sb.append("["); + for (int i = 0; i < list.size(); i++) { + sb.append(list.get(i).toString()); + if (i != list.size() - 1) { + sb.append(","); + } + } + sb.append("]"); + return sb.toString(); } public int getScanBatchSize() { diff --git a/tikv-client/src/main/java/com/pingcap/tikv/TiSession.java b/tikv-client/src/main/java/com/pingcap/tikv/TiSession.java index a36db1170b..60f00b17b9 100644 --- a/tikv-client/src/main/java/com/pingcap/tikv/TiSession.java +++ b/tikv-client/src/main/java/com/pingcap/tikv/TiSession.java @@ -24,6 +24,8 @@ import com.pingcap.tikv.txn.TxnKVClient; import com.pingcap.tikv.util.ChannelFactory; import com.pingcap.tikv.util.ConcreteBackOffer; +import java.util.HashMap; +import java.util.Map; import java.util.concurrent.ExecutorService; import java.util.concurrent.Executors; import java.util.function.Function; @@ -44,7 +46,22 @@ public class TiSession implements AutoCloseable { private volatile RegionManager regionManager; private volatile RegionStoreClient.RegionStoreClientBuilder clientBuilder; - public TiSession(TiConfiguration conf) { + private static Map sessionCachedMap = new HashMap<>(); + + // Since we create session as singleton now, configuration change will not + // reflect change + public static synchronized TiSession getInstance(TiConfiguration conf) { + String key = conf.getPdAddrsString(); + if (sessionCachedMap.containsKey(key)) { + return sessionCachedMap.get(key); + } + + TiSession newSession = new TiSession(conf); + sessionCachedMap.put(key, newSession); + return newSession; + } + + private TiSession(TiConfiguration conf) { this.conf = conf; this.channelFactory = new ChannelFactory(conf.getMaxFrameSize()); this.regionManager = null; diff --git a/tikv-client/src/main/java/com/pingcap/tikv/TiSessionCache.java b/tikv-client/src/main/java/com/pingcap/tikv/TiSessionCache.java deleted file mode 100644 index 0e17feec6a..0000000000 --- a/tikv-client/src/main/java/com/pingcap/tikv/TiSessionCache.java +++ /dev/null @@ -1,25 +0,0 @@ -package com.pingcap.tikv; - -import java.util.HashMap; -import java.util.Map; - -public class TiSessionCache { - private static Map sessionCachedMap = new HashMap<>(); - - // Since we create session as singleton now, configuration change will not - // reflect change - public static synchronized TiSession getSession(TiConfiguration conf) { - String key = conf.getPdAddrsString(); - if (sessionCachedMap.containsKey(key)) { - return sessionCachedMap.get(key); - } - - TiSession newSession = TiSession.create(conf); - sessionCachedMap.put(key, newSession); - return newSession; - } - - public static void clear() { - sessionCachedMap.clear(); - } -} diff --git a/tikv-client/src/main/java/com/pingcap/tikv/TwoPhaseCommitter.java b/tikv-client/src/main/java/com/pingcap/tikv/TwoPhaseCommitter.java index 00bddeef2d..021a7dcdd4 100644 --- a/tikv-client/src/main/java/com/pingcap/tikv/TwoPhaseCommitter.java +++ b/tikv-client/src/main/java/com/pingcap/tikv/TwoPhaseCommitter.java @@ -113,14 +113,14 @@ public byte[] getValue() { private final long lockTTL; public TwoPhaseCommitter(TiConfiguration conf, long startTime) { - this.kvClient = TiSessionCache.getSession(conf).createTxnClient(); + this.kvClient = 
TiSession.getInstance(conf).createTxnClient(); this.regionManager = kvClient.getRegionManager(); this.startTs = startTime; this.lockTTL = DEFAULT_BATCH_WRITE_LOCK_TTL; } public TwoPhaseCommitter(TiConfiguration conf, long startTime, long lockTTL) { - this.kvClient = TiSessionCache.getSession(conf).createTxnClient(); + this.kvClient = TiSession.getInstance(conf).createTxnClient(); this.regionManager = kvClient.getRegionManager(); this.startTs = startTime; this.lockTTL = lockTTL; diff --git a/tikv-client/src/main/java/com/pingcap/tikv/allocator/RowIDAllocator.java b/tikv-client/src/main/java/com/pingcap/tikv/allocator/RowIDAllocator.java index 70e5cddb2f..04097698c5 100644 --- a/tikv-client/src/main/java/com/pingcap/tikv/allocator/RowIDAllocator.java +++ b/tikv-client/src/main/java/com/pingcap/tikv/allocator/RowIDAllocator.java @@ -19,7 +19,6 @@ import com.pingcap.tikv.Snapshot; import com.pingcap.tikv.TiConfiguration; import com.pingcap.tikv.TiSession; -import com.pingcap.tikv.TiSessionCache; import com.pingcap.tikv.TwoPhaseCommitter; import com.pingcap.tikv.codec.CodecDataInput; import com.pingcap.tikv.codec.CodecDataOutput; @@ -67,7 +66,7 @@ public long getEnd() { // set key value pair to tikv via two phase committer protocol. private void set(ByteString key, byte[] value) { - TiSession session = TiSessionCache.getSession(conf); + TiSession session = TiSession.getInstance(conf); TwoPhaseCommitter twoPhaseCommitter = new TwoPhaseCommitter(conf, session.getTimestamp().getVersion()); diff --git a/tikv-client/src/test/java/com/pingcap/tikv/TiConfigurationTest.java b/tikv-client/src/test/java/com/pingcap/tikv/TiConfigurationTest.java new file mode 100644 index 0000000000..81b028ac3d --- /dev/null +++ b/tikv-client/src/test/java/com/pingcap/tikv/TiConfigurationTest.java @@ -0,0 +1,38 @@ +/* + * Copyright 2019 PingCAP, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * See the License for the specific language governing permissions and + * limitations under the License. 
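+ *
+ * For reference, a minimal illustration of the behaviour exercised below (the
+ * literals are illustrative only, not taken from a real PD address list):
+ * java.util.List.toString() inserts a space after each comma, while the new
+ * TiConfiguration.listToString() joins elements with a bare comma, so
+ *
+ *   Arrays.asList("1", "2").toString()                     yields "[1, 2]"
+ *   TiConfiguration.listToString(Arrays.asList("1", "2"))  yields "[1,2]"
+ *
+ * getPdAddrsString() now delegates to listToString(), and TiSession.getInstance()
+ * uses that string as its session-cache key, so the format mainly needs to be
+ * stable across calls.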
+ */ + +package com.pingcap.tikv; + +import static org.junit.Assert.assertEquals; + +import java.util.ArrayList; +import java.util.List; +import org.junit.Test; + +public class TiConfigurationTest { + @Test + public void testListToString() { + List list = new ArrayList<>(); + + list.add("1"); + assertEquals(TiConfiguration.listToString(list), "[1]"); + + list.add("2"); + assertEquals(TiConfiguration.listToString(list), "[1,2]"); + + list.add("3"); + assertEquals(TiConfiguration.listToString(list), "[1,2,3]"); + } +} From 331455708378687655cf55118a91651cb22a0f37 Mon Sep 17 00:00:00 2001 From: Zhexuan Yang Date: Mon, 5 Aug 2019 19:42:15 +0800 Subject: [PATCH 20/62] bump grpc to 1.17 (#982) --- tikv-client/pom.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tikv-client/pom.xml b/tikv-client/pom.xml index d367155c35..34e3c87f4a 100644 --- a/tikv-client/pom.xml +++ b/tikv-client/pom.xml @@ -21,7 +21,7 @@ 4.12 1.2.17 1.7.16 - 1.7.0 + 1.17.0 1.6.6 2.9.9 3.0.1 From a59c6f53e3031a73a2d020e591f5fb373b20a538 Mon Sep 17 00:00:00 2001 From: birdstorm Date: Tue, 6 Aug 2019 14:06:11 +0800 Subject: [PATCH 21/62] Add multiple-column PK tests (#970) --- .ci/integration_test.groovy | 7 +- .../datasource/BaseDataSourceTest.scala | 1 - .../spark/sql/BaseTestGenerationSpec.scala | 27 +++++ .../apache/spark/sql/BaseTiSparkTest.scala | 8 -- .../catalyst/catalog/CatalogTestSuite.scala | 1 + .../spark/sql/test/SharedSQLContext.scala | 15 ++- .../spark/sql/test/generator/Data.scala | 29 +++-- .../sql/test/generator/IndexColumn.scala | 24 ++-- .../spark/sql/test/generator/Schema.scala | 8 +- .../test/generator/TestDataGenerator.scala | 13 +- .../sql/test/generator/ValueGenerator.scala | 81 ++++++++----- .../BaseMultiColumnDataTypeGenerator.scala | 5 +- ...enerateMultiColumnDataTypeTestAction.scala | 13 +- .../sql/types/MultiColumnDataTypeSuite.scala | 16 ++- .../sql/types/MultiColumnDataTypeTest.scala | 25 ++-- .../RunMultiColumnDataTypeTestAction.scala | 2 +- ...erateMultiColumnPKDataTypeTestAction.scala | 112 ++++++++++++++++++ .../pk/MultiColumnDataTypePKGenerator.scala | 18 +++ .../pk/MultiColumnPKDataTypeSuite00.scala | 24 ++++ .../pk/MultiColumnPKDataTypeSuite01.scala | 24 ++++ .../pk/MultiColumnPKDataTypeSuite02.scala | 24 ++++ .../pk/MultiColumnPKDataTypeSuite03.scala | 24 ++++ .../pk/MultiColumnPKDataTypeSuite04.scala | 24 ++++ .../pk/MultiColumnPKDataTypeSuite05.scala | 24 ++++ .../pk/MultiColumnPKDataTypeSuite06.scala | 24 ++++ .../pk/MultiColumnPKDataTypeSuite07.scala | 24 ++++ .../pk/MultiColumnPKDataTypeSuite08.scala | 24 ++++ .../pk/MultiColumnPKDataTypeSuite09.scala | 24 ++++ .../pk/MultiColumnPKDataTypeSuite10.scala | 24 ++++ .../pk/MultiColumnPKDataTypeSuite11.scala | 24 ++++ .../pk/MultiColumnPKDataTypeSuite12.scala | 24 ++++ .../pk/MultiColumnPKDataTypeSuite13.scala | 24 ++++ .../pk/MultiColumnPKDataTypeSuite14.scala | 24 ++++ .../pk/MultiColumnPKDataTypeSuite15.scala | 24 ++++ .../pk/MultiColumnPKDataTypeSuite16.scala | 24 ++++ .../pk/MultiColumnPKDataTypeSuite17.scala | 24 ++++ .../pk/MultiColumnPKDataTypeSuite18.scala | 24 ++++ .../pk/MultiColumnPKDataTypeSuite19.scala | 24 ++++ .../pk/MultiColumnPKDataTypeSuite20.scala | 24 ++++ .../pk/MultiColumnPKDataTypeSuite21.scala | 24 ++++ .../pk/MultiColumnPKDataTypeSuite22.scala | 24 ++++ .../pk/MultiColumnPKDataTypeSuite23.scala | 24 ++++ .../pk/MultiColumnPKDataTypeSuite24.scala | 24 ++++ .../pk/MultiColumnPKDataTypeSuite25.scala | 24 ++++ .../pk/MultiColumnPKDataTypeSuite26.scala | 24 ++++ 
.../pk/MultiColumnPKDataTypeSuite27.scala | 24 ++++ .../pk/MultiColumnPKDataTypeSuite28.scala | 24 ++++ .../pk/MultiColumnPKDataTypeSuite29.scala | 24 ++++ .../pk/MultiColumnPKDataTypeSuite30.scala | 24 ++++ .../pk/MultiColumnPKDataTypeSuite31.scala | 24 ++++ .../pk/MultiColumnPKDataTypeSuite32.scala | 24 ++++ .../pk/MultiColumnPKDataTypeSuite33.scala | 24 ++++ .../pk/MultiColumnPKDataTypeSuite34.scala | 24 ++++ .../pk/MultiColumnPKDataTypeSuite35.scala | 24 ++++ .../pk/MultiColumnPKDataTypeSuites.scala | 98 +++++++++++++++ .../com/pingcap/tikv/row/ObjectRowImpl.java | 2 + 56 files changed, 1270 insertions(+), 99 deletions(-) create mode 100644 core/src/test/scala/org/apache/spark/sql/types/pk/GenerateMultiColumnPKDataTypeTestAction.scala create mode 100644 core/src/test/scala/org/apache/spark/sql/types/pk/MultiColumnDataTypePKGenerator.scala create mode 100644 core/src/test/scala/org/apache/spark/sql/types/pk/MultiColumnPKDataTypeSuite00.scala create mode 100644 core/src/test/scala/org/apache/spark/sql/types/pk/MultiColumnPKDataTypeSuite01.scala create mode 100644 core/src/test/scala/org/apache/spark/sql/types/pk/MultiColumnPKDataTypeSuite02.scala create mode 100644 core/src/test/scala/org/apache/spark/sql/types/pk/MultiColumnPKDataTypeSuite03.scala create mode 100644 core/src/test/scala/org/apache/spark/sql/types/pk/MultiColumnPKDataTypeSuite04.scala create mode 100644 core/src/test/scala/org/apache/spark/sql/types/pk/MultiColumnPKDataTypeSuite05.scala create mode 100644 core/src/test/scala/org/apache/spark/sql/types/pk/MultiColumnPKDataTypeSuite06.scala create mode 100644 core/src/test/scala/org/apache/spark/sql/types/pk/MultiColumnPKDataTypeSuite07.scala create mode 100644 core/src/test/scala/org/apache/spark/sql/types/pk/MultiColumnPKDataTypeSuite08.scala create mode 100644 core/src/test/scala/org/apache/spark/sql/types/pk/MultiColumnPKDataTypeSuite09.scala create mode 100644 core/src/test/scala/org/apache/spark/sql/types/pk/MultiColumnPKDataTypeSuite10.scala create mode 100644 core/src/test/scala/org/apache/spark/sql/types/pk/MultiColumnPKDataTypeSuite11.scala create mode 100644 core/src/test/scala/org/apache/spark/sql/types/pk/MultiColumnPKDataTypeSuite12.scala create mode 100644 core/src/test/scala/org/apache/spark/sql/types/pk/MultiColumnPKDataTypeSuite13.scala create mode 100644 core/src/test/scala/org/apache/spark/sql/types/pk/MultiColumnPKDataTypeSuite14.scala create mode 100644 core/src/test/scala/org/apache/spark/sql/types/pk/MultiColumnPKDataTypeSuite15.scala create mode 100644 core/src/test/scala/org/apache/spark/sql/types/pk/MultiColumnPKDataTypeSuite16.scala create mode 100644 core/src/test/scala/org/apache/spark/sql/types/pk/MultiColumnPKDataTypeSuite17.scala create mode 100644 core/src/test/scala/org/apache/spark/sql/types/pk/MultiColumnPKDataTypeSuite18.scala create mode 100644 core/src/test/scala/org/apache/spark/sql/types/pk/MultiColumnPKDataTypeSuite19.scala create mode 100644 core/src/test/scala/org/apache/spark/sql/types/pk/MultiColumnPKDataTypeSuite20.scala create mode 100644 core/src/test/scala/org/apache/spark/sql/types/pk/MultiColumnPKDataTypeSuite21.scala create mode 100644 core/src/test/scala/org/apache/spark/sql/types/pk/MultiColumnPKDataTypeSuite22.scala create mode 100644 core/src/test/scala/org/apache/spark/sql/types/pk/MultiColumnPKDataTypeSuite23.scala create mode 100644 core/src/test/scala/org/apache/spark/sql/types/pk/MultiColumnPKDataTypeSuite24.scala create mode 100644 
core/src/test/scala/org/apache/spark/sql/types/pk/MultiColumnPKDataTypeSuite25.scala create mode 100644 core/src/test/scala/org/apache/spark/sql/types/pk/MultiColumnPKDataTypeSuite26.scala create mode 100644 core/src/test/scala/org/apache/spark/sql/types/pk/MultiColumnPKDataTypeSuite27.scala create mode 100644 core/src/test/scala/org/apache/spark/sql/types/pk/MultiColumnPKDataTypeSuite28.scala create mode 100644 core/src/test/scala/org/apache/spark/sql/types/pk/MultiColumnPKDataTypeSuite29.scala create mode 100644 core/src/test/scala/org/apache/spark/sql/types/pk/MultiColumnPKDataTypeSuite30.scala create mode 100644 core/src/test/scala/org/apache/spark/sql/types/pk/MultiColumnPKDataTypeSuite31.scala create mode 100644 core/src/test/scala/org/apache/spark/sql/types/pk/MultiColumnPKDataTypeSuite32.scala create mode 100644 core/src/test/scala/org/apache/spark/sql/types/pk/MultiColumnPKDataTypeSuite33.scala create mode 100644 core/src/test/scala/org/apache/spark/sql/types/pk/MultiColumnPKDataTypeSuite34.scala create mode 100644 core/src/test/scala/org/apache/spark/sql/types/pk/MultiColumnPKDataTypeSuite35.scala create mode 100644 core/src/test/scala/org/apache/spark/sql/types/pk/MultiColumnPKDataTypeSuites.scala diff --git a/.ci/integration_test.groovy b/.ci/integration_test.groovy index ca0362af51..2fd65a734f 100644 --- a/.ci/integration_test.groovy +++ b/.ci/integration_test.groovy @@ -90,12 +90,13 @@ def call(ghprbActualCommit, ghprbCommentBody, ghprbPullId, ghprbPullTitle, ghprb sh """ cp -R /home/jenkins/git/tispark/. ./ git checkout -f ${ghprbActualCommit} - find core/src -name '*Suite*' > test + find core/src -name '*Suite*' | grep -v 'MultiColumnPKDataTypeSuite' > test + shuf test -o test2 + mv test2 test + find core/src -name '*MultiColumnPKDataTypeSuite*' >> test sed -i 's/core\\/src\\/test\\/scala\\///g' test sed -i 's/\\//\\./g' test sed -i 's/\\.scala//g' test - shuf test -o test2 - mv test2 test split test -n r/$PARALLEL_NUMBER test_unit_ -a 2 --numeric-suffixes=1 """ diff --git a/core/src/test/scala/com/pingcap/tispark/datasource/BaseDataSourceTest.scala b/core/src/test/scala/com/pingcap/tispark/datasource/BaseDataSourceTest.scala index 04e7e60315..d7939dc638 100644 --- a/core/src/test/scala/com/pingcap/tispark/datasource/BaseDataSourceTest.scala +++ b/core/src/test/scala/com/pingcap/tispark/datasource/BaseDataSourceTest.scala @@ -26,7 +26,6 @@ class BaseDataSourceTest(val table: String, override def beforeAll(): Unit = { enableTidbConfigPropertiesInjectedToSpark = _enableTidbConfigPropertiesInjectedToSpark super.beforeAllWithoutLoadData() - tidbStmt = tidbConn.createStatement() initializeTimeZone() } diff --git a/core/src/test/scala/org/apache/spark/sql/BaseTestGenerationSpec.scala b/core/src/test/scala/org/apache/spark/sql/BaseTestGenerationSpec.scala index 3868f129d1..aafa8616c5 100644 --- a/core/src/test/scala/org/apache/spark/sql/BaseTestGenerationSpec.scala +++ b/core/src/test/scala/org/apache/spark/sql/BaseTestGenerationSpec.scala @@ -17,19 +17,46 @@ package org.apache.spark.sql +import org.apache.spark.sql.test.generator.DataType.ReflectedDataType + trait BaseTestGenerationSpec { protected val rowCount: Int protected val preDescription: String = "Generating Data for " + protected var cols: List[ReflectedDataType] = List.empty[ReflectedDataType] + def getTableName(dataTypes: String*): String def getTableNameWithDesc(desc: String, dataTypes: String*): String def getColumnName(dataType: String): String = s"col_$dataType" + def getColumnNameByOffset(offset: Int): String = { 
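+    // Resolves the generated column name for the column at `offset` in `cols`.
+    // When the same data type occurs more than once, a positional suffix keeps
+    // the names distinct (col_<type>0, col_<type>1, ...); a type that occurs
+    // only once keeps the plain col_<type> name.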
+ assert( + cols.size > offset, + "column length incorrect, maybe `cols` is not initialized correctly?" + ) + val dataType = cols(offset) + val suffix = if (cols.count(_ == dataType) > 1) { + var cnt = 0 + for (i <- 0 until offset) { + if (cols(i) == dataType) { + cnt += 1 + } + } + s"$cnt" + } else { + "" + } + s"${getColumnName(dataType.toString)}$suffix" + } + def getIndexName(dataTypes: String*): String = s"idx_${dataTypes.map(getColumnName).mkString("_")}" + def getIndexNameByOffset(offsets: Int*): String = + s"idx_${offsets.map(getColumnNameByOffset).mkString("_")}" + } diff --git a/core/src/test/scala/org/apache/spark/sql/BaseTiSparkTest.scala b/core/src/test/scala/org/apache/spark/sql/BaseTiSparkTest.scala index 94ec6a73a8..b5e3eba1e1 100644 --- a/core/src/test/scala/org/apache/spark/sql/BaseTiSparkTest.scala +++ b/core/src/test/scala/org/apache/spark/sql/BaseTiSparkTest.scala @@ -31,8 +31,6 @@ import scala.collection.mutable.ArrayBuffer class BaseTiSparkTest extends QueryTest with SharedSQLContext { - protected var tidbStmt: Statement = _ - private val defaultTestDatabases: Seq[String] = Seq("tispark_test") protected var tableNames: Seq[String] = _ @@ -166,12 +164,6 @@ class BaseTiSparkTest extends QueryTest with SharedSQLContext { loadTestData() } - protected def initializeTimeZone(): Unit = { - tidbStmt = tidbConn.createStatement() - // Set default time zone to GMT-7 - tidbStmt.execute(s"SET time_zone = '$timeZoneOffset'") - } - protected case class TestTables(dbName: String, tables: String*) protected def refreshConnections(testTables: TestTables, isHiveEnabled: Boolean = false): Unit = { diff --git a/core/src/test/scala/org/apache/spark/sql/catalyst/catalog/CatalogTestSuite.scala b/core/src/test/scala/org/apache/spark/sql/catalyst/catalog/CatalogTestSuite.scala index c889b046b4..b8d5cf289f 100644 --- a/core/src/test/scala/org/apache/spark/sql/catalyst/catalog/CatalogTestSuite.scala +++ b/core/src/test/scala/org/apache/spark/sql/catalyst/catalog/CatalogTestSuite.scala @@ -95,6 +95,7 @@ class CatalogTestSuite extends BaseTiSparkTest { spark.sql("desc extended full_data_type_table").explain() spark.sql("desc extended full_data_type_table").show(200, truncate = false) spark.sql("desc formatted full_data_type_table").show(200, truncate = false) + refreshConnections(true) setCurrentDatabase("default") spark.sql("drop table if exists t") spark.sql("create table t(a int)") diff --git a/core/src/test/scala/org/apache/spark/sql/test/SharedSQLContext.scala b/core/src/test/scala/org/apache/spark/sql/test/SharedSQLContext.scala index d74f67cf0a..3c86c437c7 100644 --- a/core/src/test/scala/org/apache/spark/sql/test/SharedSQLContext.scala +++ b/core/src/test/scala/org/apache/spark/sql/test/SharedSQLContext.scala @@ -45,6 +45,8 @@ trait SharedSQLContext extends SparkFunSuite with Eventually with BeforeAndAfter protected def ti: TiContext = SharedSQLContext.ti + protected def tidbStmt: Statement = SharedSQLContext.tidbStmt + protected def tidbConn: Connection = SharedSQLContext.tidbConn protected def tidbOptions: Map[String, String] = SharedSQLContext.tidbOptions @@ -67,6 +69,8 @@ trait SharedSQLContext extends SparkFunSuite with Eventually with BeforeAndAfter protected def initStatistics(): Unit = SharedSQLContext.initStatistics() + protected def initializeTimeZone(): Unit = SharedSQLContext.initializeTimeZone() + protected def defaultTimeZone: TimeZone = SharedSQLContext.timeZone protected def refreshConnections(): Unit = SharedSQLContext.refreshConnections(false) @@ -232,6 +236,12 @@ object 
SharedSQLContext extends Logging { logger.info("Analyzing table finished.") } + protected def initializeTimeZone(): Unit = { + _statement = _tidbConnection.createStatement() + // Set default time zone to GMT-7 + _statement.execute(s"SET time_zone = '$timeZoneOffset'") + } + protected def loadSQLFile(directory: String, file: String): Unit = { val fullFileName = s"$directory/$file.sql" try { @@ -241,7 +251,7 @@ object SharedSQLContext extends Logging { val queryString = source.mkString source.close() _tidbConnection.setCatalog("mysql") - _statement = _tidbConnection.createStatement() + initializeTimeZone() _statement.execute(queryString) logger.info(s"Load $fullFileName successfully.") } catch { @@ -273,7 +283,8 @@ object SharedSQLContext extends Logging { jdbcUrl = s"jdbc:mysql://address=(protocol=tcp)(host=$tidbAddr)(port=$tidbPort)/?user=$tidbUser&password=$tidbPassword" + s"&useUnicode=true&characterEncoding=UTF-8&zeroDateTimeBehavior=convertToNull&useSSL=false" + - s"&rewriteBatchedStatements=true&autoReconnect=true&failOverReadOnly=false&maxReconnects=10" + s"&rewriteBatchedStatements=true&autoReconnect=true&failOverReadOnly=false&maxReconnects=10" + + s"&allowMultiQueries=true&serverTimezone=${timeZone.getDisplayName}" _tidbConnection = TiDBUtils.createConnectionFactory(jdbcUrl)() _statement = _tidbConnection.createStatement() diff --git a/core/src/test/scala/org/apache/spark/sql/test/generator/Data.scala b/core/src/test/scala/org/apache/spark/sql/test/generator/Data.scala index a91974790c..70e6a075fa 100644 --- a/core/src/test/scala/org/apache/spark/sql/test/generator/Data.scala +++ b/core/src/test/scala/org/apache/spark/sql/test/generator/Data.scala @@ -46,21 +46,20 @@ case class Data(schema: Schema, data: List[TiRow], directory: String) { case null => null case _: Boolean => value.toString case _: Number => value.toString - case _: Array[Byte] => - s"X\'${value - .asInstanceOf[Array[Byte]] - .map { b => - String.format("%02x", new java.lang.Byte(b)) - } - .mkString}\'" - case _: Array[Boolean] => - s"b\'${value - .asInstanceOf[Array[Boolean]] - .map { - case true => "1" - case false => "0" - } - .mkString}\'" + case arr: Array[Byte] => + s"X\'${arr.map { b => + String.format("%02x", new java.lang.Byte(b)) + }.mkString}\'" + case arr: Array[Boolean] => + s"b\'${arr.map { + case true => "1" + case false => "0" + }.mkString}\'" + case ts: java.sql.Timestamp => + // convert to Timestamp output with current TimeZone + val zonedDateTime = ts.toLocalDateTime.atZone(java.util.TimeZone.getDefault.toZoneId) + val milliseconds = zonedDateTime.toEpochSecond * 1000L + zonedDateTime.getNano / 1000000 + s"\'${new java.sql.Timestamp(milliseconds)}\'" case _ => s"\'$value\'" } diff --git a/core/src/test/scala/org/apache/spark/sql/test/generator/IndexColumn.scala b/core/src/test/scala/org/apache/spark/sql/test/generator/IndexColumn.scala index 815f246244..c629d70728 100644 --- a/core/src/test/scala/org/apache/spark/sql/test/generator/IndexColumn.scala +++ b/core/src/test/scala/org/apache/spark/sql/test/generator/IndexColumn.scala @@ -39,13 +39,15 @@ case class ColumnInfo(columnName: String, val isPrimaryKey: Boolean = desc.contains("primary key") val nullable: Boolean = !isPrimaryKey && !desc.contains("not null") - val unsigned: Boolean = desc.contains("unsigned") - val noDefault: Boolean = !desc.contains("default") + + private val breakDown = desc.split(" ") + val unsigned: Boolean = breakDown.contains("unsigned") + val noDefault: Boolean = !breakDown.contains("default") + val isUnique: Boolean = 
breakDown.contains("unique") val default: String = { if (noDefault) { null } else { - val breakDown = desc.split(" ") val idx = breakDown.indexOf("default") assert(idx >= 0) if (idx == breakDown.length - 1) { @@ -76,11 +78,19 @@ case class ColumnInfo(columnName: String, } val generator: ValueGenerator = - ValueGenerator(dataType, len, decimal, nullable, unsigned, noDefault, default, isPrimaryKey) + ValueGenerator( + dataType, + len, + decimal, + nullable, + unsigned, + noDefault, + default, + isPrimaryKey, + isUnique + ) - override def toString: String = { - "`" + columnName + "` " + s"${generator.toString}" - } + override def toString: String = s"`$columnName` ${generator.toString}" } case class IndexColumnInfo(column: String, length: Integer) { diff --git a/core/src/test/scala/org/apache/spark/sql/test/generator/Schema.scala b/core/src/test/scala/org/apache/spark/sql/test/generator/Schema.scala index f267444b0e..2081cac437 100644 --- a/core/src/test/scala/org/apache/spark/sql/test/generator/Schema.scala +++ b/core/src/test/scala/org/apache/spark/sql/test/generator/Schema.scala @@ -47,13 +47,11 @@ case class Schema(database: String, assert(indexInfo.count(_.isPrimary) <= 1, "more than one primary key exist in schema") - private val pkIndexInfo = indexInfo.filter(_.isPrimary) - private val pkColumnName = if (pkIndexInfo.isEmpty) { + val pkIndexInfo = indexInfo.filter(_.isPrimary) + val pkColumnName: String = if (pkIndexInfo.isEmpty) { "" - } else if (pkIndexInfo.head.indexColumns.size == 1) { - pkIndexInfo.head.indexColumns.head.column } else { - throw new IllegalArgumentException("Multi-column Primary key/Unique index not supported yet") + pkIndexInfo.head.indexColumns.map(_.column).mkString(",") } val columnInfo: List[ColumnInfo] = columnNames.map { col => diff --git a/core/src/test/scala/org/apache/spark/sql/test/generator/TestDataGenerator.scala b/core/src/test/scala/org/apache/spark/sql/test/generator/TestDataGenerator.scala index e21c17a9fb..1622e5de9b 100644 --- a/core/src/test/scala/org/apache/spark/sql/test/generator/TestDataGenerator.scala +++ b/core/src/test/scala/org/apache/spark/sql/test/generator/TestDataGenerator.scala @@ -61,9 +61,11 @@ object TestDataGenerator { val dateAndDateTime: List[ReflectedDataType] = timestamps ::: dates ::: durations ::: years val stringAndBinaries: List[ReflectedDataType] = strings ::: binaries + val charCharset: List[ReflectedDataType] = strings ::: texts + val binaryCharset: List[ReflectedDataType] = binaries ::: bytes // TODO: support enum and set https://github.com/pingcap/tispark/issues/946 // val stringType: List[DataType] = texts ::: strings ::: binaries ::: enums ::: sets - val stringType: List[ReflectedDataType] = texts ::: strings ::: binaries ::: bytes + val stringType: List[ReflectedDataType] = charCharset ::: binaryCharset val varString: List[ReflectedDataType] = List(VARCHAR, VARBINARY) val unsignedType: List[ReflectedDataType] = numeric @@ -100,6 +102,8 @@ object TestDataGenerator { def isNumeric(dataType: ReflectedDataType): Boolean = numeric.contains(dataType) def isStringType(dataType: ReflectedDataType): Boolean = stringType.contains(dataType) def isVarString(dataType: ReflectedDataType): Boolean = varString.contains(dataType) + def isCharCharset(dataType: ReflectedDataType): Boolean = charCharset.contains(dataType) + def isBinaryCharset(dataType: ReflectedDataType): Boolean = binaryCharset.contains(dataType) def isCharOrBinary(dataType: ReflectedDataType): Boolean = stringAndBinaries.contains(dataType) def getLength(dataType: 
TiDataType): Long = @@ -279,8 +283,11 @@ object TestDataGenerator { } def hash(value: Any): String = value match { - case null => "null" - case b: Array[Byte] => b.mkString("[", ",", "]") + case null => "null" + case b: Array[Byte] => b.mkString("[", ",", "]") + case t: java.sql.Timestamp => + // timestamp was indexed as Integer when treated as unique key + s"${t.getTime / 1000}" case list: List[Any] => val ret = StringBuilder.newBuilder ret ++= "(" diff --git a/core/src/test/scala/org/apache/spark/sql/test/generator/ValueGenerator.scala b/core/src/test/scala/org/apache/spark/sql/test/generator/ValueGenerator.scala index 7af9bd6935..3c7a16027e 100644 --- a/core/src/test/scala/org/apache/spark/sql/test/generator/ValueGenerator.scala +++ b/core/src/test/scala/org/apache/spark/sql/test/generator/ValueGenerator.scala @@ -18,7 +18,7 @@ package org.apache.spark.sql.test.generator import org.apache.spark.sql.test.generator.DataType._ -import org.apache.spark.sql.test.generator.TestDataGenerator.{checkUnique, getLength, isNumeric, isStringType} +import org.apache.spark.sql.test.generator.TestDataGenerator.{checkUnique, getLength, isBinaryCharset, isCharCharset, isNumeric} import scala.collection.mutable import scala.util.Random @@ -30,7 +30,8 @@ case class ValueGenerator(dataType: ReflectedDataType, isUnsigned: Boolean = false, noDefault: Boolean = false, default: Any = null, - isPrimaryKey: Boolean = false) { + isPrimaryKey: Boolean = false, + isUnique: Boolean = false) { private val flag: Int = { import com.pingcap.tikv.types.DataType._ @@ -39,6 +40,9 @@ case class ValueGenerator(dataType: ReflectedDataType, ret |= PriKeyFlag ret |= NotNullFlag } + if (isUnique) { + ret |= UniqueKeyFlag + } if (!nullable) { ret |= NotNullFlag } @@ -51,6 +55,8 @@ case class ValueGenerator(dataType: ReflectedDataType, ret } + private val generateUnique = isPrimaryKey || isUnique + import com.pingcap.tikv.meta.Collation._ val tiDataType: TiDataType = getType(dataType, flag, M, D, "", DEF_COLLATION_CODE) @@ -98,15 +104,17 @@ case class ValueGenerator(dataType: ReflectedDataType, } } - private val specialBound: List[String] = { - val list: List[String] = dataType match { - case BIT => List("b\'\'", "\'\'") - case TINYINT | SMALLINT | MEDIUMINT | INT | BIGINT if !tiDataType.isUnsigned => List("-1") - case _ if isStringType(dataType) => List("") + private val specialBound: List[Any] = { + val list: List[Any] = dataType match { + case BIT => List(Array[Byte]()) + case TINYINT | SMALLINT | MEDIUMINT | INT | BIGINT if !tiDataType.isUnsigned => List(-1L) + case TIMESTAMP => List(new java.sql.Timestamp(1000)) + case _ if isCharCharset(dataType) => List("") + case _ if isBinaryCharset(dataType) => List(Array[Byte]()) case _ => List.empty[String] } if (lowerBound != null && upperBound != null) { - list ::: List(lowerBound.toString, upperBound.toString) + list ::: List(lowerBound, upperBound) } else { list } @@ -137,48 +145,56 @@ case class ValueGenerator(dataType: ReflectedDataType, dataType match { case BIT => val bit: Array[Boolean] = new Array[Boolean](tiDataType.getLength.toInt) - bit.map(_ => r.nextBoolean()) + bit.map(_ => r.nextBoolean) case BOOLEAN => r.nextInt(1 << 1) case TINYINT => r.nextInt(1 << 8) case SMALLINT => r.nextInt(1 << 16) case MEDIUMINT => r.nextInt(1 << 24) - case INT => r.nextInt() + (1L << 31) - case BIGINT => toUnsignedBigInt(r.nextLong()) - case FLOAT => Math.abs(r.nextFloat()) - case DOUBLE => Math.abs(r.nextDouble()) + case INT => r.nextInt + (1L << 31) + case BIGINT => 
toUnsignedBigInt(r.nextLong) + case FLOAT => Math.abs(r.nextFloat) + case DOUBLE => Math.abs(r.nextDouble) case DECIMAL => val len = getLength(tiDataType) val decimal = if (tiDataType.isDecimalUnSpecified) 0 else tiDataType.getDecimal (BigDecimal.apply(Math.abs(r.nextLong()) % Math.pow(10, len)) / BigDecimal.apply( Math.pow(10, decimal) - )).bigDecimal.toPlainString + )).bigDecimal } } else { dataType match { case BIT => val bit: Array[Boolean] = new Array[Boolean](tiDataType.getLength.toInt) - bit.map(_ => r.nextBoolean()) + bit.map(_ => r.nextBoolean) case BOOLEAN => r.nextInt(1 << 1) case TINYINT => r.nextInt(1 << 8) - (1 << 7) case SMALLINT => r.nextInt(1 << 16) - (1 << 15) case MEDIUMINT => r.nextInt(1 << 24) - (1 << 23) - case INT => r.nextInt() - case BIGINT => r.nextLong() - case FLOAT => r.nextFloat() - case DOUBLE => r.nextDouble() + case INT => r.nextInt + case BIGINT => r.nextLong + case FLOAT => r.nextFloat + case DOUBLE => r.nextDouble case DECIMAL => val len = getLength(tiDataType) val decimal = if (tiDataType.isDecimalUnSpecified) 0 else tiDataType.getDecimal - (BigDecimal.apply(r.nextLong() % Math.pow(10, len)) / BigDecimal.apply( + (BigDecimal.apply(r.nextLong % Math.pow(10, len)) / BigDecimal.apply( Math.pow(10, decimal) - )).bigDecimal.toPlainString + )).bigDecimal case VARCHAR => generateRandomString(r, tiDataType.getLength) case VARBINARY => generateRandomBinary(r, tiDataType.getLength) case CHAR | TEXT | TINYTEXT | MEDIUMTEXT | LONGTEXT => generateRandomString(r, getRandomLength(dataType, r)) case BINARY | BLOB | TINYBLOB | MEDIUMBLOB | LONGBLOB => generateRandomBinary(r, getRandomLength(dataType, r)) - case _ => throw new RuntimeException("not supported yet") + case DATE => + // start from 1000-01-01 to 9999-01-01 + val milliseconds = -30610253143000L + (Math.abs(r.nextLong) % (9000L * 365 * 24 * 60 * 60 * 1000)) + new java.sql.Date(milliseconds) + case TIMESTAMP => + // start from 1970-01-01 00:00:01 to 2038-01-19 03:14:07 + val milliseconds = Math.abs(r.nextInt * 1000L + 1000L) + Math.abs(r.nextInt(1000)) + new java.sql.Timestamp(milliseconds) + case _ => throw new RuntimeException(s"random $dataType generator not supported yet") } } } @@ -209,9 +225,10 @@ case class ValueGenerator(dataType: ReflectedDataType, // pre-generate n random values def preGenerateRandomValues(r: Random, n: Long): Unit = { if (n <= 1e6) { - generatedRandomValues = if (isPrimaryKey) { + generatedRandomValues = if (generateUnique) { + assert(n <= rangeSize, "random generator cannot generate unique value less than available") val set: mutable.Set[Any] = mutable.HashSet.empty[Any] - set += specialBound + set += specialBound.map(TestDataGenerator.hash) (0L until n - specialBound.size).map { _ => randomUniqueValue(r, set) }.toList ++ specialBound @@ -220,7 +237,10 @@ case class ValueGenerator(dataType: ReflectedDataType, randomValue(r) }.toList ++ specialBound } - assert(generatedRandomValues.size == n) + assert( + generatedRandomValues.size >= n, + s"Generate values size=$generatedRandomValues less than n=$n" + ) curPos = 0 } } @@ -231,7 +251,7 @@ case class ValueGenerator(dataType: ReflectedDataType, null } else { if (generatedRandomValues.isEmpty) { - if (isPrimaryKey) { + if (generateUnique) { val set: mutable.Set[Any] = mutable.HashSet.empty[Any] randomUniqueValue(r, set) } else { @@ -251,9 +271,13 @@ case class ValueGenerator(dataType: ReflectedDataType, "Values not pre-generated, please generate values first to use next()" ) assert( - hasNext, + hasNext || !generateUnique, s"Generated 
random values(${generatedRandomValues.size}) is less than needed(${curPos + 1})." ) + if (!hasNext) { + // reuse previous generated data + curPos = 0 + } curPos += 1 generatedRandomValues(curPos - 1) } @@ -279,7 +303,8 @@ case class ValueGenerator(dataType: ReflectedDataType, val nullString = if (!nullable) " not null" else "" val defaultString = if (!noDefault) s" default $default" else "" val unsignedString = if (isUnsigned) " unsigned" else "" - s"$unsignedString$nullString$defaultString" + val uniqueString = if (isUnique) " unique" else "" + s"$unsignedString$nullString$uniqueString$defaultString" } override def toString: String = { diff --git a/core/src/test/scala/org/apache/spark/sql/types/BaseMultiColumnDataTypeGenerator.scala b/core/src/test/scala/org/apache/spark/sql/types/BaseMultiColumnDataTypeGenerator.scala index ef638906f3..87ab88c6e7 100644 --- a/core/src/test/scala/org/apache/spark/sql/types/BaseMultiColumnDataTypeGenerator.scala +++ b/core/src/test/scala/org/apache/spark/sql/types/BaseMultiColumnDataTypeGenerator.scala @@ -18,7 +18,7 @@ package org.apache.spark.sql.types import org.apache.spark.sql.BaseTiSparkTest -import org.apache.spark.sql.test.generator.DataType.{getTypeName, ReflectedDataType} +import org.apache.spark.sql.test.generator.DataType.ReflectedDataType case class BaseMultiColumnDataTypeGenerator(dataTypes: List[ReflectedDataType], unsignedDataTypes: List[ReflectedDataType], @@ -27,8 +27,7 @@ case class BaseMultiColumnDataTypeGenerator(dataTypes: List[ReflectedDataType], testDesc: String) extends BaseTiSparkTest with GenerateMultiColumnDataTypeTestAction { - def loadTestData(dataTypes: List[ReflectedDataType]): Unit = { - val tableName = getTableName(dataTypes.map(getTypeName): _*) + def loadTestData(tableName: String): Unit = { logger.info(s"${preDescription}Test $tableName - $testDesc") loadSQLFile(dataTypeTestDir, tableName) } diff --git a/core/src/test/scala/org/apache/spark/sql/types/GenerateMultiColumnDataTypeTestAction.scala b/core/src/test/scala/org/apache/spark/sql/types/GenerateMultiColumnDataTypeTestAction.scala index 6ccb3f79fa..c967e9f6b2 100644 --- a/core/src/test/scala/org/apache/spark/sql/types/GenerateMultiColumnDataTypeTestAction.scala +++ b/core/src/test/scala/org/apache/spark/sql/types/GenerateMultiColumnDataTypeTestAction.scala @@ -28,7 +28,7 @@ trait GenerateMultiColumnDataTypeTestAction override val rowCount = 50 - private def toString(dataTypes: Seq[String]): String = dataTypes.hashCode().toString + private def toString(dataTypes: Seq[String]): String = Math.abs(dataTypes.hashCode()).toString override def getTableName(dataTypes: String*): String = s"test_${toString(dataTypes)}" @@ -59,8 +59,7 @@ trait GenerateMultiColumnDataTypeTestAction } } - def init(): Unit = { - val tableName = getTableName(dataTypes.map(getTypeName): _*) + def init(tableName: String): Unit = { val dataTypesWithDescription = dataTypes.map { dataType => val len = genLen(dataType) (dataType, len, "") @@ -70,10 +69,12 @@ trait GenerateMultiColumnDataTypeTestAction data.save() } - def loadTestData(dataTypes: List[ReflectedDataType]): Unit + def loadTestData(tableName: String): Unit def test(): Unit = { - init() - loadTestData(dataTypes) + cols = dataTypes + val tableName = getTableName(dataTypes.map(getTypeName): _*) + init(tableName) + loadTestData(tableName) } } diff --git a/core/src/test/scala/org/apache/spark/sql/types/MultiColumnDataTypeSuite.scala b/core/src/test/scala/org/apache/spark/sql/types/MultiColumnDataTypeSuite.scala index c24e5220aa..d000571446 100644 
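A note on the generated table names used by these multi-column suites: getTableName now hashes the list of column type names and takes the absolute value of the hash, avoiding a leading '-' in the table identifier. A minimal sketch of the naming scheme (the type names here are hypothetical, and the concrete number depends on Seq.hashCode):

    val typeNames = Seq("int", "varchar")                        // hypothetical input
    val tableName = s"test_${Math.abs(typeNames.hashCode())}"    // test_<non-negative hash>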
--- a/core/src/test/scala/org/apache/spark/sql/types/MultiColumnDataTypeSuite.scala +++ b/core/src/test/scala/org/apache/spark/sql/types/MultiColumnDataTypeSuite.scala @@ -37,8 +37,20 @@ class MultiColumnDataTypeSuite testDesc ) - def startTest(dataTypes: List[ReflectedDataType]): Unit = { - simpleSelect(database, dataTypes: _*) + override def startTest(dataTypes: List[ReflectedDataType]): Unit = { + val typeNames = dataTypes.map(getTypeName) + val tblName = generator.getTableName(typeNames: _*) + val columnNames = typeNames.zipWithIndex.map { x => + generator.getColumnNameByOffset(x._2) + } + for (i <- columnNames.indices) { + val col1 = columnNames(i) + for (j <- i + 1 until columnNames.size) { + val col2 = columnNames(j) + val dataType2 = dataTypes(j) + simpleSelect(database, tblName, col1, col2, dataType2) + } + } } def check(): Unit = { diff --git a/core/src/test/scala/org/apache/spark/sql/types/MultiColumnDataTypeTest.scala b/core/src/test/scala/org/apache/spark/sql/types/MultiColumnDataTypeTest.scala index af4cef6e6f..f8150b8575 100644 --- a/core/src/test/scala/org/apache/spark/sql/types/MultiColumnDataTypeTest.scala +++ b/core/src/test/scala/org/apache/spark/sql/types/MultiColumnDataTypeTest.scala @@ -52,21 +52,16 @@ trait MultiColumnDataTypeTest extends BaseTiSparkTest { } } - def simpleSelect(dbName: String, dataTypes: ReflectedDataType*): Unit = { - val typeNames = dataTypes.map(getTypeName) - val tblName = generator.getTableName(typeNames: _*) - val columnNames = typeNames.map(generator.getColumnName) - for (i <- columnNames.indices) { - for (j <- i + 1 until columnNames.size) { - val col = columnNames(j) - val types = dataTypes(j) - for ((op, value) <- getOperations(types)) { - val query = s"select ${columnNames(i)} from $tblName where $col $op $value" - test(query) { - setCurrentDatabase(dbName) - runTest(query) - } - } + def simpleSelect(dbName: String, + tableName: String, + col1: String, + col2: String, + dataType: ReflectedDataType): Unit = { + for ((op, value) <- getOperations(dataType)) { + val query = s"select $col1 from $tableName where $col2 $op $value" + test(query) { + setCurrentDatabase(dbName) + runTest(query) } } } diff --git a/core/src/test/scala/org/apache/spark/sql/types/RunMultiColumnDataTypeTestAction.scala b/core/src/test/scala/org/apache/spark/sql/types/RunMultiColumnDataTypeTestAction.scala index 1debe17094..52b328f7e7 100644 --- a/core/src/test/scala/org/apache/spark/sql/types/RunMultiColumnDataTypeTestAction.scala +++ b/core/src/test/scala/org/apache/spark/sql/types/RunMultiColumnDataTypeTestAction.scala @@ -21,7 +21,7 @@ import org.apache.spark.sql.test.generator.DataType.ReflectedDataType trait RunMultiColumnDataTypeTestAction extends MultiColumnDataTypeTestSpec { - def startTest(dataTypes: List[ReflectedDataType]): Unit + def startTest(dataTypes: List[ReflectedDataType]): Unit = ??? def test(): Unit = { startTest(dataTypes) diff --git a/core/src/test/scala/org/apache/spark/sql/types/pk/GenerateMultiColumnPKDataTypeTestAction.scala b/core/src/test/scala/org/apache/spark/sql/types/pk/GenerateMultiColumnPKDataTypeTestAction.scala new file mode 100644 index 0000000000..3999495416 --- /dev/null +++ b/core/src/test/scala/org/apache/spark/sql/types/pk/GenerateMultiColumnPKDataTypeTestAction.scala @@ -0,0 +1,112 @@ +/* + * + * Copyright 2019 PingCAP, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * See the License for the specific language governing permissions and + * limitations under the License. + * + */ + +package org.apache.spark.sql.types.pk + +import org.apache.spark.sql.test.generator.DataType.{getTypeName, ReflectedDataType} +import org.apache.spark.sql.test.generator.TestDataGenerator.{isStringType, randomDataGenerator, schemaGenerator} +import org.apache.spark.sql.test.generator._ +import org.apache.spark.sql.types.GenerateMultiColumnDataTypeTestAction + +import scala.collection.mutable +import scala.util.Random + +trait GenerateMultiColumnPKDataTypeTestAction extends GenerateMultiColumnDataTypeTestAction { + + override val rowCount: Int = 10 + + private val dataTypesWithDescription = dataTypes.map { genDescription } + + private def genIndex(dataTypesWithDescription: List[(ReflectedDataType, String, String)], + r: Random): List[Index] = { + assert( + dataTypesWithDescription.size >= 2, + "column size should be at least 2 for multi-column tests" + ) + val result: mutable.ListBuffer[IndexColumn] = new mutable.ListBuffer[IndexColumn]() + for (i <- 0 until 2) { + val d = dataTypesWithDescription(i)._1 + if (isStringType(d)) { + result += PrefixColumn(i + 1, r.nextInt(4) + 2) + } else { + result += DefaultColumn(i + 1) + } + } + List(PrimaryKey(result.toList)) + } + + override def genSchema( + tableName: String, + dataTypesWithDescription: List[(ReflectedDataType, String, String)] + ): Schema = { + val index = genIndex(dataTypesWithDescription, r) + schemaGenerator( + database, + tableName, + r, + dataTypesWithDescription, + index + ) + } + + override def genData(schema: Schema): Data = { + val pk = schema.pkColumnName.split(",", -1) + assert( + pk.nonEmpty && pk.head.nonEmpty, + "Schema incorrect for PK tests, must contain valid PK info" + ) + val cnt: Int = Math.min( + (schema.pkIndexInfo.head.indexColumns.map { + case b if b.column.contains("col_bit") => if (b.length == null) 2 else 1 << b.length.toInt + case b if b.column.contains("col_boolean") => 2 + case i if i.column.contains("col_tinyint") => 256 + case _ => 500 + }.product + 2) / 3, + rowCount + ) + assert(cnt > 0, "row count should be greater than 0") + randomDataGenerator(schema, cnt, dataTypeTestDir, r) + } + + def genDescription(dataType: ReflectedDataType): (ReflectedDataType, String, String) = { + val len = genLen(dataType) + (dataType, len, "") + } + + def genDescriptionNotNullable( + dataType: ReflectedDataType + ): (ReflectedDataType, String, String) = { + val len = genLen(dataType) + (dataType, len, "not null") + } + + def init(tableName: String, i: Int, j: Int): Unit = { + val schema = genSchema( + tableName, + List(genDescriptionNotNullable(dataTypes(i)), genDescriptionNotNullable(dataTypes(j))) ++ dataTypesWithDescription + ) + val data = genData(schema) + data.save() + } + + def test(i: Int, j: Int): Unit = { + cols = List(dataTypes(i), dataTypes(j)) ++ dataTypes + val tableName = getTableName(cols.map(getTypeName): _*) + init(tableName, i, j) + loadTestData(tableName) + } +} diff --git a/core/src/test/scala/org/apache/spark/sql/types/pk/MultiColumnDataTypePKGenerator.scala b/core/src/test/scala/org/apache/spark/sql/types/pk/MultiColumnDataTypePKGenerator.scala new file mode 100644 index 0000000000..7566ee0f09 --- /dev/null +++ 
b/core/src/test/scala/org/apache/spark/sql/types/pk/MultiColumnDataTypePKGenerator.scala @@ -0,0 +1,18 @@ +package org.apache.spark.sql.types.pk + +import org.apache.spark.sql.BaseTiSparkTest +import org.apache.spark.sql.test.generator.DataType.ReflectedDataType + +case class MultiColumnDataTypePKGenerator(dataTypes: List[ReflectedDataType], + unsignedDataTypes: List[ReflectedDataType], + dataTypeTestDir: String, + database: String, + testDesc: String) + extends BaseTiSparkTest + with GenerateMultiColumnPKDataTypeTestAction { + + def loadTestData(tableName: String): Unit = { + logger.info(s"${preDescription}Test $tableName - $testDesc") + loadSQLFile(dataTypeTestDir, tableName) + } +} diff --git a/core/src/test/scala/org/apache/spark/sql/types/pk/MultiColumnPKDataTypeSuite00.scala b/core/src/test/scala/org/apache/spark/sql/types/pk/MultiColumnPKDataTypeSuite00.scala new file mode 100644 index 0000000000..2d9abdd2a3 --- /dev/null +++ b/core/src/test/scala/org/apache/spark/sql/types/pk/MultiColumnPKDataTypeSuite00.scala @@ -0,0 +1,24 @@ +/* + * + * Copyright 2019 PingCAP, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * See the License for the specific language governing permissions and + * limitations under the License. + * + */ + +package org.apache.spark.sql.types.pk + +class MultiColumnPKDataTypeSuite00 extends MultiColumnPKDataTypeSuites { + override val currentTest: Seq[(Int, Int)] = tests(getId) + + test() +} diff --git a/core/src/test/scala/org/apache/spark/sql/types/pk/MultiColumnPKDataTypeSuite01.scala b/core/src/test/scala/org/apache/spark/sql/types/pk/MultiColumnPKDataTypeSuite01.scala new file mode 100644 index 0000000000..cc7c7e82d0 --- /dev/null +++ b/core/src/test/scala/org/apache/spark/sql/types/pk/MultiColumnPKDataTypeSuite01.scala @@ -0,0 +1,24 @@ +/* + * + * Copyright 2019 PingCAP, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * See the License for the specific language governing permissions and + * limitations under the License. + * + */ + +package org.apache.spark.sql.types.pk + +class MultiColumnPKDataTypeSuite01 extends MultiColumnPKDataTypeSuites { + override val currentTest: Seq[(Int, Int)] = tests(getId) + + test() +} diff --git a/core/src/test/scala/org/apache/spark/sql/types/pk/MultiColumnPKDataTypeSuite02.scala b/core/src/test/scala/org/apache/spark/sql/types/pk/MultiColumnPKDataTypeSuite02.scala new file mode 100644 index 0000000000..77b2f03911 --- /dev/null +++ b/core/src/test/scala/org/apache/spark/sql/types/pk/MultiColumnPKDataTypeSuite02.scala @@ -0,0 +1,24 @@ +/* + * + * Copyright 2019 PingCAP, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * See the License for the specific language governing permissions and + * limitations under the License. + * + */ + +package org.apache.spark.sql.types.pk + +class MultiColumnPKDataTypeSuite02 extends MultiColumnPKDataTypeSuites { + override val currentTest: Seq[(Int, Int)] = tests(getId) + + test() +} diff --git a/core/src/test/scala/org/apache/spark/sql/types/pk/MultiColumnPKDataTypeSuite03.scala b/core/src/test/scala/org/apache/spark/sql/types/pk/MultiColumnPKDataTypeSuite03.scala new file mode 100644 index 0000000000..70444da17c --- /dev/null +++ b/core/src/test/scala/org/apache/spark/sql/types/pk/MultiColumnPKDataTypeSuite03.scala @@ -0,0 +1,24 @@ +/* + * + * Copyright 2019 PingCAP, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * See the License for the specific language governing permissions and + * limitations under the License. + * + */ + +package org.apache.spark.sql.types.pk + +class MultiColumnPKDataTypeSuite03 extends MultiColumnPKDataTypeSuites { + override val currentTest: Seq[(Int, Int)] = tests(getId) + + test() +} diff --git a/core/src/test/scala/org/apache/spark/sql/types/pk/MultiColumnPKDataTypeSuite04.scala b/core/src/test/scala/org/apache/spark/sql/types/pk/MultiColumnPKDataTypeSuite04.scala new file mode 100644 index 0000000000..1c4d935c11 --- /dev/null +++ b/core/src/test/scala/org/apache/spark/sql/types/pk/MultiColumnPKDataTypeSuite04.scala @@ -0,0 +1,24 @@ +/* + * + * Copyright 2019 PingCAP, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * See the License for the specific language governing permissions and + * limitations under the License. + * + */ + +package org.apache.spark.sql.types.pk + +class MultiColumnPKDataTypeSuite04 extends MultiColumnPKDataTypeSuites { + override val currentTest: Seq[(Int, Int)] = tests(getId) + + test() +} diff --git a/core/src/test/scala/org/apache/spark/sql/types/pk/MultiColumnPKDataTypeSuite05.scala b/core/src/test/scala/org/apache/spark/sql/types/pk/MultiColumnPKDataTypeSuite05.scala new file mode 100644 index 0000000000..a46968f038 --- /dev/null +++ b/core/src/test/scala/org/apache/spark/sql/types/pk/MultiColumnPKDataTypeSuite05.scala @@ -0,0 +1,24 @@ +/* + * + * Copyright 2019 PingCAP, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * See the License for the specific language governing permissions and + * limitations under the License. + * + */ + +package org.apache.spark.sql.types.pk + +class MultiColumnPKDataTypeSuite05 extends MultiColumnPKDataTypeSuites { + override val currentTest: Seq[(Int, Int)] = tests(getId) + + test() +} diff --git a/core/src/test/scala/org/apache/spark/sql/types/pk/MultiColumnPKDataTypeSuite06.scala b/core/src/test/scala/org/apache/spark/sql/types/pk/MultiColumnPKDataTypeSuite06.scala new file mode 100644 index 0000000000..aacdf8c78c --- /dev/null +++ b/core/src/test/scala/org/apache/spark/sql/types/pk/MultiColumnPKDataTypeSuite06.scala @@ -0,0 +1,24 @@ +/* + * + * Copyright 2019 PingCAP, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * See the License for the specific language governing permissions and + * limitations under the License. + * + */ + +package org.apache.spark.sql.types.pk + +class MultiColumnPKDataTypeSuite06 extends MultiColumnPKDataTypeSuites { + override val currentTest: Seq[(Int, Int)] = tests(getId) + + test() +} diff --git a/core/src/test/scala/org/apache/spark/sql/types/pk/MultiColumnPKDataTypeSuite07.scala b/core/src/test/scala/org/apache/spark/sql/types/pk/MultiColumnPKDataTypeSuite07.scala new file mode 100644 index 0000000000..1a5633ba96 --- /dev/null +++ b/core/src/test/scala/org/apache/spark/sql/types/pk/MultiColumnPKDataTypeSuite07.scala @@ -0,0 +1,24 @@ +/* + * + * Copyright 2019 PingCAP, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * See the License for the specific language governing permissions and + * limitations under the License. + * + */ + +package org.apache.spark.sql.types.pk + +class MultiColumnPKDataTypeSuite07 extends MultiColumnPKDataTypeSuites { + override val currentTest: Seq[(Int, Int)] = tests(getId) + + test() +} diff --git a/core/src/test/scala/org/apache/spark/sql/types/pk/MultiColumnPKDataTypeSuite08.scala b/core/src/test/scala/org/apache/spark/sql/types/pk/MultiColumnPKDataTypeSuite08.scala new file mode 100644 index 0000000000..6454db98a9 --- /dev/null +++ b/core/src/test/scala/org/apache/spark/sql/types/pk/MultiColumnPKDataTypeSuite08.scala @@ -0,0 +1,24 @@ +/* + * + * Copyright 2019 PingCAP, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * See the License for the specific language governing permissions and + * limitations under the License. + * + */ + +package org.apache.spark.sql.types.pk + +class MultiColumnPKDataTypeSuite08 extends MultiColumnPKDataTypeSuites { + override val currentTest: Seq[(Int, Int)] = tests(getId) + + test() +} diff --git a/core/src/test/scala/org/apache/spark/sql/types/pk/MultiColumnPKDataTypeSuite09.scala b/core/src/test/scala/org/apache/spark/sql/types/pk/MultiColumnPKDataTypeSuite09.scala new file mode 100644 index 0000000000..4f9f326a9a --- /dev/null +++ b/core/src/test/scala/org/apache/spark/sql/types/pk/MultiColumnPKDataTypeSuite09.scala @@ -0,0 +1,24 @@ +/* + * + * Copyright 2019 PingCAP, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * See the License for the specific language governing permissions and + * limitations under the License. + * + */ + +package org.apache.spark.sql.types.pk + +class MultiColumnPKDataTypeSuite09 extends MultiColumnPKDataTypeSuites { + override val currentTest: Seq[(Int, Int)] = tests(getId) + + test() +} diff --git a/core/src/test/scala/org/apache/spark/sql/types/pk/MultiColumnPKDataTypeSuite10.scala b/core/src/test/scala/org/apache/spark/sql/types/pk/MultiColumnPKDataTypeSuite10.scala new file mode 100644 index 0000000000..cb819bfe7d --- /dev/null +++ b/core/src/test/scala/org/apache/spark/sql/types/pk/MultiColumnPKDataTypeSuite10.scala @@ -0,0 +1,24 @@ +/* + * + * Copyright 2019 PingCAP, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * See the License for the specific language governing permissions and + * limitations under the License. + * + */ + +package org.apache.spark.sql.types.pk + +class MultiColumnPKDataTypeSuite10 extends MultiColumnPKDataTypeSuites { + override val currentTest: Seq[(Int, Int)] = tests(getId) + + test() +} diff --git a/core/src/test/scala/org/apache/spark/sql/types/pk/MultiColumnPKDataTypeSuite11.scala b/core/src/test/scala/org/apache/spark/sql/types/pk/MultiColumnPKDataTypeSuite11.scala new file mode 100644 index 0000000000..f96e216bad --- /dev/null +++ b/core/src/test/scala/org/apache/spark/sql/types/pk/MultiColumnPKDataTypeSuite11.scala @@ -0,0 +1,24 @@ +/* + * + * Copyright 2019 PingCAP, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * See the License for the specific language governing permissions and + * limitations under the License. + * + */ + +package org.apache.spark.sql.types.pk + +class MultiColumnPKDataTypeSuite11 extends MultiColumnPKDataTypeSuites { + override val currentTest: Seq[(Int, Int)] = tests(getId) + + test() +} diff --git a/core/src/test/scala/org/apache/spark/sql/types/pk/MultiColumnPKDataTypeSuite12.scala b/core/src/test/scala/org/apache/spark/sql/types/pk/MultiColumnPKDataTypeSuite12.scala new file mode 100644 index 0000000000..e6a985621a --- /dev/null +++ b/core/src/test/scala/org/apache/spark/sql/types/pk/MultiColumnPKDataTypeSuite12.scala @@ -0,0 +1,24 @@ +/* + * + * Copyright 2019 PingCAP, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * See the License for the specific language governing permissions and + * limitations under the License. + * + */ + +package org.apache.spark.sql.types.pk + +class MultiColumnPKDataTypeSuite12 extends MultiColumnPKDataTypeSuites { + override val currentTest: Seq[(Int, Int)] = tests(getId) + + test() +} diff --git a/core/src/test/scala/org/apache/spark/sql/types/pk/MultiColumnPKDataTypeSuite13.scala b/core/src/test/scala/org/apache/spark/sql/types/pk/MultiColumnPKDataTypeSuite13.scala new file mode 100644 index 0000000000..bc70ef070e --- /dev/null +++ b/core/src/test/scala/org/apache/spark/sql/types/pk/MultiColumnPKDataTypeSuite13.scala @@ -0,0 +1,24 @@ +/* + * + * Copyright 2019 PingCAP, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * See the License for the specific language governing permissions and + * limitations under the License. + * + */ + +package org.apache.spark.sql.types.pk + +class MultiColumnPKDataTypeSuite13 extends MultiColumnPKDataTypeSuites { + override val currentTest: Seq[(Int, Int)] = tests(getId) + + test() +} diff --git a/core/src/test/scala/org/apache/spark/sql/types/pk/MultiColumnPKDataTypeSuite14.scala b/core/src/test/scala/org/apache/spark/sql/types/pk/MultiColumnPKDataTypeSuite14.scala new file mode 100644 index 0000000000..db8e3aad7e --- /dev/null +++ b/core/src/test/scala/org/apache/spark/sql/types/pk/MultiColumnPKDataTypeSuite14.scala @@ -0,0 +1,24 @@ +/* + * + * Copyright 2019 PingCAP, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * See the License for the specific language governing permissions and + * limitations under the License. + * + */ + +package org.apache.spark.sql.types.pk + +class MultiColumnPKDataTypeSuite14 extends MultiColumnPKDataTypeSuites { + override val currentTest: Seq[(Int, Int)] = tests(getId) + + test() +} diff --git a/core/src/test/scala/org/apache/spark/sql/types/pk/MultiColumnPKDataTypeSuite15.scala b/core/src/test/scala/org/apache/spark/sql/types/pk/MultiColumnPKDataTypeSuite15.scala new file mode 100644 index 0000000000..0801af0871 --- /dev/null +++ b/core/src/test/scala/org/apache/spark/sql/types/pk/MultiColumnPKDataTypeSuite15.scala @@ -0,0 +1,24 @@ +/* + * + * Copyright 2019 PingCAP, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * See the License for the specific language governing permissions and + * limitations under the License. + * + */ + +package org.apache.spark.sql.types.pk + +class MultiColumnPKDataTypeSuite15 extends MultiColumnPKDataTypeSuites { + override val currentTest: Seq[(Int, Int)] = tests(getId) + + test() +} diff --git a/core/src/test/scala/org/apache/spark/sql/types/pk/MultiColumnPKDataTypeSuite16.scala b/core/src/test/scala/org/apache/spark/sql/types/pk/MultiColumnPKDataTypeSuite16.scala new file mode 100644 index 0000000000..99d302bfda --- /dev/null +++ b/core/src/test/scala/org/apache/spark/sql/types/pk/MultiColumnPKDataTypeSuite16.scala @@ -0,0 +1,24 @@ +/* + * + * Copyright 2019 PingCAP, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * See the License for the specific language governing permissions and + * limitations under the License. + * + */ + +package org.apache.spark.sql.types.pk + +class MultiColumnPKDataTypeSuite16 extends MultiColumnPKDataTypeSuites { + override val currentTest: Seq[(Int, Int)] = tests(getId) + + test() +} diff --git a/core/src/test/scala/org/apache/spark/sql/types/pk/MultiColumnPKDataTypeSuite17.scala b/core/src/test/scala/org/apache/spark/sql/types/pk/MultiColumnPKDataTypeSuite17.scala new file mode 100644 index 0000000000..32ce47628f --- /dev/null +++ b/core/src/test/scala/org/apache/spark/sql/types/pk/MultiColumnPKDataTypeSuite17.scala @@ -0,0 +1,24 @@ +/* + * + * Copyright 2019 PingCAP, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * See the License for the specific language governing permissions and + * limitations under the License. + * + */ + +package org.apache.spark.sql.types.pk + +class MultiColumnPKDataTypeSuite17 extends MultiColumnPKDataTypeSuites { + override val currentTest: Seq[(Int, Int)] = tests(getId) + + test() +} diff --git a/core/src/test/scala/org/apache/spark/sql/types/pk/MultiColumnPKDataTypeSuite18.scala b/core/src/test/scala/org/apache/spark/sql/types/pk/MultiColumnPKDataTypeSuite18.scala new file mode 100644 index 0000000000..b3d226edcf --- /dev/null +++ b/core/src/test/scala/org/apache/spark/sql/types/pk/MultiColumnPKDataTypeSuite18.scala @@ -0,0 +1,24 @@ +/* + * + * Copyright 2019 PingCAP, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * See the License for the specific language governing permissions and + * limitations under the License. + * + */ + +package org.apache.spark.sql.types.pk + +class MultiColumnPKDataTypeSuite18 extends MultiColumnPKDataTypeSuites { + override val currentTest: Seq[(Int, Int)] = tests(getId) + + test() +} diff --git a/core/src/test/scala/org/apache/spark/sql/types/pk/MultiColumnPKDataTypeSuite19.scala b/core/src/test/scala/org/apache/spark/sql/types/pk/MultiColumnPKDataTypeSuite19.scala new file mode 100644 index 0000000000..132e28d6f8 --- /dev/null +++ b/core/src/test/scala/org/apache/spark/sql/types/pk/MultiColumnPKDataTypeSuite19.scala @@ -0,0 +1,24 @@ +/* + * + * Copyright 2019 PingCAP, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * See the License for the specific language governing permissions and + * limitations under the License. + * + */ + +package org.apache.spark.sql.types.pk + +class MultiColumnPKDataTypeSuite19 extends MultiColumnPKDataTypeSuites { + override val currentTest: Seq[(Int, Int)] = tests(getId) + + test() +} diff --git a/core/src/test/scala/org/apache/spark/sql/types/pk/MultiColumnPKDataTypeSuite20.scala b/core/src/test/scala/org/apache/spark/sql/types/pk/MultiColumnPKDataTypeSuite20.scala new file mode 100644 index 0000000000..910be9c77c --- /dev/null +++ b/core/src/test/scala/org/apache/spark/sql/types/pk/MultiColumnPKDataTypeSuite20.scala @@ -0,0 +1,24 @@ +/* + * + * Copyright 2019 PingCAP, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * See the License for the specific language governing permissions and + * limitations under the License. + * + */ + +package org.apache.spark.sql.types.pk + +class MultiColumnPKDataTypeSuite20 extends MultiColumnPKDataTypeSuites { + override val currentTest: Seq[(Int, Int)] = tests(getId) + + test() +} diff --git a/core/src/test/scala/org/apache/spark/sql/types/pk/MultiColumnPKDataTypeSuite21.scala b/core/src/test/scala/org/apache/spark/sql/types/pk/MultiColumnPKDataTypeSuite21.scala new file mode 100644 index 0000000000..e3f3a5ab6b --- /dev/null +++ b/core/src/test/scala/org/apache/spark/sql/types/pk/MultiColumnPKDataTypeSuite21.scala @@ -0,0 +1,24 @@ +/* + * + * Copyright 2019 PingCAP, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * See the License for the specific language governing permissions and + * limitations under the License. + * + */ + +package org.apache.spark.sql.types.pk + +class MultiColumnPKDataTypeSuite21 extends MultiColumnPKDataTypeSuites { + override val currentTest: Seq[(Int, Int)] = tests(getId) + + test() +} diff --git a/core/src/test/scala/org/apache/spark/sql/types/pk/MultiColumnPKDataTypeSuite22.scala b/core/src/test/scala/org/apache/spark/sql/types/pk/MultiColumnPKDataTypeSuite22.scala new file mode 100644 index 0000000000..c2d54962fa --- /dev/null +++ b/core/src/test/scala/org/apache/spark/sql/types/pk/MultiColumnPKDataTypeSuite22.scala @@ -0,0 +1,24 @@ +/* + * + * Copyright 2019 PingCAP, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * See the License for the specific language governing permissions and + * limitations under the License. + * + */ + +package org.apache.spark.sql.types.pk + +class MultiColumnPKDataTypeSuite22 extends MultiColumnPKDataTypeSuites { + override val currentTest: Seq[(Int, Int)] = tests(getId) + + test() +} diff --git a/core/src/test/scala/org/apache/spark/sql/types/pk/MultiColumnPKDataTypeSuite23.scala b/core/src/test/scala/org/apache/spark/sql/types/pk/MultiColumnPKDataTypeSuite23.scala new file mode 100644 index 0000000000..26800079be --- /dev/null +++ b/core/src/test/scala/org/apache/spark/sql/types/pk/MultiColumnPKDataTypeSuite23.scala @@ -0,0 +1,24 @@ +/* + * + * Copyright 2019 PingCAP, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * See the License for the specific language governing permissions and + * limitations under the License. + * + */ + +package org.apache.spark.sql.types.pk + +class MultiColumnPKDataTypeSuite23 extends MultiColumnPKDataTypeSuites { + override val currentTest: Seq[(Int, Int)] = tests(getId) + + test() +} diff --git a/core/src/test/scala/org/apache/spark/sql/types/pk/MultiColumnPKDataTypeSuite24.scala b/core/src/test/scala/org/apache/spark/sql/types/pk/MultiColumnPKDataTypeSuite24.scala new file mode 100644 index 0000000000..cf8c100d74 --- /dev/null +++ b/core/src/test/scala/org/apache/spark/sql/types/pk/MultiColumnPKDataTypeSuite24.scala @@ -0,0 +1,24 @@ +/* + * + * Copyright 2019 PingCAP, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * See the License for the specific language governing permissions and + * limitations under the License. + * + */ + +package org.apache.spark.sql.types.pk + +class MultiColumnPKDataTypeSuite24 extends MultiColumnPKDataTypeSuites { + override val currentTest: Seq[(Int, Int)] = tests(getId) + + test() +} diff --git a/core/src/test/scala/org/apache/spark/sql/types/pk/MultiColumnPKDataTypeSuite25.scala b/core/src/test/scala/org/apache/spark/sql/types/pk/MultiColumnPKDataTypeSuite25.scala new file mode 100644 index 0000000000..d83fa35281 --- /dev/null +++ b/core/src/test/scala/org/apache/spark/sql/types/pk/MultiColumnPKDataTypeSuite25.scala @@ -0,0 +1,24 @@ +/* + * + * Copyright 2019 PingCAP, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * See the License for the specific language governing permissions and + * limitations under the License. + * + */ + +package org.apache.spark.sql.types.pk + +class MultiColumnPKDataTypeSuite25 extends MultiColumnPKDataTypeSuites { + override val currentTest: Seq[(Int, Int)] = tests(getId) + + test() +} diff --git a/core/src/test/scala/org/apache/spark/sql/types/pk/MultiColumnPKDataTypeSuite26.scala b/core/src/test/scala/org/apache/spark/sql/types/pk/MultiColumnPKDataTypeSuite26.scala new file mode 100644 index 0000000000..0f3177fecc --- /dev/null +++ b/core/src/test/scala/org/apache/spark/sql/types/pk/MultiColumnPKDataTypeSuite26.scala @@ -0,0 +1,24 @@ +/* + * + * Copyright 2019 PingCAP, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * See the License for the specific language governing permissions and + * limitations under the License. + * + */ + +package org.apache.spark.sql.types.pk + +class MultiColumnPKDataTypeSuite26 extends MultiColumnPKDataTypeSuites { + override val currentTest: Seq[(Int, Int)] = tests(getId) + + test() +} diff --git a/core/src/test/scala/org/apache/spark/sql/types/pk/MultiColumnPKDataTypeSuite27.scala b/core/src/test/scala/org/apache/spark/sql/types/pk/MultiColumnPKDataTypeSuite27.scala new file mode 100644 index 0000000000..11dea5ed2e --- /dev/null +++ b/core/src/test/scala/org/apache/spark/sql/types/pk/MultiColumnPKDataTypeSuite27.scala @@ -0,0 +1,24 @@ +/* + * + * Copyright 2019 PingCAP, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * See the License for the specific language governing permissions and + * limitations under the License. + * + */ + +package org.apache.spark.sql.types.pk + +class MultiColumnPKDataTypeSuite27 extends MultiColumnPKDataTypeSuites { + override val currentTest: Seq[(Int, Int)] = tests(getId) + + test() +} diff --git a/core/src/test/scala/org/apache/spark/sql/types/pk/MultiColumnPKDataTypeSuite28.scala b/core/src/test/scala/org/apache/spark/sql/types/pk/MultiColumnPKDataTypeSuite28.scala new file mode 100644 index 0000000000..94a0249abe --- /dev/null +++ b/core/src/test/scala/org/apache/spark/sql/types/pk/MultiColumnPKDataTypeSuite28.scala @@ -0,0 +1,24 @@ +/* + * + * Copyright 2019 PingCAP, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * See the License for the specific language governing permissions and + * limitations under the License. + * + */ + +package org.apache.spark.sql.types.pk + +class MultiColumnPKDataTypeSuite28 extends MultiColumnPKDataTypeSuites { + override val currentTest: Seq[(Int, Int)] = tests(getId) + + test() +} diff --git a/core/src/test/scala/org/apache/spark/sql/types/pk/MultiColumnPKDataTypeSuite29.scala b/core/src/test/scala/org/apache/spark/sql/types/pk/MultiColumnPKDataTypeSuite29.scala new file mode 100644 index 0000000000..5871f3fc0a --- /dev/null +++ b/core/src/test/scala/org/apache/spark/sql/types/pk/MultiColumnPKDataTypeSuite29.scala @@ -0,0 +1,24 @@ +/* + * + * Copyright 2019 PingCAP, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * See the License for the specific language governing permissions and + * limitations under the License. + * + */ + +package org.apache.spark.sql.types.pk + +class MultiColumnPKDataTypeSuite29 extends MultiColumnPKDataTypeSuites { + override val currentTest: Seq[(Int, Int)] = tests(getId) + + test() +} diff --git a/core/src/test/scala/org/apache/spark/sql/types/pk/MultiColumnPKDataTypeSuite30.scala b/core/src/test/scala/org/apache/spark/sql/types/pk/MultiColumnPKDataTypeSuite30.scala new file mode 100644 index 0000000000..5e7891091d --- /dev/null +++ b/core/src/test/scala/org/apache/spark/sql/types/pk/MultiColumnPKDataTypeSuite30.scala @@ -0,0 +1,24 @@ +/* + * + * Copyright 2019 PingCAP, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * See the License for the specific language governing permissions and + * limitations under the License. + * + */ + +package org.apache.spark.sql.types.pk + +class MultiColumnPKDataTypeSuite30 extends MultiColumnPKDataTypeSuites { + override val currentTest: Seq[(Int, Int)] = tests(getId) + + test() +} diff --git a/core/src/test/scala/org/apache/spark/sql/types/pk/MultiColumnPKDataTypeSuite31.scala b/core/src/test/scala/org/apache/spark/sql/types/pk/MultiColumnPKDataTypeSuite31.scala new file mode 100644 index 0000000000..9a12eff888 --- /dev/null +++ b/core/src/test/scala/org/apache/spark/sql/types/pk/MultiColumnPKDataTypeSuite31.scala @@ -0,0 +1,24 @@ +/* + * + * Copyright 2019 PingCAP, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * See the License for the specific language governing permissions and + * limitations under the License. + * + */ + +package org.apache.spark.sql.types.pk + +class MultiColumnPKDataTypeSuite31 extends MultiColumnPKDataTypeSuites { + override val currentTest: Seq[(Int, Int)] = tests(getId) + + test() +} diff --git a/core/src/test/scala/org/apache/spark/sql/types/pk/MultiColumnPKDataTypeSuite32.scala b/core/src/test/scala/org/apache/spark/sql/types/pk/MultiColumnPKDataTypeSuite32.scala new file mode 100644 index 0000000000..0e720c9e08 --- /dev/null +++ b/core/src/test/scala/org/apache/spark/sql/types/pk/MultiColumnPKDataTypeSuite32.scala @@ -0,0 +1,24 @@ +/* + * + * Copyright 2019 PingCAP, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * See the License for the specific language governing permissions and + * limitations under the License. + * + */ + +package org.apache.spark.sql.types.pk + +class MultiColumnPKDataTypeSuite32 extends MultiColumnPKDataTypeSuites { + override val currentTest: Seq[(Int, Int)] = tests(getId) + + test() +} diff --git a/core/src/test/scala/org/apache/spark/sql/types/pk/MultiColumnPKDataTypeSuite33.scala b/core/src/test/scala/org/apache/spark/sql/types/pk/MultiColumnPKDataTypeSuite33.scala new file mode 100644 index 0000000000..caf4f1ac8f --- /dev/null +++ b/core/src/test/scala/org/apache/spark/sql/types/pk/MultiColumnPKDataTypeSuite33.scala @@ -0,0 +1,24 @@ +/* + * + * Copyright 2019 PingCAP, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * See the License for the specific language governing permissions and + * limitations under the License. + * + */ + +package org.apache.spark.sql.types.pk + +class MultiColumnPKDataTypeSuite33 extends MultiColumnPKDataTypeSuites { + override val currentTest: Seq[(Int, Int)] = tests(getId) + + test() +} diff --git a/core/src/test/scala/org/apache/spark/sql/types/pk/MultiColumnPKDataTypeSuite34.scala b/core/src/test/scala/org/apache/spark/sql/types/pk/MultiColumnPKDataTypeSuite34.scala new file mode 100644 index 0000000000..54f2438eb7 --- /dev/null +++ b/core/src/test/scala/org/apache/spark/sql/types/pk/MultiColumnPKDataTypeSuite34.scala @@ -0,0 +1,24 @@ +/* + * + * Copyright 2019 PingCAP, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * See the License for the specific language governing permissions and + * limitations under the License. + * + */ + +package org.apache.spark.sql.types.pk + +class MultiColumnPKDataTypeSuite34 extends MultiColumnPKDataTypeSuites { + override val currentTest: Seq[(Int, Int)] = tests(getId) + + test() +} diff --git a/core/src/test/scala/org/apache/spark/sql/types/pk/MultiColumnPKDataTypeSuite35.scala b/core/src/test/scala/org/apache/spark/sql/types/pk/MultiColumnPKDataTypeSuite35.scala new file mode 100644 index 0000000000..044c7ad2a9 --- /dev/null +++ b/core/src/test/scala/org/apache/spark/sql/types/pk/MultiColumnPKDataTypeSuite35.scala @@ -0,0 +1,24 @@ +/* + * + * Copyright 2019 PingCAP, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * See the License for the specific language governing permissions and + * limitations under the License. + * + */ + +package org.apache.spark.sql.types.pk + +class MultiColumnPKDataTypeSuite35 extends MultiColumnPKDataTypeSuites { + override val currentTest: Seq[(Int, Int)] = tests(getId) + + test() +} diff --git a/core/src/test/scala/org/apache/spark/sql/types/pk/MultiColumnPKDataTypeSuites.scala b/core/src/test/scala/org/apache/spark/sql/types/pk/MultiColumnPKDataTypeSuites.scala new file mode 100644 index 0000000000..af5296a3be --- /dev/null +++ b/core/src/test/scala/org/apache/spark/sql/types/pk/MultiColumnPKDataTypeSuites.scala @@ -0,0 +1,98 @@ +/* + * + * Copyright 2019 PingCAP, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * See the License for the specific language governing permissions and + * limitations under the License. + * + */ + +package org.apache.spark.sql.types.pk + +import org.apache.spark.sql.test.generator.DataType.{getTypeName, BIGINT, INT, ReflectedDataType} +import org.apache.spark.sql.test.generator.TestDataGenerator._ +import org.apache.spark.sql.types.{MultiColumnDataTypeTest, RunMultiColumnDataTypeTestAction} + +trait MultiColumnPKDataTypeSuites + extends MultiColumnDataTypeTest + with RunMultiColumnDataTypeTestAction { + val dataTypes: List[ReflectedDataType] = baseDataTypes + val unsignedDataTypes: List[ReflectedDataType] = List(INT, BIGINT) + val dataTypeTestDir: String = "multi-column-dataType-test-pk" + val database: String = "multi_column_pk_data_type_test" + val testDesc: String = "Base test for multi-column pk data types" + + override val generator = MultiColumnDataTypePKGenerator( + dataTypes, + unsignedDataTypes, + dataTypeTestDir, + database, + testDesc + ) + + def startTest(dataTypes: List[ReflectedDataType], i: Int, j: Int): Unit = { + val dt = List(dataTypes(i), dataTypes(j)) ++ dataTypes + val tableName = generator.getTableName(dt.map(getTypeName): _*) + val typeNames = dt.map(getTypeName) + val columnNames = typeNames.zipWithIndex.map { x => + generator.getColumnNameByOffset(x._2) + } + for (u <- dt.indices) { + val col1 = columnNames(u) + for (v <- u + 1 until dt.size) { + val col2 = columnNames(v) + val dataType = dt(v) + simpleSelect(database, tableName, col1, col2, dataType) + } + } + } + + def check(i: Int, j: Int): Unit = { + if (generateData) { + generator.test(i, j) + } + } + + def test(i: Int, j: Int): Unit = { + startTest(dataTypes, i, j) + } + + val tests: Map[Int, Seq[(Int, Int)]] = { + val size = dataTypes.size - 1 + dataTypes.indices + .flatten { i => + dataTypes.indices + .filter { j => + i != j + } + .map { j => + (i, j) + } + } + .groupBy { + case (i, j) => + (i * size + (if (i > j) j else j - 1)) % 36 + } + .withDefaultValue(Seq.empty[(Int, Int)]) + } + + val currentTest: Seq[(Int, Int)] + + def getId: Int = getClass.getName.substring(getClass.getName.length - 2).toInt + + override def test(): Unit = { + currentTest.foreach { + case (i, j) => + check(i, 
j) + test(i, j) + } + } +} diff --git a/tikv-client/src/main/java/com/pingcap/tikv/row/ObjectRowImpl.java b/tikv-client/src/main/java/com/pingcap/tikv/row/ObjectRowImpl.java index 7e9ba98034..07b0e87042 100644 --- a/tikv-client/src/main/java/com/pingcap/tikv/row/ObjectRowImpl.java +++ b/tikv-client/src/main/java/com/pingcap/tikv/row/ObjectRowImpl.java @@ -190,6 +190,8 @@ public String toString() { builder.append("["); builder.append(KeyUtils.formatBytes(((byte[]) values[i]))); builder.append("]"); + } else if (values[i] instanceof BigDecimal) { + builder.append(((BigDecimal) values[i]).toPlainString()); } else { builder.append(values[i]); } From 93128b8ad66e0cc02df4b8574a29523ef18e6afe Mon Sep 17 00:00:00 2001 From: birdstorm Date: Tue, 6 Aug 2019 18:23:05 +0800 Subject: [PATCH 22/62] add retry for batchGet (#986) --- .../tikv/region/RegionStoreClient.java | 68 ++++++++++++------- .../java/com/pingcap/tikv/MockServerTest.java | 8 ++- 2 files changed, 51 insertions(+), 25 deletions(-) diff --git a/tikv-client/src/main/java/com/pingcap/tikv/region/RegionStoreClient.java b/tikv-client/src/main/java/com/pingcap/tikv/region/RegionStoreClient.java index a270968bb4..bad2e7bcd1 100644 --- a/tikv-client/src/main/java/com/pingcap/tikv/region/RegionStoreClient.java +++ b/tikv-client/src/main/java/com/pingcap/tikv/region/RegionStoreClient.java @@ -38,6 +38,7 @@ import io.grpc.ManagedChannel; import java.util.*; import java.util.function.Supplier; +import java.util.stream.Collectors; import org.apache.log4j.Logger; import org.tikv.kvproto.Coprocessor; import org.tikv.kvproto.Coprocessor.KeyRange; @@ -166,28 +167,50 @@ private boolean isGetSuccess(BackOffer backOffer, GetResponse resp) return true; } - // TODO: batch get should consider key range split - public List batchGet(BackOffer backOffer, Iterable keys, long version) { - Supplier request = - () -> - BatchGetRequest.newBuilder() - .setContext(region.getContext()) - .addAllKeys(keys) - .setVersion(version) - .build(); - KVErrorHandler handler = - new KVErrorHandler<>( - regionManager, - this, - region, - resp -> resp.hasRegionError() ? resp.getRegionError() : null); - BatchGetResponse resp = - callWithRetry(backOffer, TikvGrpc.METHOD_KV_BATCH_GET, request, handler); - return doBatchGet(resp, backOffer); + public List batchGet(BackOffer backOffer, List keys, long version) { + List result = new ArrayList<>(); + while (true) { + // re-split keys + Map> map = + keys.stream().collect(Collectors.groupingBy(regionManager::getRegionByKey)); + boolean ok = true; + for (Map.Entry> entry : map.entrySet()) { + TiRegion newRegion = entry.getKey(); + if (!newRegion.equals(region)) { + RegionStoreClient newRegionStoreClient = + new RegionStoreClientBuilder(conf, this.channelFactory, this.regionManager) + .build(newRegion); + result.addAll(newRegionStoreClient.batchGet(backOffer, entry.getValue(), version)); + } else { + Supplier request = + () -> + BatchGetRequest.newBuilder() + .setContext(region.getContext()) + .addAllKeys(entry.getValue()) + .setVersion(version) + .build(); + KVErrorHandler handler = + new KVErrorHandler<>( + regionManager, + this, + region, + resp -> resp.hasRegionError() ? 
resp.getRegionError() : null); + BatchGetResponse resp = + callWithRetry(backOffer, TikvGrpc.METHOD_KV_BATCH_GET, request, handler); + if (isBatchGetSuccess(backOffer, resp)) { + result.addAll(resp.getPairsList()); + } else { + ok = false; + } + } + } + if (ok) { + return result; + } + } } - // TODO: deal with resolve locks and region errors - private List doBatchGet(BatchGetResponse resp, BackOffer bo) { + private boolean isBatchGetSuccess(BackOffer bo, BatchGetResponse resp) { if (resp == null) { this.regionManager.onRequestFail(region); throw new TiClientInternalException("BatchGetResponse failed without a cause"); @@ -213,11 +236,10 @@ private List doBatchGet(BatchGetResponse resp, BackOffer bo) { if (!ok) { // if not resolve all locks, we wait and retry bo.doBackOff(BoTxnLockFast, new KeyException((resp.getPairsList().get(0).getError()))); + return false; } - - // FIXME: we should retry } - return resp.getPairsList(); + return true; } public List scan( diff --git a/tikv-client/src/test/java/com/pingcap/tikv/MockServerTest.java b/tikv-client/src/test/java/com/pingcap/tikv/MockServerTest.java index c8160aae72..ec1fc2552e 100644 --- a/tikv-client/src/test/java/com/pingcap/tikv/MockServerTest.java +++ b/tikv-client/src/test/java/com/pingcap/tikv/MockServerTest.java @@ -19,7 +19,6 @@ import com.pingcap.tikv.region.TiRegion; import java.io.IOException; import org.junit.Before; -import org.tikv.kvproto.Kvrpcpb; import org.tikv.kvproto.Metapb; import org.tikv.kvproto.Pdpb; @@ -42,7 +41,12 @@ public void setUp() throws IOException { .addPeers(Metapb.Peer.newBuilder().setId(11).setStoreId(13)) .build(); - region = new TiRegion(r, r.getPeers(0), Kvrpcpb.IsolationLevel.RC, Kvrpcpb.CommandPri.Low); + region = + new TiRegion( + r, + r.getPeers(0), + session.getConf().getIsolationLevel(), + session.getConf().getCommandPriority()); pdServer.addGetRegionResp(Pdpb.GetRegionResponse.newBuilder().setRegion(r).build()); server = new KVMockServer(); port = server.start(region); From 102894acb3d5fe8f3ff30e84ea1741e095dc27ec Mon Sep 17 00:00:00 2001 From: Liangliang Gu Date: Wed, 7 Aug 2019 14:01:32 +0800 Subject: [PATCH 23/62] use tispark self-made m2 cahce file (#990) --- .ci/build.groovy | 2 +- .ci/integration_test.groovy | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/.ci/build.groovy b/.ci/build.groovy index 00b7392e18..cbcfd228be 100644 --- a/.ci/build.groovy +++ b/.ci/build.groovy @@ -13,7 +13,7 @@ def call(ghprbActualCommit, ghprbPullId, ghprbPullTitle, ghprbPullLink, ghprbPul stage('Checkout') { dir("/home/jenkins/git/tispark") { sh """ - archive_url=http://172.16.30.25/download/builds/pingcap/tiflash/cache/tiflash-m2-cache_latest.tar.gz + archive_url=http://fileserver.pingcap.net/download/builds/pingcap/tispark/cache/tispark-m2-cache-latest.tar.gz if [ ! 
"\$(ls -A /maven/.m2/repository)" ]; then curl -sL \$archive_url | tar -zx -C /maven || true; fi """ if (sh(returnStatus: true, script: '[ -d .git ] && [ -f Makefile ] && git rev-parse --git-dir > /dev/null 2>&1') != 0) { diff --git a/.ci/integration_test.groovy b/.ci/integration_test.groovy index 2fd65a734f..2a0e862570 100644 --- a/.ci/integration_test.groovy +++ b/.ci/integration_test.groovy @@ -136,7 +136,7 @@ def call(ghprbActualCommit, ghprbCommentBody, ghprbPullId, ghprbPullTitle, ghprb print run_chunks def mvnStr = get_mvn_str(run_chunks) sh """ - archive_url=http://172.16.30.25/download/builds/pingcap/tiflash/cache/tiflash-m2-cache_latest.tar.gz + archive_url=http://fileserver.pingcap.net/download/builds/pingcap/tispark/cache/tispark-m2-cache-latest.tar.gz if [ ! "\$(ls -A /maven/.m2/repository)" ]; then curl -sL \$archive_url | tar -zx -C /maven || true; fi """ sh """ @@ -150,7 +150,7 @@ def call(ghprbActualCommit, ghprbCommentBody, ghprbPullId, ghprbPullTitle, ghprb def run_tikvclient_test = { chunk_suffix -> dir("go/src/github.com/pingcap/tispark") { sh """ - archive_url=http://172.16.30.25/download/builds/pingcap/tiflash/cache/tiflash-m2-cache_latest.tar.gz + archive_url=http://fileserver.pingcap.net/download/builds/pingcap/tispark/cache/tispark-m2-cache-latest.tar.gz if [ ! "\$(ls -A /maven/.m2/repository)" ]; then curl -sL \$archive_url | tar -zx -C /maven || true; fi """ sh """ From b5d339cb409517aa14fee972adab3fef628e52ba Mon Sep 17 00:00:00 2001 From: Liangliang Gu Date: Wed, 7 Aug 2019 15:16:30 +0800 Subject: [PATCH 24/62] add spark sql document for batch write (#991) --- docs/datasource_api_userguide.md | 48 ++++++++++++++++++++++++++++++++ 1 file changed, 48 insertions(+) diff --git a/docs/datasource_api_userguide.md b/docs/datasource_api_userguide.md index 6444c0556a..0b4ec505d8 100644 --- a/docs/datasource_api_userguide.md +++ b/docs/datasource_api_userguide.md @@ -216,6 +216,54 @@ df.write .save() ``` +## Use Data Source API in SparkSQL +Config tidb/pd address and enable write through SparkSQL in `conf/spark-defaults.conf` as follows: + +``` +spark.tispark.pd.addresses 127.0.0.1:2379 +spark.tispark.tidb.addr 127.0.0.1 +spark.tispark.tidb.port 4000 +spark.tispark.tidb.user root +spark.tispark.tidb.password password +spark.tispark.write.allow_spark_sql true +``` + +create a new table using mysql-client: +``` +CREATE TABLE tpch_test.TARGET_TABLE_CUSTOMER ( + `C_CUSTKEY` int(11) NOT NULL, + `C_NAME` varchar(25) NOT NULL, + `C_ADDRESS` varchar(40) NOT NULL, + `C_NATIONKEY` int(11) NOT NULL, + `C_PHONE` char(15) NOT NULL, + `C_ACCTBAL` decimal(15,2) NOT NULL, + `C_MKTSEGMENT` char(10) NOT NULL, + `C_COMMENT` varchar(117) NOT NULL +) +``` + +register a tidb table `tpch_test.CUSTOMER` to spark catalog: +``` +CREATE TABLE CUSTOMER_SRC USING tidb OPTIONS (database 'tpch_test', table 'CUSTOMER') +``` + +select data from `tpch_test.CUSTOMER`: +``` +SELECT * FROM CUSTOMER_SRC limit 10 +``` + +register another tidb table `tpch_test.TARGET_TABLE_CUSTOMER` to spark catalog: +``` +CREATE TABLE CUSTOMER_DST USING tidb OPTIONS (database 'tpch_test', table 'TARGET_TABLE_CUSTOMER') +``` + +write data to `tpch_test.TARGET_TABLE_CUSTOMER`: +``` +INSERT INTO CUSTOMER_DST VALUES(1000, 'Customer#000001000', 'AnJ5lxtLjioClr2khl9pb8NLxG2', 9, '19-407-425-2584', 2209.81, 'AUTOMOBILE', '. even, express theodolites upo') + +INSERT INTO CUSTOMER_DST SELECT * FROM CUSTOMER_SRC +``` + ## TiDB Options The following is TiDB-specific options, which can be passed in through `TiDBOptions` or `SparkConf`. 
From aa21adb1389e26830e2359d43b5dbdc1c98209c4 Mon Sep 17 00:00:00 2001 From: Liangliang Gu Date: Wed, 7 Aug 2019 18:03:54 +0800 Subject: [PATCH 25/62] add auto mode for test.data.load (#994) --- core/src/test/Readme.md | 3 +- .../resources/tidb_config.properties.template | 3 +- .../spark/sql/test/SharedSQLContext.scala | 58 ++++++++++++++++--- 3 files changed, 54 insertions(+), 10 deletions(-) diff --git a/core/src/test/Readme.md b/core/src/test/Readme.md index fb18a54bf3..771616bf0d 100644 --- a/core/src/test/Readme.md +++ b/core/src/test/Readme.md @@ -20,7 +20,8 @@ spark.tispark.pd.addresses=127.0.0.1:2379 # Whether to allow index read in tests, you must set this to true to run index tests. spark.tispark.plan.allow_index_read=true # Whether to load test data before running tests. If you haven't load tispark_test or tpch_test data, set this to true. The next time you run tests, you can set this to false. -test.data.load=false +# If you do not want the change this value, please set it to auto, the test data will be loaded only if it does not exist in tidb. +test.data.load=auto # DB prefix for tidb tables in case it conflicts with hive database spark.tispark.db_prefix=tidb_ ``` \ No newline at end of file diff --git a/core/src/test/resources/tidb_config.properties.template b/core/src/test/resources/tidb_config.properties.template index d7d4cfcdd1..2babbfcd7f 100644 --- a/core/src/test/resources/tidb_config.properties.template +++ b/core/src/test/resources/tidb_config.properties.template @@ -17,7 +17,8 @@ # Whether to allow index read in tests, you must set this to true to run index tests. # spark.tispark.plan.allow_index_read=true # Whether to load test data before running tests. If you haven't load tispark_test or tpch_test data, set this to true. The next time you run tests, you can set this to false. -# test.data.load=true +# If you do not want the change this value, please set it to auto, the test data will be loaded only if it does not exist in tidb. +# test.data.load=auto # Whether to generate test data. Enabling test data generation may change data of all tests. # test.data.generate=true # DB prefix for tidb databases in case it conflicts with hive database diff --git a/core/src/test/scala/org/apache/spark/sql/test/SharedSQLContext.scala b/core/src/test/scala/org/apache/spark/sql/test/SharedSQLContext.scala index 3c86c437c7..8e7deb61f8 100644 --- a/core/src/test/scala/org/apache/spark/sql/test/SharedSQLContext.scala +++ b/core/src/test/scala/org/apache/spark/sql/test/SharedSQLContext.scala @@ -18,7 +18,7 @@ package org.apache.spark.sql.test import java.io.File -import java.sql.{Connection, Statement} +import java.sql.{Connection, Date, Statement} import java.util.{Locale, Properties, TimeZone} import com.pingcap.tispark.TiConfigConst.PD_ADDRESSES @@ -34,6 +34,8 @@ import org.scalatest.BeforeAndAfterAll import org.scalatest.concurrent.Eventually import org.slf4j.Logger +import scala.collection.mutable.ArrayBuffer + /** * This trait manages basic TiSpark, Spark JDBC, TiDB JDBC * connection resource and relevant configurations. 
@@ -290,20 +292,60 @@ object SharedSQLContext extends Logging { _statement = _tidbConnection.createStatement() } + private def queryTiDBViaJDBC(query: String): List[List[Any]] = { + val resultSet = tidbStmt.executeQuery(query) + val rsMetaData = resultSet.getMetaData + val retSet = ArrayBuffer.empty[List[Any]] + val retSchema = ArrayBuffer.empty[String] + for (i <- 1 to rsMetaData.getColumnCount) { + retSchema += rsMetaData.getColumnTypeName(i) + } + while (resultSet.next()) { + val row = ArrayBuffer.empty[Any] + + for (i <- 1 to rsMetaData.getColumnCount) { + row += toOutput(resultSet.getObject(i), retSchema(i - 1)) + } + retSet += row.toList + } + retSet.toList + } + + private def toOutput(value: Any, colType: String): Any = value match { + case _: BigDecimal => + value.asInstanceOf[BigDecimal].setScale(2, BigDecimal.RoundingMode.HALF_UP) + case _: Date if colType.equalsIgnoreCase("YEAR") => + value.toString.split("-")(0) + case default => + default + } + + private def shouldLoadData(loadData: String): Boolean = { + if ("true".equals(loadData)) { + true + } else if ("auto".equals(loadData)) { + val databases = queryTiDBViaJDBC("show databases").map(a => a.head) + if (databases.contains("tispark_test") && databases.contains("tpch_test") && databases + .contains("resolveLock_test")) { + false + } else { + true + } + } else { + false + } + } + private def initializeTiDBConnection(forceNotLoad: Boolean = false): Unit = if (_tidbConnection == null) { initializeJDBCUrl() - val loadData = getOrElse(_tidbConf, SHOULD_LOAD_DATA, "true").toLowerCase.toBoolean + val loadData = getOrElse(_tidbConf, SHOULD_LOAD_DATA, "auto").toLowerCase - if (loadData) { - logger.info("load data is enabled") - } else { - logger.info("load data is disabled") - } + logger.info(s"load data is mode: $loadData") - if (loadData && !forceNotLoad) { + if (shouldLoadData(loadData) && !forceNotLoad) { logger.info("Loading TiSparkTestData") // Load index test data loadSQLFile("tispark-test", "IndexTest") From 465911fc53a10cf92e00e7b701e0e3a76e346864 Mon Sep 17 00:00:00 2001 From: birdstorm Date: Thu, 8 Aug 2019 10:51:18 +0800 Subject: [PATCH 26/62] fix typo (#996) --- CHANGELOG.md | 2 +- .../com/pingcap/tispark/BasicExpression.scala | 4 ++-- .../com/pingcap/tispark/TiBatchWrite.scala | 4 ++-- .../com/pingcap/tispark/TiConfigConst.scala | 1 - .../spark/sql/execution/CoprocessorRDD.scala | 14 +++++++------- .../apache/spark/sql/PartitionTableSuite.scala | 2 +- docs/datasource_api_userguide.md | 2 +- docs/userguide.md | 18 +++++++++--------- docs/userguide_spark2.1.md | 8 ++++---- python/README_spark2.1.md | 2 +- tikv-client/README.md | 2 +- .../main/java/com/pingcap/tikv/PDClient.java | 6 +++--- .../com/pingcap/tikv/TwoPhaseCommitter.java | 2 +- .../pingcap/tikv/allocator/RowIDAllocator.java | 5 +---- .../java/com/pingcap/tikv/codec/MyDecimal.java | 6 +++--- .../tikv/expression/ExpressionBlacklist.java | 2 +- .../visitor/ExpressionTypeCoercer.java | 4 ++-- .../visitor/PrunedPartitionBuilder.java | 2 +- .../visitor/SupportedExpressionValidator.java | 2 +- .../java/com/pingcap/tikv/meta/TiViewInfo.java | 2 +- .../tikv/predicates/TiKVScanAnalyzer.java | 8 ++++---- .../pingcap/tikv/region/RegionStoreClient.java | 2 +- .../java/com/pingcap/tikv/region/TiRegion.java | 8 ++++---- .../java/com/pingcap/tikv/types/Converter.java | 5 ++--- 24 files changed, 54 insertions(+), 59 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 98607cc165..67a4910775 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -11,7 +11,7 @@ All notable changes 
to this project will be documented in this file. ## [TiSpark 2.1.2] 2019-07-29 ### Fixes * Fix improper response with region error [#922](https://github.com/pingcap/tispark/pull/922) -* Fix view parseing problem [#953](https://github.com/pingcap/tispark/pull/953) +* Fix view parsing problem [#953](https://github.com/pingcap/tispark/pull/953) ## [TiSpark 1.2.1] ### Fixes diff --git a/core/src/main/scala/com/pingcap/tispark/BasicExpression.scala b/core/src/main/scala/com/pingcap/tispark/BasicExpression.scala index ff0cb5c3e2..3b45c2e501 100644 --- a/core/src/main/scala/com/pingcap/tispark/BasicExpression.scala +++ b/core/src/main/scala/com/pingcap/tispark/BasicExpression.scala @@ -136,8 +136,8 @@ object BasicExpression { // Some(TiExpr.create().setValue(attr.name).toProto) Some(ColumnRef.create(attr.name)) - case uattr: UnresolvedAttribute => - Some(ColumnRef.create(uattr.name)) + case uAttr: UnresolvedAttribute => + Some(ColumnRef.create(uAttr.name)) // TODO: Remove it and let it fail once done all translation case _ => Option.empty[TiExpression] diff --git a/core/src/main/scala/com/pingcap/tispark/TiBatchWrite.scala b/core/src/main/scala/com/pingcap/tispark/TiBatchWrite.scala index 5a2b737cc2..7b0921dc37 100644 --- a/core/src/main/scala/com/pingcap/tispark/TiBatchWrite.scala +++ b/core/src/main/scala/com/pingcap/tispark/TiBatchWrite.scala @@ -748,7 +748,7 @@ class TiBatchWrite(@transient val df: DataFrame, estimateRegionSplitNumForIndex(wrappedRowRdd, index) } } - val sampledDataRdds = + val sampledDataRDDs = regionSplitNums.map { num => wrappedRowRdd.takeSample(withReplacement = false, num = num.toInt) }.toList @@ -758,7 +758,7 @@ class TiBatchWrite(@transient val df: DataFrame, val idx = indexWithIdx._2 val indexCols = index.getIndexColumns - val splitIndicesList = sampledDataRdds(idx) + val splitIndicesList = sampledDataRDDs(idx) .map { value => val colBuffer = mutable.ListBuffer.empty[String] for (i <- 0 until indexCols.size()) { diff --git a/core/src/main/scala/com/pingcap/tispark/TiConfigConst.scala b/core/src/main/scala/com/pingcap/tispark/TiConfigConst.scala index f2bb9ebd48..810618dadb 100644 --- a/core/src/main/scala/com/pingcap/tispark/TiConfigConst.scala +++ b/core/src/main/scala/com/pingcap/tispark/TiConfigConst.scala @@ -25,7 +25,6 @@ object TiConfigConst { val INDEX_SCAN_CONCURRENCY: String = "spark.tispark.index.scan_concurrency" val TABLE_SCAN_CONCURRENCY: String = "spark.tispark.table.scan_concurrency" val ALLOW_AGG_PUSHDOWN: String = "spark.tispark.plan.allow_agg_pushdown" - val PD_CLUSTER_ID: String = "spark.tispark.pd.clusterid" val REQUEST_COMMAND_PRIORITY: String = "spark.tispark.request.command.priority" val REQUEST_ISOLATION_LEVEL: String = "spark.tispark.request.isolation.level" val ALLOW_INDEX_READ: String = "spark.tispark.plan.allow_index_read" diff --git a/core/src/main/scala/org/apache/spark/sql/execution/CoprocessorRDD.scala b/core/src/main/scala/org/apache/spark/sql/execution/CoprocessorRDD.scala index bfb6cd8037..e5074084b0 100644 --- a/core/src/main/scala/org/apache/spark/sql/execution/CoprocessorRDD.scala +++ b/core/src/main/scala/org/apache/spark/sql/execution/CoprocessorRDD.scala @@ -44,7 +44,7 @@ import org.tikv.kvproto.Coprocessor.KeyRange import scala.collection.JavaConversions._ import scala.collection.mutable -case class CoprocessorRDD(output: Seq[Attribute], tiRdds: List[TiRDD]) extends LeafExecNode { +case class CoprocessorRDD(output: Seq[Attribute], tiRDDs: List[TiRDD]) extends LeafExecNode { override lazy val metrics: Map[String, SQLMetric] = 
Map( "numOutputRows" -> SQLMetrics.createMetric(sparkContext, "number of output rows") @@ -55,7 +55,7 @@ case class CoprocessorRDD(output: Seq[Attribute], tiRdds: List[TiRDD]) extends L override val outputOrdering: Seq[SortOrder] = Nil private val internalRDDs: List[RDD[InternalRow]] = - tiRdds.map(rdd => RDDConversions.rowToRowRdd(rdd, output.map(_.dataType))) + tiRDDs.map(rdd => RDDConversions.rowToRowRdd(rdd, output.map(_.dataType))) private lazy val project = UnsafeProjection.create(schema) private def internalRowToUnsafeRowWithIndex( @@ -84,13 +84,13 @@ case class CoprocessorRDD(output: Seq[Attribute], tiRdds: List[TiRDD]) extends L } - def dagRequest: TiDAGRequest = tiRdds.head.dagRequest + def dagRequest: TiDAGRequest = tiRDDs.head.dagRequest override def verboseString: String = - if (tiRdds.size > 1) { + if (tiRDDs.size > 1) { val b = new StringBuilder b.append(s"TiSpark $nodeName on partition table:\n") - tiRdds.zipWithIndex.map { + tiRDDs.zipWithIndex.map { case (_, i) => b.append(s"partition p$i") } b.append(s"with dag request: $dagRequest") @@ -395,7 +395,7 @@ case class RegionTaskExec(child: SparkPlan, } } - val schemaInferer: SchemaInfer = if (satisfyDowngradeThreshold) { + val schemaInferrer: SchemaInfer = if (satisfyDowngradeThreshold) { // Should downgrade to full table scan for one region logger.info( s"Index scan task range size = ${indexTaskRanges.size}, " + @@ -410,7 +410,7 @@ case class RegionTaskExec(child: SparkPlan, SchemaInfer.create(dagRequest) } - val rowTransformer: RowTransformer = schemaInferer.getRowTransformer + val rowTransformer: RowTransformer = schemaInferrer.getRowTransformer val outputTypes = output.map(_.dataType) val converters = outputTypes.map(CatalystTypeConverters.createToCatalystConverter) diff --git a/core/src/test/scala/org/apache/spark/sql/PartitionTableSuite.scala b/core/src/test/scala/org/apache/spark/sql/PartitionTableSuite.scala index 4d28e2b13b..6d7a1bea8d 100644 --- a/core/src/test/scala/org/apache/spark/sql/PartitionTableSuite.scala +++ b/core/src/test/scala/org/apache/spark/sql/PartitionTableSuite.scala @@ -132,7 +132,7 @@ class PartitionTableSuite extends BaseTiSparkTest { if (copRDD.isDefined) { copRDD.get .asInstanceOf[CoprocessorRDD] - .tiRdds(0) + .tiRDDs(0) .dagRequest } else { regionTaskExec.get diff --git a/docs/datasource_api_userguide.md b/docs/datasource_api_userguide.md index 0b4ec505d8..3515a506c6 100644 --- a/docs/datasource_api_userguide.md +++ b/docs/datasource_api_userguide.md @@ -33,7 +33,7 @@ If `replace` is true, then * if no same primary key or unique index exists, data will be inserted. If `replace` is false, then -* if primary key or unique index exists in db, data having conflicts expects an expection. +* if primary key or unique index exists in db, data having conflicts expects an exception. * if no same primary key or unique index exists, data will be inserted. ## Using the Spark Connector With Extensions Enabled diff --git a/docs/userguide.md b/docs/userguide.md index 4ce2f7f241..8d0b7bd097 100644 --- a/docs/userguide.md +++ b/docs/userguide.md @@ -39,7 +39,7 @@ For independent deployment of TiKV and TiSpark, it is recommended to refer to th + Hardware configuration - For general purposes, please refer to the TiDB and TiKV hardware configuration [recommendations](https://github.com/pingcap/docs/blob/master/op-guide/recommendation.md#deployment-recommendations). - - If the usage is more focused on the analysis scenarios, you can increase the memory of the TiKV nodes to at least 64G. 
If using Hard Disk Drive (HDD), it is recommended to use at least 8 disks. + - If the usage is more focused on the analysis scenarios, you can increase the memory of the TiKV nodes to at least 64G. If using Hard Disk Drive (HDD), it is recommended to use at least 8 disks. + TiKV parameters (default) @@ -77,7 +77,7 @@ Please refer to the [Spark official website](https://spark.apache.org/docs/lates The following is a short overview of the TiSpark configuration. -Generally, it is recommended to allocat 32G memory for Spark. Please reserve at least 25% of the memory for the operating system and buffer cache. +Generally, it is recommended to allocate 32G memory for Spark. Please reserve at least 25% of the memory for the operating system and buffer cache. It is recommended to provision at least 8 to 16 cores on per machine for Spark. Initially, you can assign all the CPU cores to Spark. @@ -100,7 +100,7 @@ For example, `10.16.20.1:2379,10.16.20.2:2379,10.16.20.3:2379` when you have mul #### Hybrid deployment configuration for the TiSpark and TiKV cluster -For the hybrid deployment of TiSpark and TiKV, add the TiSpark required resources to the TiKV reserved resources, and allocate 25% of the memory for the system. +For the hybrid deployment of TiSpark and TiKV, add the TiSpark required resources to the TiKV reserved resources, and allocate 25% of the memory for the system. ## Deploy TiSpark @@ -121,7 +121,7 @@ If you want to deploy TiSpark as a default component, simply place the TiSpark j cp $your_path_to/tispark-${name_with_version}.jar $SPARK_HOME/jars ``` -In this way, you can use either `Spark-Submit` or `Spark-Shell` to use TiSpark directly. +In this way, you can use either `Spark-Submit` or `Spark-Shell` to use TiSpark directly. ### Deploy TiSpark without the Spark cluster @@ -152,7 +152,7 @@ cd $SPARKPATH ./sbin/start-master.sh ``` -After the above step is completed, a log file will be printed on the screen. Check the log file to confirm whether the Spark-Master is started successfully. You can open the [http://spark-master-hostname:8080](http://spark-master-hostname:8080) to view the cluster information (if you does not change the Spark-Master default port number). When you start Spark-Slave, you can also use this panel to confirm whether the Slave is joined to the cluster. +After the above step is completed, a log file will be printed on the screen. Check the log file to confirm whether the Spark-Master is started successfully. You can open the [http://spark-master-hostname:8080](http://spark-master-hostname:8080) to view the cluster information (if you does not change the Spark-Master default port number). When you start Spark-Slave, you can also use this panel to confirm whether the Slave is joined to the cluster. #### Starting a Slave node @@ -212,7 +212,7 @@ spark-sql> select count(*) from lineitem; Time taken: 0.673 seconds, Fetched 1 row(s) ``` -For JDBC connection with Thrift Server, you can try it with various JDBC supported tools including SQuirreLSQL and hive-beeline. +For JDBC connection with Thrift Server, you can try it with various JDBC supported tools including SQuirreL SQL and hive-beeline. 
For example, to use it with beeline: ``` ./beeline @@ -252,8 +252,8 @@ tisparkDF.write.saveAsTable("hive_table") // save table to hive spark.sql("select * from hive_table a, tispark_table b where a.col1 = b.col1").show // join table across Hive and Tispark ``` -## Load Spark Dataframe into TiDB using JDBC -TiSpark does not provide a direct way to load data into yout TiDB cluster, but you can still load using jdbc like this: +## Load Spark DataFrame into TiDB using JDBC +TiSpark does not provide a direct way to load data into your TiDB cluster, but you can still load using jdbc like this: ```scala import org.apache.spark.sql.execution.datasources.jdbc.JDBCOptions @@ -299,7 +299,7 @@ Currently, only range partition table is limited supported. If partition express rather than `year` then partition pruning will not be applied. Such scan can be considered full table scan if there is no index in the schema. ## Common Port numbers used by Spark Cluster -|Port Name| Default Value Port Number | Configuration Property | Nots| +|Port Name| Default Value Port Number | Configuration Property | Notes| |---------------| ------------- |-----|-----| |Master web UI | 8080 | spark.master.ui.port or SPARK_MASTER_WEBUI_PORT| The value set by the spark.master.ui.port property takes precedence. | |Worker web UI | 8081| spark.worker.ui.port or SPARK_WORKER_WEBUI_PORT | The value set by the spark.worker.ui.port takes precedence.| diff --git a/docs/userguide_spark2.1.md b/docs/userguide_spark2.1.md index 79d6ec3241..9be6de4b72 100644 --- a/docs/userguide_spark2.1.md +++ b/docs/userguide_spark2.1.md @@ -77,7 +77,7 @@ Please refer to the [Spark official website](https://spark.apache.org/docs/lates The following is a short overview of the TiSpark configuration. -Generally, it is recommended to allocat 32G memory for Spark. Please reserve at least 25% of the memory for the operating system and buffer cache. +Generally, it is recommended to allocate 32G memory for Spark. Please reserve at least 25% of the memory for the operating system and buffer cache. It is recommended to provision at least 8 to 16 cores on per machine for Spark. Initially, you can assign all the CPU cores to Spark. @@ -91,7 +91,7 @@ SPARK_WORKER_CORES = 8 #### Hybrid deployment configuration for the TiSpark and TiKV cluster -For the hybrid deployment of TiSpark and TiKV, add the TiSpark required resources to the TiKV reserved resources, and allocate 25% of the memory for the system. +For the hybrid deployment of TiSpark and TiKV, add the TiSpark required resources to the TiKV reserved resources, and allocate 25% of the memory for the system. ## Deploy TiSpark @@ -113,7 +113,7 @@ $ {SPARK_INSTALL_PATH} / jars ``` -In this way, you can use either `Spark-Submit` or `Spark-Shell` to use TiSpark directly. +In this way, you can use either `Spark-Submit` or `Spark-Shell` to use TiSpark directly. ### Deploy TiSpark without the Spark cluster @@ -140,7 +140,7 @@ cd $ SPARKPATH ./sbin/start-master.sh ``` -After the above step is completed, a log file will be printed on the screen. Check the log file to confirm whether the Spark-Master is started successfully. You can open the [http://spark-master-hostname:8080](http://spark-master-hostname:8080) to view the cluster information (if you does not change the Spark-Master default port number). When you start Spark-Slave, you can also use this panel to confirm whether the Slave is joined to the cluster. +After the above step is completed, a log file will be printed on the screen. 
Check the log file to confirm whether the Spark-Master is started successfully. You can open the [http://spark-master-hostname:8080](http://spark-master-hostname:8080) to view the cluster information (if you does not change the Spark-Master default port number). When you start Spark-Slave, you can also use this panel to confirm whether the Slave is joined to the cluster. #### Starting a Slave node diff --git a/python/README_spark2.1.md b/python/README_spark2.1.md index 2ec73823a3..9ae84624c7 100644 --- a/python/README_spark2.1.md +++ b/python/README_spark2.1.md @@ -17,7 +17,7 @@ This is the simplest way, just a decent Spark environment should be enough. from py4j.java_gateway import java_import from pyspark.context import SparkContext -# We get a referenct to py4j Java Gateway +# We get a reference to py4j Java Gateway gw = SparkContext._gateway java_import(gw.jvm, "org.apache.spark.sql.TiContext") diff --git a/tikv-client/README.md b/tikv-client/README.md index 1ecda8e341..6472b143f8 100644 --- a/tikv-client/README.md +++ b/tikv-client/README.md @@ -5,7 +5,7 @@ It is supposed to: + Communicate via [gRPC](http://www.grpc.io/) + Talk to Placement Driver searching for a region + Talk to TiKV for reading/writing data and the resulted data is encoded/decoded just like what we do in TiDB. -+ Talk to Coprocessor for calculation pushdown ++ Talk to Coprocessor for calculation push down ## How to build diff --git a/tikv-client/src/main/java/com/pingcap/tikv/PDClient.java b/tikv-client/src/main/java/com/pingcap/tikv/PDClient.java index 7c69f12c38..cfdb72aaf9 100644 --- a/tikv-client/src/main/java/com/pingcap/tikv/PDClient.java +++ b/tikv-client/src/main/java/com/pingcap/tikv/PDClient.java @@ -163,7 +163,7 @@ public Future getRegionByIDAsync(BackOffer backOffer, long id) { return responseObserver.getFuture(); } - private Supplier buildGetStroeReq(long storeId) { + private Supplier buildGetStoreReq(long storeId) { return () -> GetStoreRequest.newBuilder().setHeader(header).setStoreId(storeId).build(); } @@ -175,7 +175,7 @@ private PDErrorHandler buildPDErrorHandler() { @Override public Store getStore(BackOffer backOffer, long storeId) { return callWithRetry( - backOffer, PDGrpc.METHOD_GET_STORE, buildGetStroeReq(storeId), buildPDErrorHandler()) + backOffer, PDGrpc.METHOD_GET_STORE, buildGetStoreReq(storeId), buildPDErrorHandler()) .getStore(); } @@ -187,7 +187,7 @@ public Future getStoreAsync(BackOffer backOffer, long storeId) { callAsyncWithRetry( backOffer, PDGrpc.METHOD_GET_STORE, - buildGetStroeReq(storeId), + buildGetStoreReq(storeId), responseObserver, buildPDErrorHandler()); return responseObserver.getFuture(); diff --git a/tikv-client/src/main/java/com/pingcap/tikv/TwoPhaseCommitter.java b/tikv-client/src/main/java/com/pingcap/tikv/TwoPhaseCommitter.java index 021a7dcdd4..b6fe4ed224 100644 --- a/tikv-client/src/main/java/com/pingcap/tikv/TwoPhaseCommitter.java +++ b/tikv-client/src/main/java/com/pingcap/tikv/TwoPhaseCommitter.java @@ -329,7 +329,7 @@ private void doPrewriteSecondaryKeysInBatchesWithRetry( } LOG.debug( String.format( - "oldRegion=%s != currentRegion=%s, will refetch region info and retry", + "oldRegion=%s != currentRegion=%s, will re-fetch region info and retry", oldRegion, currentRegion)); retryPrewriteBatch(backOffer, primaryKey, batchKeys, mutations, level <= 0 ? 
1 : level + 1); } diff --git a/tikv-client/src/main/java/com/pingcap/tikv/allocator/RowIDAllocator.java b/tikv-client/src/main/java/com/pingcap/tikv/allocator/RowIDAllocator.java index 04097698c5..f681eea220 100644 --- a/tikv-client/src/main/java/com/pingcap/tikv/allocator/RowIDAllocator.java +++ b/tikv-client/src/main/java/com/pingcap/tikv/allocator/RowIDAllocator.java @@ -136,10 +136,7 @@ private long updateHash( private boolean isDBExisted(long dbId, Snapshot snapshot) { ByteString dbKey = MetaCodec.encodeDatabaseID(dbId); ByteString json = MetaCodec.hashGet(MetaCodec.KEY_DBs, dbKey, snapshot); - if (json == null || json.isEmpty()) { - return false; - } - return true; + return json != null && !json.isEmpty(); } private boolean isTableExisted(long dbId, long tableId, Snapshot snapshot) { diff --git a/tikv-client/src/main/java/com/pingcap/tikv/codec/MyDecimal.java b/tikv-client/src/main/java/com/pingcap/tikv/codec/MyDecimal.java index cacd743b75..69cbd5455a 100644 --- a/tikv-client/src/main/java/com/pingcap/tikv/codec/MyDecimal.java +++ b/tikv-client/src/main/java/com/pingcap/tikv/codec/MyDecimal.java @@ -240,7 +240,7 @@ private int[] removeLeadingZeros() { } /** - * Counts the number of digits of prefix zeors. For 00.001, it reutrns two. + * Counts the number of digits of prefix zeroes. For 00.001, it returns two. * * @param i i is index for getting powers10. * @param word word is a integer. @@ -598,10 +598,10 @@ private int decimalBinSize(int precision, int frac) { * *

This binary format is as follows: 1. First the number is converted to have a requested * precision and frac. 2. Every full digitsPerWord digits of digitsInt part are stored in 4 bytes - * as is 3. The first digitsInt % digitesPerWord digits are stored in the reduced number of bytes + * as is 3. The first digitsInt % digitsPerWord digits are stored in the reduced number of bytes * (enough bytes to store this number of digits - see dig2bytes) 4. same for frac - full word are * stored as is, the last frac % digitsPerWord digits - in the reduced number of bytes. 5. If the - * number is negative - every byte is inversed. 5. The very first bit of the resulting byte array + * number is negative - every byte is inverted. 5. The very first bit of the resulting byte array * is inverted (because memcmp compares unsigned bytes, see property 2 above) * *

Example: diff --git a/tikv-client/src/main/java/com/pingcap/tikv/expression/ExpressionBlacklist.java b/tikv-client/src/main/java/com/pingcap/tikv/expression/ExpressionBlacklist.java index e532de33dc..015f91c1df 100644 --- a/tikv-client/src/main/java/com/pingcap/tikv/expression/ExpressionBlacklist.java +++ b/tikv-client/src/main/java/com/pingcap/tikv/expression/ExpressionBlacklist.java @@ -21,7 +21,7 @@ public ExpressionBlacklist(String exprsString) { super(exprsString); } - public boolean isUnsupportedPushdownExpr(Class cls) { + public boolean isUnsupportedPushDownExpr(Class cls) { return isUnsupported(cls); } } diff --git a/tikv-client/src/main/java/com/pingcap/tikv/expression/visitor/ExpressionTypeCoercer.java b/tikv-client/src/main/java/com/pingcap/tikv/expression/visitor/ExpressionTypeCoercer.java index 948516dc84..a4272b6b91 100644 --- a/tikv-client/src/main/java/com/pingcap/tikv/expression/visitor/ExpressionTypeCoercer.java +++ b/tikv-client/src/main/java/com/pingcap/tikv/expression/visitor/ExpressionTypeCoercer.java @@ -40,7 +40,7 @@ public class ExpressionTypeCoercer extends Visitor, DataT private static final double CONSTANT_CRED = MIN_CREDIBILITY; private static final double LOGICAL_OP_CRED = MAX_CREDIBILITY; private static final double COMPARISON_OP_CRED = MAX_CREDIBILITY; - private static final double SRING_REG_OP_CRED = MAX_CREDIBILITY; + private static final double STRING_REG_OP_CRED = MAX_CREDIBILITY; private static final double FUNCTION_CRED = MAX_CREDIBILITY; private static final double ISNULL_CRED = MAX_CREDIBILITY; private static final double NOT_CRED = MAX_CREDIBILITY; @@ -126,7 +126,7 @@ protected Pair visit(StringRegExpression node, DataType target coerceType(null, node.getLeft(), node.getRight()); typeMap.put(node, IntegerType.BOOLEAN); } - return Pair.create(IntegerType.BOOLEAN, SRING_REG_OP_CRED); + return Pair.create(IntegerType.BOOLEAN, STRING_REG_OP_CRED); } @Override diff --git a/tikv-client/src/main/java/com/pingcap/tikv/expression/visitor/PrunedPartitionBuilder.java b/tikv-client/src/main/java/com/pingcap/tikv/expression/visitor/PrunedPartitionBuilder.java index 870afd8058..7ae6251410 100644 --- a/tikv-client/src/main/java/com/pingcap/tikv/expression/visitor/PrunedPartitionBuilder.java +++ b/tikv-client/src/main/java/com/pingcap/tikv/expression/visitor/PrunedPartitionBuilder.java @@ -35,7 +35,7 @@ /** * Apply partition pruning rule on filter condition. Partition pruning is based on a simple idea and * can be described as "Do not scan partitions where there can be no matching values". Currently - * only range partition pruning is supported(range column on mutiple columns is not supported at + * only range partition pruning is supported(range column on multiple columns is not supported at * TiDB side, so we can't optimize this yet). 
*/ public class PrunedPartitionBuilder extends RangeSetBuilder { diff --git a/tikv-client/src/main/java/com/pingcap/tikv/expression/visitor/SupportedExpressionValidator.java b/tikv-client/src/main/java/com/pingcap/tikv/expression/visitor/SupportedExpressionValidator.java index 1562f8bfd0..f1ebf10606 100644 --- a/tikv-client/src/main/java/com/pingcap/tikv/expression/visitor/SupportedExpressionValidator.java +++ b/tikv-client/src/main/java/com/pingcap/tikv/expression/visitor/SupportedExpressionValidator.java @@ -40,7 +40,7 @@ public static boolean isSupportedExpression(Expression node, ExpressionBlacklist @Override protected Boolean process(Expression node, ExpressionBlacklist blacklist) { - if (blacklist != null && blacklist.isUnsupportedPushdownExpr(getClass())) { + if (blacklist != null && blacklist.isUnsupportedPushDownExpr(getClass())) { return false; } for (Expression expr : node.getChildren()) { diff --git a/tikv-client/src/main/java/com/pingcap/tikv/meta/TiViewInfo.java b/tikv-client/src/main/java/com/pingcap/tikv/meta/TiViewInfo.java index d0aaad4079..5bbef7c89a 100644 --- a/tikv-client/src/main/java/com/pingcap/tikv/meta/TiViewInfo.java +++ b/tikv-client/src/main/java/com/pingcap/tikv/meta/TiViewInfo.java @@ -7,7 +7,7 @@ import java.util.stream.Collectors; public class TiViewInfo implements Serializable { - // ViewAlgorithm is VIEW's SQL AlGORITHM characteristic. + // ViewAlgorithm is VIEW's SQL ALGORITHM characteristic. // See https://dev.mysql.com/doc/refman/5.7/en/view-algorithms.html private final long viewAlgorithm; private final TiUserIdentity userIdentity; diff --git a/tikv-client/src/main/java/com/pingcap/tikv/predicates/TiKVScanAnalyzer.java b/tikv-client/src/main/java/com/pingcap/tikv/predicates/TiKVScanAnalyzer.java index 820b79ab30..cf5e8ba680 100644 --- a/tikv-client/src/main/java/com/pingcap/tikv/predicates/TiKVScanAnalyzer.java +++ b/tikv-client/src/main/java/com/pingcap/tikv/predicates/TiKVScanAnalyzer.java @@ -265,7 +265,7 @@ TiKVScanPlan buildIndexScan( TiKVScanPlan.Builder planBuilder = TiKVScanPlan.Builder.newBuilder(); ScanSpec result = extractConditions(conditions, table, index); - // this is calcuated for downgrade if there is no statstics info we can + // this is calculated for downgrade if there is no statistics info we can // retrieve from TiKV. 
double cost = SelectivityCalculator.calcPseudoSelectivity(result); planBuilder.setCost(cost); @@ -387,7 +387,7 @@ Map> buildTableScanKeyRange( @VisibleForTesting private Map> buildIndexScanKeyRangeWithIds( List ids, TiIndexInfo index, List indexRanges) { - Map> idRanes = new HashMap<>(); + Map> idRanges = new HashMap<>(); for (long id : ids) { List ranges = new ArrayList<>(indexRanges.size()); for (IndexRange ir : indexRanges) { @@ -396,9 +396,9 @@ private Map> buildIndexScanKeyRangeWithIds( ranges.add(indexScanKeyRangeBuilder.compute()); } - idRanes.put(id, ranges); + idRanges.put(id, ranges); } - return idRanes; + return idRanges; } @VisibleForTesting diff --git a/tikv-client/src/main/java/com/pingcap/tikv/region/RegionStoreClient.java b/tikv-client/src/main/java/com/pingcap/tikv/region/RegionStoreClient.java index bad2e7bcd1..272482fec8 100644 --- a/tikv-client/src/main/java/com/pingcap/tikv/region/RegionStoreClient.java +++ b/tikv-client/src/main/java/com/pingcap/tikv/region/RegionStoreClient.java @@ -633,7 +633,7 @@ public Iterator coprocessStreaming(DAGRequest req, List Date: Thu, 8 Aug 2019 16:20:57 +0800 Subject: [PATCH 27/62] fix index scan bug (#995) --- .../java/com/pingcap/tikv/meta/TiDAGRequest.java | 2 -- .../tikv/operation/iterator/CoprocessIterator.java | 13 ++----------- .../tikv/operation/iterator/DAGIteratorTest.java | 12 ++++++++---- 3 files changed, 10 insertions(+), 17 deletions(-) diff --git a/tikv-client/src/main/java/com/pingcap/tikv/meta/TiDAGRequest.java b/tikv-client/src/main/java/com/pingcap/tikv/meta/TiDAGRequest.java index cc60c9e7fc..31f0a777fd 100644 --- a/tikv-client/src/main/java/com/pingcap/tikv/meta/TiDAGRequest.java +++ b/tikv-client/src/main/java/com/pingcap/tikv/meta/TiDAGRequest.java @@ -358,9 +358,7 @@ private DAGRequest.Builder buildScan(boolean buildIndexScan) { if (pos != null) { TiColumnInfo columnInfo = columnInfoList.get(indexColOffsets.get(pos)); if (col.getColumnInfo().equals(columnInfo)) { - dagRequestBuilder.addOutputOffsets(pos); colOffsetInFieldMap.put(col, pos); - addRequiredIndexDataType(col.getType()); } // TODO: primary key may also be considered if pkIsHandle } diff --git a/tikv-client/src/main/java/com/pingcap/tikv/operation/iterator/CoprocessIterator.java b/tikv-client/src/main/java/com/pingcap/tikv/operation/iterator/CoprocessIterator.java index ed88068d31..20f0dd728c 100644 --- a/tikv-client/src/main/java/com/pingcap/tikv/operation/iterator/CoprocessIterator.java +++ b/tikv-client/src/main/java/com/pingcap/tikv/operation/iterator/CoprocessIterator.java @@ -30,7 +30,6 @@ import com.pingcap.tikv.util.RangeSplitter.RegionTask; import java.util.Iterator; import java.util.List; -import java.util.NoSuchElementException; public abstract class CoprocessIterator implements Iterator { protected final TiSession session; @@ -80,11 +79,7 @@ public static CoprocessIterator getRowIterator( dagRequest.getPushDownType()) { @Override public Row next() { - if (hasNext()) { - return rowReader.readRow(schemaInfer.getTypes().toArray(new DataType[0])); - } else { - throw new NoSuchElementException(); - } + return rowReader.readRow(schemaInfer.getTypes().toArray(new DataType[0])); } }; } @@ -109,11 +104,7 @@ public static CoprocessIterator getHandleIterator( req.getPushDownType()) { @Override public Long next() { - if (hasNext()) { - return rowReader.readRow(handleTypes).getLong(handleTypes.length - 1); - } else { - throw new NoSuchElementException(); - } + return rowReader.readRow(handleTypes).getLong(handleTypes.length - 1); } }; } diff --git 
a/tikv-client/src/test/java/com/pingcap/tikv/operation/iterator/DAGIteratorTest.java b/tikv-client/src/test/java/com/pingcap/tikv/operation/iterator/DAGIteratorTest.java index 673fe92b54..2e670ba75a 100644 --- a/tikv-client/src/test/java/com/pingcap/tikv/operation/iterator/DAGIteratorTest.java +++ b/tikv-client/src/test/java/com/pingcap/tikv/operation/iterator/DAGIteratorTest.java @@ -83,10 +83,14 @@ public void staleEpochTest() { server.put("key1", cdo.toByteString()); List tasks = ImmutableList.of(RegionTask.newInstance(region, store, keyRanges)); CoprocessIterator iter = CoprocessIterator.getRowIterator(req, tasks, session); - Row r = iter.next(); - SchemaInfer infer = SchemaInfer.create(req); - assertEquals(r.get(0, infer.getType(0)), 666L); - assertEquals(r.get(1, infer.getType(1)), "value1"); + if (!iter.hasNext()) { + assertEquals("iterator has next should be true", true, false); + } else { + Row r = iter.next(); + SchemaInfer infer = SchemaInfer.create(req); + assertEquals(r.get(0, infer.getType(0)), 666L); + assertEquals(r.get(1, infer.getType(1)), "value1"); + } } private static KeyRange createByteStringRange(ByteString sKey, ByteString eKey) { From db2e53a056663a400ed2dd68df2e349e791baff2 Mon Sep 17 00:00:00 2001 From: Zhexuan Yang Date: Thu, 8 Aug 2019 19:20:40 +0800 Subject: [PATCH 28/62] refine doc (#1003) --- docs/userguide.md | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/docs/userguide.md b/docs/userguide.md index 8d0b7bd097..43492fe3f8 100644 --- a/docs/userguide.md +++ b/docs/userguide.md @@ -295,8 +295,19 @@ Currently you could adjust these configs in your spark.conf file. | spark.tispark.statistics.auto_load | true | Whether to load statistics info automatically during database mapping. | ## Reading partition table from TiDB -Currently, only range partition table is limited supported. If partition expression having function expression -rather than `year` then partition pruning will not be applied. Such scan can be considered full table scan if there is no index in the schema. +TiSpark can read range and hash partition table from TiDB. + +TiSpark decides whether to apply partition pruning according to the type of partition and the partition expression associated with the table. + +Currently, TiSpark can partially apply partition pruning on range partition. + +The partition pruning can be applied when the partition expression of the range partition is one of the following: +* column expression +* year(expr) where expr is a column and its type is datetime or string literal +but can be parsed as datetime. + +If partition pruning cannot be applied, it is equivalent to doing a table scan over all partitions. + ## Common Port numbers used by Spark Cluster |Port Name| Default Value Port Number | Configuration Property | Notes| From 827d96ff1c81704990b52a42e25447d2aecafc31 Mon Sep 17 00:00:00 2001 From: Liangliang Gu Date: Sat, 10 Aug 2019 07:16:47 +0800 Subject: [PATCH 29/62] add tidb-3.0 compatibility document (#998) * add tidb-3.0 compatibility document * address code review * address code review --- README.md | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/README.md b/README.md index f4e7b485ef..115ebcb7f6 100755 --- a/README.md +++ b/README.md @@ -166,6 +166,19 @@ Time Zone can be set by using `-Duser.timezone` system property, e.g. `-Duser.ti ## Statistics information If you want to know how TiSpark could benefit from TiDB's statistic information, read more [here](./docs/userguide.md). 
+## Compatibility with tidb-3.0 +### View +TiDB starts to support `view` from `tidb-3.0`. + +TiSpark currently **does not support** `view`. Users will not be able to observe or access data through views with TiSpark. + +### Table Partition +`tidb-3.0` supports both `Range Partition` and `Hash Partition`. + +TiSpark currently **supports** `Range Partition` and `Hash Partition`. Users can select data from `Range Partition` table and `Hash Partition` table through TiSpark. + +In most cases TiSpark will use full table scan. Only in some cases TiSpark will apply partition pruning (read more [here](./docs/userguide.md). + ## How to test We use [docker-compose](https://docs.docker.com/compose/) to provide tidb cluster service which allows you to run test across different platforms. It is recommended to install docker in order to test locally, or you can set up your own TiDB cluster locally as you wish. From db2a2d69b638aeca7e3583fb27248d52b307f147 Mon Sep 17 00:00:00 2001 From: Liangliang Gu Date: Mon, 12 Aug 2019 14:18:35 +0800 Subject: [PATCH 30/62] add log4j config document (#1008) --- README.md | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/README.md b/README.md index 115ebcb7f6..a09896c122 100755 --- a/README.md +++ b/README.md @@ -160,6 +160,19 @@ Below configurations can be put together with spark-defaults.conf or passed in t | spark.tispark.db_prefix | "" | A string indicating the extra database prefix for all databases in TiDB to distinguish them from Hive databases with the same name | | spark.tispark.request.isolation.level | "SI" | Isolation level means whether do the resolve lock for the underlying tidb clusters. When you use the "RC", you will get the newest version of record smaller than your tso and ignore the locks. And if you use "SI", you will resolve the locks and get the records according whether resolved lock is committed or aborted | +## Log4j Configuration +When you start `spark-shell` or `spark-sql` and run `show databases`, you might see the following warnings: +``` +Failed to get database default, returning NoSuchObjectException +Failed to get database global_temp, returning NoSuchObjectException +``` + +This is due to spark trying to load two nonexistent databases (`default` and `global_temp`) in its catalog. In order to mute these warnings, please append the following text to `${SPARK_HOME}/conf/log4j.properties`. +``` +# tispark disable "WARN ObjectStore:568 - Failed to get database" +log4j.logger.org.apache.hadoop.hive.metastore.ObjectStore=ERROR +``` + ## Time Zone Time Zone can be set by using `-Duser.timezone` system property, e.g. `-Duser.timezone=GMT-7`, which will affect `Timestamp` type. Please do not use `spark.sql.session.timeZone`. 
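As a practical aside on the Time Zone note above: the `-Duser.timezone` system property has to reach the JVMs that actually run TiSpark. One common way (not the only one) is to pass it through Spark's standard JVM-option settings and then verify it from the shell. The snippet below is a sketch under that assumption, reusing the `GMT-7` value from the example above.

```scala
// Assumed launch command (both driver and executors need the property):
//   spark-shell --driver-java-options "-Duser.timezone=GMT-7" \
//               --conf spark.executor.extraJavaOptions=-Duser.timezone=GMT-7

// Confirm which time zone the driver JVM picked up.
println(java.util.TimeZone.getDefault.getID)

// A quick check of how a Timestamp value is rendered with this setting.
spark.sql("SELECT CAST('2019-08-12 00:00:00' AS TIMESTAMP) AS ts").show(false)
```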
From 930795129f241349d2203427f7b3952aeda2147f Mon Sep 17 00:00:00 2001 From: Liangliang Gu Date: Tue, 13 Aug 2019 12:25:48 +0800 Subject: [PATCH 31/62] refactor batch write region pre-split (#999) --- .ci/integration_test.groovy | 2 +- .../com/pingcap/tispark/TiBatchWrite.scala | 275 +++++++++++------- .../com/pingcap/tispark/TiConfigConst.scala | 1 + .../com/pingcap/tispark/TiDBOptions.scala | 2 - .../scala/com/pingcap/tispark/TiDBUtils.scala | 2 +- .../com/pingcap/tispark/TiDBWriter.scala | 4 +- .../com/pingcap/tispark/utils/TiUtil.scala | 4 + .../pingcap/tispark/TiBatchWriteSuite.scala | 22 +- .../datasource/BaseDataSourceTest.scala | 2 +- .../tispark/datasource/InsertSuite.scala | 2 +- .../tispark/datasource/RegionSplitSuite.scala | 4 +- .../com/pingcap/tikv/TiConfiguration.java | 10 + .../java/com/pingcap/tikv/TiDBJDBCClient.java | 28 +- 13 files changed, 233 insertions(+), 125 deletions(-) diff --git a/.ci/integration_test.groovy b/.ci/integration_test.groovy index 2a0e862570..c0f83f3da3 100644 --- a/.ci/integration_test.groovy +++ b/.ci/integration_test.groovy @@ -188,7 +188,7 @@ def call(ghprbActualCommit, ghprbCommentBody, ghprbPullId, ghprbPullTitle, ghprb sleep 60 """ - timeout(60) { + timeout(120) { run_test(chunk_suffix) } } catch (err) { diff --git a/core/src/main/scala/com/pingcap/tispark/TiBatchWrite.scala b/core/src/main/scala/com/pingcap/tispark/TiBatchWrite.scala index 7b0921dc37..c364571eeb 100644 --- a/core/src/main/scala/com/pingcap/tispark/TiBatchWrite.scala +++ b/core/src/main/scala/com/pingcap/tispark/TiBatchWrite.scala @@ -258,30 +258,47 @@ class TiBatchWrite(@transient val df: DataFrame, val distinctWrappedRowRdd = deduplicate(wrappedRowRdd) - splitTableRegion(distinctWrappedRowRdd) - splitIndexRegion(distinctWrappedRowRdd) - val deletion = generateDataToBeRemovedRdd(distinctWrappedRowRdd, startTimeStamp) if (!options.replace && !deletion.isEmpty()) { throw new TiBatchWriteException("data to be inserted has conflicts with TiKV data") } - val mergedRDD = generateKV(distinctWrappedRowRdd, remove = false) ++ generateKV( - deletion, - remove = true - ) - mergedRDD.groupByKey().map { - case (key, iterable) => - // if rdd contains same key, it means we need first delete the old value and insert the new value associated the - // key. We can merge the two operation into one update operation. - // Note: the deletion operation's value of kv pair is empty. - val valueOpt = iterable.find(value => value.nonEmpty) - if (valueOpt.isDefined) { - (key, valueOpt.get) - } else { - (key, new Array[Byte](0)) - } - } + val wrappedEncodedRdd = generateKV(distinctWrappedRowRdd, remove = false) + splitTableRegion(wrappedEncodedRdd.filter(r => !r.isIndex)) + splitIndexRegion(wrappedEncodedRdd.filter(r => r.isIndex)) + + val mergedRDD = wrappedEncodedRdd ++ generateKV(deletion, remove = true) + mergedRDD + .map(wrappedEncodedRow => (wrappedEncodedRow.encodedKey, wrappedEncodedRow)) + .groupByKey() + .map { + case (key, iterable) => + // if rdd contains same key, it means we need first delete the old value and insert the new value associated the + // key. We can merge the two operation into one update operation. + // Note: the deletion operation's value of kv pair is empty. 
+ iterable.find(value => value.encodedValue.nonEmpty) match { + case Some(wrappedEncodedRow) => + WrappedEncodedRow( + wrappedEncodedRow.row, + wrappedEncodedRow.handle, + wrappedEncodedRow.encodedKey, + wrappedEncodedRow.encodedValue, + isIndex = wrappedEncodedRow.isIndex, + wrappedEncodedRow.indexId, + remove = false + ) + case None => + WrappedEncodedRow( + iterable.head.row, + iterable.head.handle, + key, + new Array[Byte](0), + isIndex = iterable.head.isIndex, + iterable.head.indexId, + remove = true + ) + } + } } else { val start = getAutoTableIdStart(tiRowRdd.count) @@ -289,8 +306,11 @@ class TiBatchWrite(@transient val df: DataFrame, WrappedRow(row._1, row._2 + start) } - splitTableRegion(wrappedRowRdd) - generateKV(wrappedRowRdd, remove = false) + val wrappedEncodedRdd = generateKV(wrappedRowRdd, remove = false) + splitTableRegion(wrappedEncodedRdd.filter(r => !r.isIndex)) + splitIndexRegion(wrappedEncodedRdd.filter(r => r.isIndex)) + + wrappedEncodedRdd } // shuffle data in same task which belong to same region @@ -303,15 +323,16 @@ class TiBatchWrite(@transient val df: DataFrame, logger.warn("there is no data in source rdd") return } else { - takeOne(0) + val one = takeOne(0) + (one.encodedKey, one.encodedValue) } } logger.info(s"primary key: $primaryKey primary row: $primaryRow") // filter primary key - val finalWriteRDD = shuffledRDD.filter { - case (key, _) => !key.equals(primaryKey) + val finalWriteRDD = shuffledRDD.filter { wrappedEncodedRow => + !wrappedEncodedRow.encodedKey.equals(primaryKey) } val startTs = startTimeStamp.getVersion @@ -328,9 +349,11 @@ class TiBatchWrite(@transient val df: DataFrame, val ti2PCClientOnExecutor = new TwoPhaseCommitter(tiConf, startTs, options.lockTTLSeconds * 1000) - val pairs = iterator.map { - case (key, row) => - new TwoPhaseCommitter.BytePairWrapper(key.bytes, row) + val pairs = iterator.map { wrappedEncodedRow => + new TwoPhaseCommitter.BytePairWrapper( + wrappedEncodedRow.encodedKey.bytes, + wrappedEncodedRow.encodedValue + ) }.asJava ti2PCClientOnExecutor.prewriteSecondaryKeys(primaryKey.bytes, pairs) @@ -365,8 +388,8 @@ class TiBatchWrite(@transient val df: DataFrame, finalWriteRDD.foreachPartition { iterator => val ti2PCClientOnExecutor = new TwoPhaseCommitter(tiConf, startTs) - val keys = iterator.map { - case (key, _) => new TwoPhaseCommitter.ByteWrapper(key.bytes) + val keys = iterator.map { wrappedEncodedRow => + new TwoPhaseCommitter.ByteWrapper(wrappedEncodedRow.encodedKey.bytes) }.asJava try { @@ -405,7 +428,7 @@ class TiBatchWrite(@transient val df: DataFrame, } } else { if (tiContext.tiConf.isWriteWithoutLockTable) { - logger.warn("write without lock table is enabled! only for test!") + logger.warn("write without lock table enabled! only for test!") } else { throw new TiBatchWriteException("current tidb does not support LockTable or is disabled!") } @@ -423,14 +446,15 @@ class TiBatchWrite(@transient val df: DataFrame, } } else { if (tiContext.tiConf.isWriteWithoutLockTable) { - logger.warn("write without lock table is enabled! only for test!") + logger.warn("write without lock table enabled! 
only for test!") } else { throw new TiBatchWriteException("current tidb does not support LockTable or is disabled!") } } } - private def generateDataToBeRemovedRdd(rdd: RDD[WrappedRow], startTs: TiTimestamp) = { + private def generateDataToBeRemovedRdd(rdd: RDD[WrappedRow], + startTs: TiTimestamp): RDD[WrappedRow] = { rdd .mapPartitions { wrappedRows => val snapshot = TiSession.getInstance(tiConf).createSnapshot(startTs) @@ -526,7 +550,7 @@ class TiBatchWrite(@transient val df: DataFrame, } @throws(classOf[TiBatchWriteException]) - private def deduplicate(rdd: RDD[WrappedRow]) = { + private def deduplicate(rdd: RDD[WrappedRow]): RDD[WrappedRow] = { //1 handle key var mutableRdd = rdd if (handleCol != null) { @@ -553,31 +577,37 @@ class TiBatchWrite(@transient val df: DataFrame, } @throws(classOf[NoSuchTableException]) - private def shuffleKeyToSameRegion( - rdd: RDD[(SerializableKey, Array[Byte])] - ): RDD[(SerializableKey, Array[Byte])] = { - val tableId = tiTableInfo.getId - + private def shuffleKeyToSameRegion(rdd: RDD[WrappedEncodedRow]): RDD[WrappedEncodedRow] = { val regions = getRegions val tiRegionPartitioner = new TiRegionPartitioner(regions, options.writeConcurrency) - logger.info( - s"find ${regions.size} regions in $tiTableRef tableId: $tableId" - ) - rdd - .map(obj => (obj._1, obj._2)) + .map(obj => (obj.encodedKey, obj)) .groupByKey(tiRegionPartitioner) .map { - case (key, iterable) => + case (_, iterable) => // remove duplicate rows if key equals (should not happen, cause already deduplicated) - (key, iterable.head) + iterable.head } } private def getRegions: List[TiRegion] = { import scala.collection.JavaConversions._ - TiBatchWriteUtils.getRegionsByTable(tiSession, tiTableInfo).toList + + val tableRegion = TiBatchWriteUtils.getRegionsByTable(tiSession, tiTableInfo).toList + logger.info( + s"find ${tableRegion.size} regions in $tiTableRef tableId: ${tiTableInfo.getId}" + ) + + val indexRegion = tiTableInfo.getIndices.toList.flatMap { index => + val regions = TiBatchWriteUtils.getRegionByIndex(tiSession, tiTableInfo, index) + logger.info( + s"find ${regions.size} regions in $tiTableRef tableId: ${tiTableInfo.getId} index: ${index.getName}" + ) + regions + } + + tableRegion ++ indexRegion } private def extractHandleId(row: TiRow): Long = @@ -639,7 +669,7 @@ class TiBatchWrite(@transient val df: DataFrame, private def generateUniqueIndexKey(row: TiRow, handle: Long, index: TiIndexInfo, - remove: Boolean) = { + remove: Boolean): (SerializableKey, Array[Byte]) = { val indexKey = buildUniqueIndexKey(row, index) val value = if (remove) { new Array[Byte](0) @@ -655,7 +685,7 @@ class TiBatchWrite(@transient val df: DataFrame, private def generateSecondaryIndexKey(row: TiRow, handle: Long, index: TiIndexInfo, - remove: Boolean) = { + remove: Boolean): (SerializableKey, Array[Byte]) = { val keys = IndexKey.encodeIndexDataValues(row, index.getIndexColumns, tiTableInfo) val cdo = new CodecDataOutput() cdo.write( @@ -672,11 +702,11 @@ class TiBatchWrite(@transient val df: DataFrame, (new SerializableKey(cdo.toBytes), value) } - private def buildRowKey(row: TiRow, handle: Long) = { + private def buildRowKey(row: TiRow, handle: Long): SerializableKey = { new SerializableKey(RowKey.toRowKey(locatePhysicalTable(row), handle).getBytes) } - private def buildUniqueIndexKey(row: TiRow, index: TiIndexInfo) = { + private def buildUniqueIndexKey(row: TiRow, index: TiIndexInfo): SerializableKey = { val keys = IndexKey.encodeIndexDataValues(row, index.getIndexColumns, tiTableInfo) val indexKey = @@ 
-684,7 +714,9 @@ class TiBatchWrite(@transient val df: DataFrame, new SerializableKey(indexKey.getBytes) } - private def generateRowKey(row: TiRow, handle: Long, remove: Boolean) = { + private def generateRowKey(row: TiRow, + handle: Long, + remove: Boolean): (SerializableKey, Array[Byte]) = { if (remove) { ( buildRowKey(row, handle), @@ -698,17 +730,46 @@ class TiBatchWrite(@transient val df: DataFrame, } } - private def generateKV(rdd: RDD[WrappedRow], remove: Boolean) = { + private def generateKV(rdd: RDD[WrappedRow], remove: Boolean): RDD[WrappedEncodedRow] = { rdd .map { row => { - val kvBuf = mutable.ListBuffer.empty[(SerializableKey, Array[Byte])] - kvBuf += generateRowKey(row.row, row.handle, remove) + val kvBuf = mutable.ListBuffer.empty[WrappedEncodedRow] + val (encodedKey, encodedValue) = generateRowKey(row.row, row.handle, remove) + kvBuf += WrappedEncodedRow( + row.row, + row.handle, + encodedKey, + encodedValue, + isIndex = false, + -1, + remove + ) tiTableInfo.getIndices.asScala.foreach { index => if (index.isUnique) { - kvBuf += generateUniqueIndexKey(row.row, row.handle, index, remove) + val (encodedKey, encodedValue) = + generateUniqueIndexKey(row.row, row.handle, index, remove) + kvBuf += WrappedEncodedRow( + row.row, + row.handle, + encodedKey, + encodedValue, + isIndex = true, + index.getId, + remove + ) } else { - kvBuf += generateSecondaryIndexKey(row.row, row.handle, index, remove) + val (encodedKey, encodedValue) = + generateSecondaryIndexKey(row.row, row.handle, index, remove) + kvBuf += WrappedEncodedRow( + row.row, + row.handle, + encodedKey, + encodedValue, + isIndex = true, + index.getId, + remove + ) } } kvBuf @@ -723,60 +784,52 @@ class TiBatchWrite(@transient val df: DataFrame, tiTableInfo.getId } - private def estimateRegionSplitNumForIndex(wrappedRowRdd: RDD[WrappedRow], - tiIndexInfo: TiIndexInfo) = { - //TODO refine this https://github.com/pingcap/tispark/issues/891 - val rowSize = tiIndexInfo.getIndexColumnSize + private def estimateRegionSplitNum(wrappedEncodedRdd: RDD[WrappedEncodedRow]): Long = { + val totalSize = + wrappedEncodedRdd.map(r => r.encodedKey.bytes.length + r.encodedValue.length).sum() + //TODO: replace 96 with actual value read from pd https://github.com/pingcap/tispark/issues/890 - (wrappedRowRdd.count() * rowSize) / (96 * 1024 * 1024) + Math.ceil(totalSize / (tiContext.tiConf.getTikvRegionSplitSizeInMB * 1024 * 1024)).toLong } - private def estimateRegionSplitNum(wrappedRowRdd: RDD[WrappedRow]) = { - //TODO refine this https://github.com/pingcap/tispark/issues/891 - val rowSize = tiTableInfo.getEstimatedRowSizeInByte - //TODO: replace 96 with actual value read from pd https://github.com/pingcap/tispark/issues/890 - (wrappedRowRdd.count() * rowSize) / (96 * 1024 * 1024) + private def checkTidbRegionSplitContidion(minHandle: Long, + maxHandle: Long, + regionSplitNum: Long): Boolean = { + maxHandle - minHandle > regionSplitNum * 1000 } - private def splitIndexRegion(wrappedRowRdd: RDD[WrappedRow]) = { + private def splitIndexRegion(wrappedEncodedRdd: RDD[WrappedEncodedRow]): Unit = { if (options.enableRegionSplit && isEnableSplitRegion) { val indices = tiTableInfo.getIndices.asScala - val regionSplitNums = indices.map { index => - if (options.regionSplitNum != 0) { + + indices.foreach { index => + val rdd = wrappedEncodedRdd.filter(_.indexId == index.getId) + + val regionSplitNum = if (options.regionSplitNum != 0) { options.regionSplitNum } else { - estimateRegionSplitNumForIndex(wrappedRowRdd, index) + estimateRegionSplitNum(rdd) } - } 
- val sampledDataRDDs = - regionSplitNums.map { num => - wrappedRowRdd.takeSample(withReplacement = false, num = num.toInt) - }.toList - - indices.zipWithIndex.foreach { indexWithIdx => - val index = indexWithIdx._1 - val idx = indexWithIdx._2 - val indexCols = index.getIndexColumns - - val splitIndicesList = sampledDataRDDs(idx) - .map { value => - val colBuffer = mutable.ListBuffer.empty[String] - for (i <- 0 until indexCols.size()) { - val col = indexCols.get(i) - colBuffer += value.row.get(col.getOffset, null).toString - } - colBuffer.toList.asJava - } - .toList - .asJava - tiDBJDBCClient - .splitIndexRegion( - options.database, - options.table, - index.getName, - splitIndicesList - ) + // region split + if (regionSplitNum > 1) { + val minHandle = rdd.min().handle + val maxHandle = rdd.max().handle + if (checkTidbRegionSplitContidion(minHandle, maxHandle, regionSplitNum) || options.regionSplitNum != 0) { + logger.info("region split num=" + regionSplitNum + " index name=" + index.getName) + tiDBJDBCClient + .splitIndexRegion( + options.database, + options.table, + index.getName, + minHandle, + maxHandle, + regionSplitNum + ) + } else { + logger.warn("region split is skipped") + } + } } } } @@ -784,7 +837,7 @@ class TiBatchWrite(@transient val df: DataFrame, // when data to be inserted is too small to do region split, we check is user set region split num. // If so, we do region split as user's intention. This is also useful for writing test case. // We assume the data to be inserted is ruled by normal distribution. - private def splitTableRegion(wrappedRowRdd: RDD[WrappedRow]) = { + private def splitTableRegion(wrappedRowRdd: RDD[WrappedEncodedRow]): Unit = { if (options.enableRegionSplit && isEnableSplitRegion) { if (options.regionSplitNum != 0) { tiDBJDBCClient @@ -796,13 +849,16 @@ class TiBatchWrite(@transient val df: DataFrame, options.regionSplitNum ) } else { - val regionSplitNum = estimateRegionSplitNum(wrappedRowRdd) + val regionSplitNum = if (options.regionSplitNum != 0) { + options.regionSplitNum + } else { + estimateRegionSplitNum(wrappedRowRdd) + } // region split if (regionSplitNum > 1) { val minHandle = wrappedRowRdd.min().handle val maxHandle = wrappedRowRdd.max().handle - val isValidSplit = maxHandle - minHandle > regionSplitNum * 1000 - if (isValidSplit) { + if (checkTidbRegionSplitContidion(minHandle, maxHandle, regionSplitNum)) { logger.info("region split is enabled.") logger.info("region split num is " + regionSplitNum) tiDBJDBCClient @@ -840,8 +896,17 @@ class TiRegionPartitioner(regions: List[TiRegion], writeConcurrency: Int) extend } } -case class WrappedRow(row: TiRow, handle: Long) extends Ordered[WrappedRow] { - override def compare(that: WrappedRow): Int = this.handle.toInt - that.handle.toInt +case class WrappedRow(row: TiRow, handle: Long) + +case class WrappedEncodedRow(row: TiRow, + handle: Long, + encodedKey: SerializableKey, + encodedValue: Array[Byte], + isIndex: Boolean, + indexId: Long, + remove: Boolean) + extends Ordered[WrappedEncodedRow] { + override def compare(that: WrappedEncodedRow): Int = this.handle.toInt - that.handle.toInt } class SerializableKey(val bytes: Array[Byte]) extends Serializable { diff --git a/core/src/main/scala/com/pingcap/tispark/TiConfigConst.scala b/core/src/main/scala/com/pingcap/tispark/TiConfigConst.scala index 810618dadb..f3cb782b46 100644 --- a/core/src/main/scala/com/pingcap/tispark/TiConfigConst.scala +++ b/core/src/main/scala/com/pingcap/tispark/TiConfigConst.scala @@ -39,6 +39,7 @@ object TiConfigConst { val 
WRITE_ALLOW_SPARK_SQL: String = "spark.tispark.write.allow_spark_sql" val WRITE_ENABLE: String = "spark.tispark.write.enable" val WRITE_WITHOUT_LOCK_TABLE: String = "spark.tispark.write.without_lock_table" + val TIKV_REGION_SPLIT_SIZE_IN_MB: String = "spark.tispark.tikv.region_split_size_in_mb" val SNAPSHOT_ISOLATION_LEVEL: String = "SI" val READ_COMMITTED_ISOLATION_LEVEL: String = "RC" diff --git a/core/src/main/scala/com/pingcap/tispark/TiDBOptions.scala b/core/src/main/scala/com/pingcap/tispark/TiDBOptions.scala index d0f5045ab9..f48ce36860 100644 --- a/core/src/main/scala/com/pingcap/tispark/TiDBOptions.scala +++ b/core/src/main/scala/com/pingcap/tispark/TiDBOptions.scala @@ -80,8 +80,6 @@ class TiDBOptions(@transient val parameters: CaseInsensitiveMap[String]) extends val url: String = s"jdbc:mysql://address=(protocol=tcp)(host=$address)(port=$port)/?user=$user&password=$password&useSSL=false&rewriteBatchedStatements=true" - val dbtable: String = s"$database.$table" - val tiTableRef: TiTableReference = { val dbPrefix = parameters.getOrElse(TiConfigConst.DB_PREFIX, "") TiTableReference(dbPrefix + database, table) diff --git a/core/src/main/scala/com/pingcap/tispark/TiDBUtils.scala b/core/src/main/scala/com/pingcap/tispark/TiDBUtils.scala index b33e531e2a..0e49ea6cfc 100644 --- a/core/src/main/scala/com/pingcap/tispark/TiDBUtils.scala +++ b/core/src/main/scala/com/pingcap/tispark/TiDBUtils.scala @@ -15,7 +15,7 @@ object TiDBUtils { * Returns true if the table already exists in the TiDB. */ def tableExists(conn: Connection, options: TiDBOptions): Boolean = { - val sql = s"SELECT * FROM ${options.dbtable} WHERE 1=0" + val sql = s"SELECT * FROM `${options.database}`.`${options.table}` WHERE 1=0" Try { val statement = conn.prepareStatement(sql) try { diff --git a/core/src/main/scala/com/pingcap/tispark/TiDBWriter.scala b/core/src/main/scala/com/pingcap/tispark/TiDBWriter.scala index 2b4cab6beb..a2c2dbf89e 100644 --- a/core/src/main/scala/com/pingcap/tispark/TiDBWriter.scala +++ b/core/src/main/scala/com/pingcap/tispark/TiDBWriter.scala @@ -25,7 +25,9 @@ object TiDBWriter { ) } } else { - throw new TiBatchWriteException(s"table ${options.dbtable} does not exists!") + throw new TiBatchWriteException( + s"table `${options.database}`.`${options.table}` does not exists!" 
+ ) // TiDBUtils.createTable(conn, df, options, tiContext) // TiDBUtils.saveTable(tiContext, df, Some(df.schema), options) } diff --git a/core/src/main/scala/com/pingcap/tispark/utils/TiUtil.scala b/core/src/main/scala/com/pingcap/tispark/utils/TiUtil.scala index 732ed77c33..7a9335bc77 100644 --- a/core/src/main/scala/com/pingcap/tispark/utils/TiUtil.scala +++ b/core/src/main/scala/com/pingcap/tispark/utils/TiUtil.scala @@ -188,6 +188,10 @@ object TiUtil { tiConf.setWriteAllowSparkSQL(conf.get(TiConfigConst.WRITE_ALLOW_SPARK_SQL).toBoolean) } + if (conf.contains(TiConfigConst.TIKV_REGION_SPLIT_SIZE_IN_MB)) { + tiConf.setTikvRegionSplitSizeInMB(conf.get(TiConfigConst.TIKV_REGION_SPLIT_SIZE_IN_MB).toInt) + } + tiConf } diff --git a/core/src/test/scala/com/pingcap/tispark/TiBatchWriteSuite.scala b/core/src/test/scala/com/pingcap/tispark/TiBatchWriteSuite.scala index 9036c07e32..646b32cc29 100644 --- a/core/src/test/scala/com/pingcap/tispark/TiBatchWriteSuite.scala +++ b/core/src/test/scala/com/pingcap/tispark/TiBatchWriteSuite.scala @@ -33,24 +33,25 @@ class TiBatchWriteSuite extends BaseTiSparkTest { "SUPPLIER" :: Nil - private val batchWriteTablePrefix = "BATCH_WRITE" + private val batchWriteTablePrefix = "BATCH.WRITE" override def beforeAll(): Unit = { super.beforeAll() database = tpchDBName setCurrentDatabase(database) for (table <- tables) { - tidbStmt.execute(s"drop table if exists ${batchWriteTablePrefix}_$table") - tidbStmt.execute(s"create table if not exists ${batchWriteTablePrefix}_$table like $table ") + val tableToWrite = s"${batchWriteTablePrefix}_$table" + tidbStmt.execute(s"drop table if exists `$tableToWrite`") + tidbStmt.execute(s"create table if not exists `$tableToWrite` like $table ") } } test("ti batch write") { for (table <- tables) { - println(table) + val tableToWrite = s"${batchWriteTablePrefix}_$table" // select - refreshConnections(TestTables(database, s"${batchWriteTablePrefix}_$table")) + refreshConnections(TestTables(database, tableToWrite)) val df = sql(s"select * from $table") // batch write @@ -58,21 +59,21 @@ class TiBatchWriteSuite extends BaseTiSparkTest { df, ti, new TiDBOptions( - tidbOptions + ("database" -> s"$database", "table" -> s"${batchWriteTablePrefix}_$table") + tidbOptions + ("database" -> s"$database", "table" -> tableToWrite) ) ) // refresh - refreshConnections(TestTables(database, s"${batchWriteTablePrefix}_$table")) + refreshConnections(TestTables(database, tableToWrite)) setCurrentDatabase(database) // select - queryTiDBViaJDBC(s"select * from ${batchWriteTablePrefix}_$table") + queryTiDBViaJDBC(s"select * from `$tableToWrite`") // assert val originCount = queryViaTiSpark(s"select count(*) from $table").head.head.asInstanceOf[Long] // cannot use count since batch write is not support index writing yet. 
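Editor's note: the quoting changes running through the surrounding hunks (TiDBUtils, TiDBWriter, the data source tests, and the dot-containing `BATCH.WRITE`-prefixed tables in this suite) all target the same failure mode: once a database or table name contains a dot, interpolating `$database.$table` into SQL becomes ambiguous. A minimal sketch of the convention the patch standardizes on follows; the helper name is hypothetical, the patch simply inlines the backticks at each call site.

```
// Editorial sketch (not part of the patch): backtick-quote each identifier
// separately so a dot inside a database or table name is not parsed as a
// database/table separator.
def quotedTable(database: String, table: String): String =
  s"`$database`.`$table`"

// quotedTable("tpch_test", "BATCH.WRITE_LINEITEM")
//   returns: `tpch_test`.`BATCH.WRITE_LINEITEM`
```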
- val count = queryViaTiSpark(s"select * from ${batchWriteTablePrefix}_$table").length + val count = queryViaTiSpark(s"select * from `$tableToWrite`").length .asInstanceOf[Long] assert(count == originCount) } @@ -98,7 +99,8 @@ class TiBatchWriteSuite extends BaseTiSparkTest { try { setCurrentDatabase(database) for (table <- tables) { - tidbStmt.execute(s"drop table if exists ${batchWriteTablePrefix}_$table") + val tableToWrite = s"${batchWriteTablePrefix}_$table" + tidbStmt.execute(s"drop table if exists `$tableToWrite`") } } finally { super.afterAll() diff --git a/core/src/test/scala/com/pingcap/tispark/datasource/BaseDataSourceTest.scala b/core/src/test/scala/com/pingcap/tispark/datasource/BaseDataSourceTest.scala index d7939dc638..e8cddca847 100644 --- a/core/src/test/scala/com/pingcap/tispark/datasource/BaseDataSourceTest.scala +++ b/core/src/test/scala/com/pingcap/tispark/datasource/BaseDataSourceTest.scala @@ -21,7 +21,7 @@ class BaseDataSourceTest(val table: String, val database: String = "tispark_test", val _enableTidbConfigPropertiesInjectedToSpark: Boolean = true) extends BaseTiSparkTest { - protected def dbtable = s"$database.$table" + protected def dbtable = s"`$database`.`$table`" override def beforeAll(): Unit = { enableTidbConfigPropertiesInjectedToSpark = _enableTidbConfigPropertiesInjectedToSpark diff --git a/core/src/test/scala/com/pingcap/tispark/datasource/InsertSuite.scala b/core/src/test/scala/com/pingcap/tispark/datasource/InsertSuite.scala index 403021223d..0fddee5bf7 100644 --- a/core/src/test/scala/com/pingcap/tispark/datasource/InsertSuite.scala +++ b/core/src/test/scala/com/pingcap/tispark/datasource/InsertSuite.scala @@ -6,7 +6,7 @@ import org.apache.spark.sql.types.{IntegerType, StringType, StructField, StructT import scala.collection.mutable.ArrayBuffer -class InsertSuite extends BaseDataSourceTest("test_datasource_insert") { +class InsertSuite extends BaseDataSourceTest("test.datasource_insert") { private val row1 = Row(null, "Hello") private val row5 = Row(5, "Duplicate") diff --git a/core/src/test/scala/com/pingcap/tispark/datasource/RegionSplitSuite.scala b/core/src/test/scala/com/pingcap/tispark/datasource/RegionSplitSuite.scala index 732aa52382..d99bbf5292 100644 --- a/core/src/test/scala/com/pingcap/tispark/datasource/RegionSplitSuite.scala +++ b/core/src/test/scala/com/pingcap/tispark/datasource/RegionSplitSuite.scala @@ -34,7 +34,7 @@ class RegionSplitSuite extends BaseDataSourceTest("region_split_test") { val regionsNum = TiBatchWriteUtils .getRegionByIndex(ti.tiSession, tiTableInfo, tiTableInfo.getIndices.get(0)) .size() - assert(regionsNum == 4) + assert(regionsNum == 3) } test("table region split test") { @@ -50,7 +50,7 @@ class RegionSplitSuite extends BaseDataSourceTest("region_split_test") { val options = Some(Map("enableRegionSplit" -> "true", "regionSplitNum" -> "3")) - tidbWrite(List(row1), schema, options) + tidbWrite(List(row1, row2, row3), schema, options) val tiTableInfo = ti.tiSession.getCatalog.getTable(dbPrefix + database, table) diff --git a/tikv-client/src/main/java/com/pingcap/tikv/TiConfiguration.java b/tikv-client/src/main/java/com/pingcap/tikv/TiConfiguration.java index db1e6f2c46..6f128c10fc 100644 --- a/tikv-client/src/main/java/com/pingcap/tikv/TiConfiguration.java +++ b/tikv-client/src/main/java/com/pingcap/tikv/TiConfiguration.java @@ -48,6 +48,7 @@ public class TiConfiguration implements Serializable { private static final boolean DEF_WRITE_ENABLE = true; private static final boolean DEF_WRITE_ALLOW_SPARK_SQL = false; 
private static final boolean DEF_WRITE_WITHOUT_LOCK_TABLE = false; + private static final int DEF_TIKV_REGION_SPLIT_SIZE_IN_MB = 96; private int timeout = DEF_TIMEOUT; private TimeUnit timeoutUnit = DEF_TIMEOUT_UNIT; @@ -69,6 +70,7 @@ public class TiConfiguration implements Serializable { private boolean writeAllowSparkSQL = DEF_WRITE_ALLOW_SPARK_SQL; private boolean writeEnable = DEF_WRITE_ENABLE; private boolean writeWithoutLockTable = DEF_WRITE_WITHOUT_LOCK_TABLE; + private int tikvRegionSplitSizeInMB = DEF_TIKV_REGION_SPLIT_SIZE_IN_MB; public static TiConfiguration createDefault(String pdAddrsStr) { Objects.requireNonNull(pdAddrsStr, "pdAddrsStr is null"); @@ -262,4 +264,12 @@ public boolean isWriteAllowSparkSQL() { public void setWriteAllowSparkSQL(boolean writeAllowSparkSQL) { this.writeAllowSparkSQL = writeAllowSparkSQL; } + + public void setTikvRegionSplitSizeInMB(int tikvRegionSplitSizeInMB) { + this.tikvRegionSplitSizeInMB = tikvRegionSplitSizeInMB; + } + + public int getTikvRegionSplitSizeInMB() { + return tikvRegionSplitSizeInMB; + } } diff --git a/tikv-client/src/main/java/com/pingcap/tikv/TiDBJDBCClient.java b/tikv-client/src/main/java/com/pingcap/tikv/TiDBJDBCClient.java index 339140f1d7..70bf4db8a8 100644 --- a/tikv-client/src/main/java/com/pingcap/tikv/TiDBJDBCClient.java +++ b/tikv-client/src/main/java/com/pingcap/tikv/TiDBJDBCClient.java @@ -107,14 +107,40 @@ public void splitTableRegion( try (Statement tidbStmt = connection.createStatement()) { String sql = String.format( - "split table %s.%s between (%d) and (%d) regions %d", + "split table `%s`.`%s` between (%d) and (%d) regions %d", dbName, tblName, minVal, maxVal, regionNum); + logger.info("split table region: " + sql); tidbStmt.execute(sql); } catch (Exception ignored) { logger.warn("failed to split table region"); } } + /** + * split index region by calling tidb jdbc command `SPLIT TABLE`, e.g. SPLIT TABLE t INDEX idx + * BETWEEN (-9223372036854775808) AND (9223372036854775807) REGIONS 16; + * + * @param dbName database name in tidb + * @param tblName table name in tidb + * @param idxName index name in table + * @param minVal min value + * @param maxVal max value + * @param regionNum number of regions to split + */ + public void splitIndexRegion( + String dbName, String tblName, String idxName, long minVal, long maxVal, long regionNum) { + try (Statement tidbStmt = connection.createStatement()) { + String sql = + String.format( + "split table `%s`.`%s` index %s between (%d) and (%d) regions %d", + dbName, tblName, idxName, minVal, maxVal, regionNum); + logger.info("split index region: " + sql); + tidbStmt.execute(sql); + } catch (Exception ignored) { + logger.warn("failed to split index region"); + } + } + /** * split index region by calling tidb jdbc command `SPLIT TABLE`, e.g. 
SPLIT TABLE t1 INDEX idx4 * by ("a", "2000-01-01 00:00:01"), ("b", "2019-04-17 14:26:19"), ("c", ""); if you have a table From 73d03693ad8367bd1ee41f41d6532d1b17370c57 Mon Sep 17 00:00:00 2001 From: Liangliang Gu Date: Tue, 13 Aug 2019 13:20:07 +0800 Subject: [PATCH 32/62] add ci simple mode (#1012) --- .ci/integration_test.groovy | 20 +++++++++++++++++++- 1 file changed, 19 insertions(+), 1 deletion(-) diff --git a/.ci/integration_test.groovy b/.ci/integration_test.groovy index c0f83f3da3..81a7278880 100644 --- a/.ci/integration_test.groovy +++ b/.ci/integration_test.groovy @@ -7,6 +7,7 @@ def call(ghprbActualCommit, ghprbCommentBody, ghprbPullId, ghprbPullTitle, ghprb def TIKV_BRANCH = "master" def PD_BRANCH = "master" def MVN_PROFILE = "-Pjenkins" + def TEST_MODE = "simple" def PARALLEL_NUMBER = 18 // parse tidb branch @@ -16,6 +17,7 @@ def call(ghprbActualCommit, ghprbCommentBody, ghprbPullId, ghprbPullTitle, ghprb } m1 = null println "TIDB_BRANCH=${TIDB_BRANCH}" + // parse pd branch def m2 = ghprbCommentBody =~ /pd\s*=\s*([^\s\\]+)(\s|\\|$)/ if (m2) { @@ -23,6 +25,7 @@ def call(ghprbActualCommit, ghprbCommentBody, ghprbPullId, ghprbPullTitle, ghprb } m2 = null println "PD_BRANCH=${PD_BRANCH}" + // parse tikv branch def m3 = ghprbCommentBody =~ /tikv\s*=\s*([^\s\\]+)(\s|\\|$)/ if (m3) { @@ -30,11 +33,18 @@ def call(ghprbActualCommit, ghprbCommentBody, ghprbPullId, ghprbPullTitle, ghprb } m3 = null println "TIKV_BRANCH=${TIKV_BRANCH}" + // parse mvn profile def m4 = ghprbCommentBody =~ /profile\s*=\s*([^\s\\]+)(\s|\\|$)/ if (m4) { MVN_PROFILE = MVN_PROFILE + " -P${m4[0][1]}" } + + // parse test mode + def m5 = ghprbCommentBody =~ /mode\s*=\s*([^\s\\]+)(\s|\\|$)/ + if (m5) { + TEST_MODE = "${m5[0][1]}" + } def readfile = { filename -> def file = readFile filename @@ -93,7 +103,15 @@ def call(ghprbActualCommit, ghprbCommentBody, ghprbPullId, ghprbPullTitle, ghprb find core/src -name '*Suite*' | grep -v 'MultiColumnPKDataTypeSuite' > test shuf test -o test2 mv test2 test - find core/src -name '*MultiColumnPKDataTypeSuite*' >> test + """ + + if(TEST_MODE != "simple") { + sh """ + find core/src -name '*MultiColumnPKDataTypeSuite*' >> test + """ + } + + sh """ sed -i 's/core\\/src\\/test\\/scala\\///g' test sed -i 's/\\//\\./g' test sed -i 's/\\.scala//g' test From b2fcfd5093b05140f90da5837c77d93c4f1eb864 Mon Sep 17 00:00:00 2001 From: birdstorm Date: Tue, 13 Aug 2019 15:23:09 +0800 Subject: [PATCH 33/62] clean up redundant code (#997) --- .ci/build.groovy | 1 - .ci/integration_test.groovy | 11 +- .../com/pingcap/tispark/TiDBRelation.scala | 10 +- .../tispark/statistics/StatisticsHelper.scala | 32 +++--- .../spark/sql/execution/CoprocessorRDD.scala | 102 ++++++------------ .../spark/sql/tispark/TiHandleRDD.scala | 38 ++----- .../org/apache/spark/sql/tispark/TiRDD.scala | 55 ++-------- .../apache/spark/sql/tispark/TiRowRDD.scala | 86 +++++++++++++++ .../com/pingcap/tikv/statistics/Bucket.java | 1 - .../tikv/types/AbstractDateTimeType.java | 15 +++ .../com/pingcap/tikv/types/DateTimeType.java | 15 --- .../com/pingcap/tikv/types/TimestampType.java | 9 +- 12 files changed, 169 insertions(+), 206 deletions(-) create mode 100644 core/src/main/scala/org/apache/spark/sql/tispark/TiRowRDD.scala diff --git a/.ci/build.groovy b/.ci/build.groovy index cbcfd228be..cad2e4409c 100644 --- a/.ci/build.groovy +++ b/.ci/build.groovy @@ -7,7 +7,6 @@ def call(ghprbActualCommit, ghprbPullId, ghprbPullTitle, ghprbPullLink, ghprbPul catchError { node ('build') { - def ws = pwd() deleteDir() container("java") { 
stage('Checkout') { diff --git a/.ci/integration_test.groovy b/.ci/integration_test.groovy index 81a7278880..9177f7f02e 100644 --- a/.ci/integration_test.groovy +++ b/.ci/integration_test.groovy @@ -15,7 +15,6 @@ def call(ghprbActualCommit, ghprbCommentBody, ghprbPullId, ghprbPullTitle, ghprb if (m1) { TIDB_BRANCH = "${m1[0][1]}" } - m1 = null println "TIDB_BRANCH=${TIDB_BRANCH}" // parse pd branch @@ -23,7 +22,6 @@ def call(ghprbActualCommit, ghprbCommentBody, ghprbPullId, ghprbPullTitle, ghprb if (m2) { PD_BRANCH = "${m2[0][1]}" } - m2 = null println "PD_BRANCH=${PD_BRANCH}" // parse tikv branch @@ -31,7 +29,6 @@ def call(ghprbActualCommit, ghprbCommentBody, ghprbPullId, ghprbPullTitle, ghprb if (m3) { TIKV_BRANCH = "${m3[0][1]}" } - m3 = null println "TIKV_BRANCH=${TIKV_BRANCH}" // parse mvn profile @@ -51,10 +48,6 @@ def call(ghprbActualCommit, ghprbCommentBody, ghprbPullId, ghprbPullTitle, ghprb return file.split("\n") as List } - def remove_last_str = { str -> - return str.substring(0, str.length() - 1) - } - def get_mvn_str = { total_chunks -> def mvnStr = " -DwildcardSuites=" for (int i = 0 ; i < total_chunks.size() - 1; i++) { @@ -75,8 +68,7 @@ def call(ghprbActualCommit, ghprbCommentBody, ghprbPullId, ghprbPullTitle, ghprb println "${NODE_NAME}" container("golang") { deleteDir() - def ws = pwd() - + // tidb def tidb_sha1 = sh(returnStdout: true, script: "curl ${FILE_SERVER_URL}/download/refs/pingcap/tidb/${TIDB_BRANCH}/sha1").trim() sh "curl ${FILE_SERVER_URL}/download/builds/pingcap/tidb/${tidb_sha1}/centos7/tidb-server.tar.gz | tar xz" @@ -184,7 +176,6 @@ def call(ghprbActualCommit, ghprbCommentBody, ghprbPullId, ghprbPullTitle, ghprb node("test_java") { println "${NODE_NAME}" container("java") { - def ws = pwd() deleteDir() unstash 'binaries' unstash 'tispark' diff --git a/core/src/main/scala/com/pingcap/tispark/TiDBRelation.scala b/core/src/main/scala/com/pingcap/tispark/TiDBRelation.scala index e4fb457c09..4bacc40d33 100644 --- a/core/src/main/scala/com/pingcap/tispark/TiDBRelation.scala +++ b/core/src/main/scala/com/pingcap/tispark/TiDBRelation.scala @@ -23,7 +23,7 @@ import org.apache.spark.sql.catalyst.expressions.aggregate._ import org.apache.spark.sql.catalyst.expressions.{Attribute, NamedExpression} import org.apache.spark.sql.execution._ import org.apache.spark.sql.sources.{BaseRelation, InsertableRelation} -import org.apache.spark.sql.tispark.{TiHandleRDD, TiRDD} +import org.apache.spark.sql.tispark.{TiHandleRDD, TiRDD, TiRowRDD} import org.apache.spark.sql.types.StructType import org.apache.spark.sql.{DataFrame, SQLContext, SaveMode} @@ -45,13 +45,13 @@ case class TiDBRelation(session: TiSession, override def sizeInBytes: Long = tableRef.sizeInBytes - def logicalPlanToRDD(dagRequest: TiDAGRequest): List[TiRDD] = { + def logicalPlanToRDD(dagRequest: TiDAGRequest): List[TiRowRDD] = { import scala.collection.JavaConverters._ val ids = dagRequest.getIds.asScala - var tiRDDs = new ListBuffer[TiRDD] + var tiRDDs = new ListBuffer[TiRowRDD] ids.foreach( id => { - tiRDDs += new TiRDD( + tiRDDs += new TiRowRDD( dagRequest, id, session.getConf, @@ -65,7 +65,6 @@ case class TiDBRelation(session: TiSession, } def dagRequestToRegionTaskExec(dagRequest: TiDAGRequest, output: Seq[Attribute]): SparkPlan = { - val timestamp = dagRequest.getStartTs import scala.collection.JavaConverters._ val ids = dagRequest.getIds.asScala var tiHandleRDDs = new ListBuffer[TiHandleRDD]() @@ -77,7 +76,6 @@ case class TiDBRelation(session: TiSession, id, session.getConf, tableRef, - timestamp, session, 
sqlContext.sparkSession ) diff --git a/core/src/main/scala/com/pingcap/tispark/statistics/StatisticsHelper.scala b/core/src/main/scala/com/pingcap/tispark/statistics/StatisticsHelper.scala index f1572c3d5d..faa5d54c03 100644 --- a/core/src/main/scala/com/pingcap/tispark/statistics/StatisticsHelper.scala +++ b/core/src/main/scala/com/pingcap/tispark/statistics/StatisticsHelper.scala @@ -212,38 +212,36 @@ object StatisticsHelper { } } - private[statistics] def buildHistogramsRequest(histTable: TiTableInfo, - targetTblId: Long, - startTs: TiTimestamp): TiDAGRequest = + private def checkColExists(table: TiTableInfo, column: String): Boolean = + table.getColumns.exists { _.matchName(column) } + + private def buildRequest(tableInfo: TiTableInfo, + requiredCols: Seq[String], + targetTblId: Long, + startTs: TiTimestamp): TiDAGRequest = { TiDAGRequest.Builder .newBuilder() - .setFullTableScan(histTable) + .setFullTableScan(tableInfo) .addFilter( ComparisonBinaryExpression .equal(ColumnRef.create("table_id"), Constant.create(targetTblId)) ) .addRequiredCols( - histRequiredCols.filter(checkColExists(histTable, _)) + requiredCols.filter(checkColExists(tableInfo, _)) ) .setStartTs(startTs) .build(PushDownType.NORMAL) + } - private def checkColExists(table: TiTableInfo, column: String): Boolean = - table.getColumns.exists { _.matchName(column) } + private[statistics] def buildHistogramsRequest(histTable: TiTableInfo, + targetTblId: Long, + startTs: TiTimestamp): TiDAGRequest = + buildRequest(histTable, histRequiredCols, targetTblId, startTs) private[statistics] def buildMetaRequest(metaTable: TiTableInfo, targetTblId: Long, startTs: TiTimestamp): TiDAGRequest = - TiDAGRequest.Builder - .newBuilder() - .setFullTableScan(metaTable) - .addFilter( - ComparisonBinaryExpression - .equal(ColumnRef.create("table_id"), Constant.create(targetTblId)) - ) - .addRequiredCols(metaRequiredCols.filter(checkColExists(metaTable, _))) - .setStartTs(startTs) - .build(PushDownType.NORMAL) + buildRequest(metaTable, metaRequiredCols, targetTblId, startTs) private[statistics] def buildBucketRequest(bucketTable: TiTableInfo, targetTblId: Long, diff --git a/core/src/main/scala/org/apache/spark/sql/execution/CoprocessorRDD.scala b/core/src/main/scala/org/apache/spark/sql/execution/CoprocessorRDD.scala index e5074084b0..a709f5b402 100644 --- a/core/src/main/scala/org/apache/spark/sql/execution/CoprocessorRDD.scala +++ b/core/src/main/scala/org/apache/spark/sql/execution/CoprocessorRDD.scala @@ -36,7 +36,7 @@ import org.apache.spark.sql.catalyst.expressions.{Attribute, GenericInternalRow, import org.apache.spark.sql.catalyst.plans.physical.{Partitioning, UnknownPartitioning} import org.apache.spark.sql.catalyst.{CatalystTypeConverters, InternalRow} import org.apache.spark.sql.execution.metric.{SQLMetric, SQLMetrics} -import org.apache.spark.sql.tispark.{TiHandleRDD, TiRDD} +import org.apache.spark.sql.tispark.{TiHandleRDD, TiRDD, TiRowRDD} import org.apache.spark.sql.types.{ArrayType, DataType, LongType, Metadata} import org.apache.spark.sql.{Row, SparkSession} import org.tikv.kvproto.Coprocessor.KeyRange @@ -44,21 +44,15 @@ import org.tikv.kvproto.Coprocessor.KeyRange import scala.collection.JavaConversions._ import scala.collection.mutable -case class CoprocessorRDD(output: Seq[Attribute], tiRDDs: List[TiRDD]) extends LeafExecNode { - - override lazy val metrics: Map[String, SQLMetric] = Map( - "numOutputRows" -> SQLMetrics.createMetric(sparkContext, "number of output rows") - ) - - override val nodeName: String = 
"CoprocessorRDD" +trait LeafExecRDD extends LeafExecNode { override val outputPartitioning: Partitioning = UnknownPartitioning(0) - override val outputOrdering: Seq[SortOrder] = Nil + private[execution] val tiRDDs: List[TiRDD] - private val internalRDDs: List[RDD[InternalRow]] = + private[execution] val internalRDDs: List[RDD[InternalRow]] = tiRDDs.map(rdd => RDDConversions.rowToRowRdd(rdd, output.map(_.dataType))) - private lazy val project = UnsafeProjection.create(schema) + private[execution] lazy val project = UnsafeProjection.create(schema) - private def internalRowToUnsafeRowWithIndex( + private[execution] def internalRowToUnsafeRowWithIndex( numOutputRows: SQLMetric ): (Int, Iterator[InternalRow]) => Iterator[UnsafeRow] = (index, iter) => { @@ -69,26 +63,11 @@ case class CoprocessorRDD(output: Seq[Attribute], tiRDDs: List[TiRDD]) extends L } } - protected override def doExecute(): RDD[InternalRow] = { - val numOutputRows = longMetric("numOutputRows") - - internalRDDs - .map( - rdd => - ReflectionMapPartitionWithIndexInternal( - rdd, - internalRowToUnsafeRowWithIndex(numOutputRows) - ).invoke() - ) - .reduce(_ union _) - - } - def dagRequest: TiDAGRequest = tiRDDs.head.dagRequest override def verboseString: String = if (tiRDDs.size > 1) { - val b = new StringBuilder + val b = new mutable.StringBuilder() b.append(s"TiSpark $nodeName on partition table:\n") tiRDDs.zipWithIndex.map { case (_, i) => b.append(s"partition p$i") @@ -96,44 +75,51 @@ case class CoprocessorRDD(output: Seq[Attribute], tiRDDs: List[TiRDD]) extends L b.append(s"with dag request: $dagRequest") b.toString() } else { - s"TiSpark $nodeName{$dagRequest}" + + s"TiDB $nodeName{$dagRequest}" + s"${TiUtil.getReqEstCountStr(dagRequest)}" - } override def simpleString: String = verboseString } +case class CoprocessorRDD(output: Seq[Attribute], tiRDDs: List[TiRowRDD]) extends LeafExecRDD { + + override lazy val metrics: Map[String, SQLMetric] = Map( + "numOutputRows" -> SQLMetrics.createMetric(sparkContext, "number of output rows") + ) + + override val nodeName: String = "CoprocessorRDD" + + override protected def doExecute(): RDD[InternalRow] = { + val numOutputRows = longMetric("numOutputRows") + + internalRDDs + .map( + rdd => + ReflectionMapPartitionWithIndexInternal( + rdd, + internalRowToUnsafeRowWithIndex(numOutputRows) + ).invoke() + ) + .reduce(_ union _) + } + + override def simpleString: String = verboseString +} + /** * HandleRDDExec is used for scanning handles from TiKV as a LeafExecNode in index plan. * Providing handle scan via a TiHandleRDD. 
* - * @param tiHandleRDDs handle source + * @param tiRDDs handle source */ -case class HandleRDDExec(tiHandleRDDs: List[TiHandleRDD]) extends LeafExecNode { +case class HandleRDDExec(tiRDDs: List[TiHandleRDD]) extends LeafExecRDD { override val nodeName: String = "HandleRDD" override lazy val metrics: Map[String, SQLMetric] = Map( "numOutputRegions" -> SQLMetrics.createMetric(sparkContext, "number of regions") ) - override val outputPartitioning: Partitioning = UnknownPartitioning(0) - - private val internalRDDs: List[RDD[InternalRow]] = - tiHandleRDDs.map(rdd => RDDConversions.rowToRowRdd(rdd, output.map(_.dataType))) - private lazy val project = UnsafeProjection.create(schema) - - private def internalRowToUnsafeRowWithIndex( - numOutputRegions: SQLMetric - ): (Int, Iterator[InternalRow]) => Iterator[UnsafeRow] = - (index, iter) => { - project.initialize(index) - iter.map { r => - numOutputRegions += 1 - project(r) - } - } - override protected def doExecute(): RDD[InternalRow] = { val numOutputRegions = longMetric("numOutputRegions") @@ -159,24 +145,6 @@ case class HandleRDDExec(tiHandleRDDs: List[TiHandleRDD]) extends LeafExecNode { ) override def output: Seq[Attribute] = attributeRef - - def dagRequest: TiDAGRequest = tiHandleRDDs.head.dagRequest - - override def verboseString: String = - if (tiHandleRDDs.size > 1) { - val b = new mutable.StringBuilder() - b.append(s"TiSpark $nodeName on partition table:\n") - tiHandleRDDs.zipWithIndex.map { - case (_, i) => b.append(s"partition p$i") - } - b.append(s"with dag request: $dagRequest") - b.toString() - } else { - s"TiDB $nodeName{$dagRequest}" + - s"${TiUtil.getReqEstCountStr(dagRequest)}" - } - - override def simpleString: String = verboseString } /** diff --git a/core/src/main/scala/org/apache/spark/sql/tispark/TiHandleRDD.scala b/core/src/main/scala/org/apache/spark/sql/tispark/TiHandleRDD.scala index baddbd3c20..8f40f955b3 100644 --- a/core/src/main/scala/org/apache/spark/sql/tispark/TiHandleRDD.scala +++ b/core/src/main/scala/org/apache/spark/sql/tispark/TiHandleRDD.scala @@ -39,21 +39,20 @@ import scala.collection.mutable.ListBuffer * is a list of primitive long which represents the handles lie in that region. 
* */ -class TiHandleRDD(val dagRequest: TiDAGRequest, - val physicalId: Long, - val tiConf: TiConfiguration, - val tableRef: TiTableReference, - val ts: TiTimestamp, +class TiHandleRDD(override val dagRequest: TiDAGRequest, + override val physicalId: Long, + override val tiConf: TiConfiguration, + override val tableRef: TiTableReference, @transient private val session: TiSession, @transient private val sparkSession: SparkSession) - extends RDD[Row](sparkSession.sparkContext, Nil) { + extends TiRDD(dagRequest, physicalId, tiConf, tableRef, session, sparkSession) { override def compute(split: Partition, context: TaskContext): Iterator[Row] = new Iterator[Row] { dagRequest.resolve() private val tiPartition = split.asInstanceOf[TiPartition] private val session = TiSession.getInstance(tiConf) - private val snapshot = session.createSnapshot(ts) + private val snapshot = session.createSnapshot(dagRequest.getStartTs) private[this] val tasks = tiPartition.tasks private val handleIterator = snapshot.indexHandleRead(dagRequest, tasks) @@ -96,29 +95,4 @@ class TiHandleRDD(val dagRequest: TiDAGRequest, Row.apply(regionId, handleList.toArray()) } } - - override protected def getPartitions: Array[Partition] = { - val keyWithRegionTasks = RangeSplitter - .newSplitter(session.getRegionManager) - .splitRangeByRegion(dagRequest.getRangesByPhysicalId(physicalId)) - - val hostTasksMap = new mutable.HashMap[String, mutable.Set[RegionTask]] - with mutable.MultiMap[String, RegionTask] - - var index = 0 - val result = new ListBuffer[TiPartition] - for (task <- keyWithRegionTasks) { - hostTasksMap.addBinding(task.getHost, task) - val tasks = hostTasksMap(task.getHost) - result.append(new TiPartition(index, tasks.toSeq, sparkContext.applicationId)) - index += 1 - hostTasksMap.remove(task.getHost) - } - // add rest - for (tasks <- hostTasksMap.values) { - result.append(new TiPartition(index, tasks.toSeq, sparkContext.applicationId)) - index += 1 - } - result.toArray - } } diff --git a/core/src/main/scala/org/apache/spark/sql/tispark/TiRDD.scala b/core/src/main/scala/org/apache/spark/sql/tispark/TiRDD.scala index a5b4ab769a..92b8322f23 100644 --- a/core/src/main/scala/org/apache/spark/sql/tispark/TiRDD.scala +++ b/core/src/main/scala/org/apache/spark/sql/tispark/TiRDD.scala @@ -33,57 +33,14 @@ import scala.collection.JavaConversions._ import scala.collection.mutable import scala.collection.mutable.ListBuffer -class TiRDD(val dagRequest: TiDAGRequest, - val physicalId: Long, - val tiConf: TiConfiguration, - val tableRef: TiTableReference, - @transient private val session: TiSession, - @transient private val sparkSession: SparkSession) +abstract class TiRDD(val dagRequest: TiDAGRequest, + val physicalId: Long, + val tiConf: TiConfiguration, + val tableRef: TiTableReference, + @transient private val session: TiSession, + @transient private val sparkSession: SparkSession) extends RDD[Row](sparkSession.sparkContext, Nil) { - type TiRow = com.pingcap.tikv.row.Row - - @transient lazy val (_: List[DataType], rowTransformer: RowTransformer) = - initializeSchema() - - def initializeSchema(): (List[DataType], RowTransformer) = { - val schemaInferrer: SchemaInfer = SchemaInfer.create(dagRequest) - val rowTransformer: RowTransformer = schemaInferrer.getRowTransformer - (schemaInferrer.getTypes.toList, rowTransformer) - } - - // cache invalidation call back function - // used for driver to update PD cache - private val callBackFunc = CacheInvalidateListener.getInstance() - - override def compute(split: Partition, context: 
TaskContext): Iterator[Row] = new Iterator[Row] { - dagRequest.resolve() - - // bypass, sum return a long type - private val tiPartition = split.asInstanceOf[TiPartition] - private val session = TiSession.getInstance(tiConf) - session.injectCallBackFunc(callBackFunc) - private val snapshot = session.createSnapshot(dagRequest.getStartTs) - private[this] val tasks = tiPartition.tasks - - private val iterator = snapshot.tableRead(dagRequest, tasks) - - override def hasNext: Boolean = { - // Kill the task in case it has been marked as killed. This logic is from - // Interrupted Iterator, but we inline it here instead of wrapping the iterator in order - // to avoid performance overhead. - if (context.isInterrupted()) { - throw new TaskKilledException - } - iterator.hasNext - } - - override def next(): Row = TiConverter.toSparkRow(iterator.next, rowTransformer) - } - - override protected def getPreferredLocations(split: Partition): Seq[String] = - split.asInstanceOf[TiPartition].tasks.head.getHost :: Nil - override protected def getPartitions: Array[Partition] = { val keyWithRegionTasks = RangeSplitter .newSplitter(session.getRegionManager) diff --git a/core/src/main/scala/org/apache/spark/sql/tispark/TiRowRDD.scala b/core/src/main/scala/org/apache/spark/sql/tispark/TiRowRDD.scala new file mode 100644 index 0000000000..c5e13b7f2f --- /dev/null +++ b/core/src/main/scala/org/apache/spark/sql/tispark/TiRowRDD.scala @@ -0,0 +1,86 @@ +/* + * Copyright 2017 PingCAP, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark.sql.tispark + +import com.pingcap.tikv._ +import com.pingcap.tikv.meta.TiDAGRequest +import com.pingcap.tikv.operation.SchemaInfer +import com.pingcap.tikv.operation.transformer.RowTransformer +import com.pingcap.tikv.types.DataType +import com.pingcap.tikv.util.RangeSplitter +import com.pingcap.tikv.util.RangeSplitter.RegionTask +import com.pingcap.tispark.listener.CacheInvalidateListener +import com.pingcap.tispark.utils.TiConverter +import com.pingcap.tispark.{TiPartition, TiTableReference} +import org.apache.spark.rdd.RDD +import org.apache.spark.sql.{Row, SparkSession} +import org.apache.spark.{Partition, TaskContext, TaskKilledException} + +import scala.collection.JavaConversions._ +import scala.collection.mutable +import scala.collection.mutable.ListBuffer + +class TiRowRDD(override val dagRequest: TiDAGRequest, + override val physicalId: Long, + override val tiConf: TiConfiguration, + override val tableRef: TiTableReference, + @transient private val session: TiSession, + @transient private val sparkSession: SparkSession) + extends TiRDD(dagRequest, physicalId, tiConf, tableRef, session, sparkSession) { + + type TiRow = com.pingcap.tikv.row.Row + + @transient lazy val (_: List[DataType], rowTransformer: RowTransformer) = + initializeSchema() + + def initializeSchema(): (List[DataType], RowTransformer) = { + val schemaInferrer: SchemaInfer = SchemaInfer.create(dagRequest) + val rowTransformer: RowTransformer = schemaInferrer.getRowTransformer + (schemaInferrer.getTypes.toList, rowTransformer) + } + + // cache invalidation call back function + // used for driver to update PD cache + private val callBackFunc = CacheInvalidateListener.getInstance() + + override def compute(split: Partition, context: TaskContext): Iterator[Row] = new Iterator[Row] { + dagRequest.resolve() + + // bypass, sum return a long type + private val tiPartition = split.asInstanceOf[TiPartition] + private val session = TiSession.getInstance(tiConf) + session.injectCallBackFunc(callBackFunc) + private val snapshot = session.createSnapshot(dagRequest.getStartTs) + private[this] val tasks = tiPartition.tasks + + private val iterator = snapshot.tableRead(dagRequest, tasks) + + override def hasNext: Boolean = { + // Kill the task in case it has been marked as killed. This logic is from + // Interrupted Iterator, but we inline it here instead of wrapping the iterator in order + // to avoid performance overhead. 
+ if (context.isInterrupted()) { + throw new TaskKilledException + } + iterator.hasNext + } + + override def next(): Row = TiConverter.toSparkRow(iterator.next, rowTransformer) + } + + override protected def getPreferredLocations(split: Partition): Seq[String] = + split.asInstanceOf[TiPartition].tasks.head.getHost :: Nil +} diff --git a/tikv-client/src/main/java/com/pingcap/tikv/statistics/Bucket.java b/tikv-client/src/main/java/com/pingcap/tikv/statistics/Bucket.java index 9448e1b86d..06ba7dc528 100644 --- a/tikv-client/src/main/java/com/pingcap/tikv/statistics/Bucket.java +++ b/tikv-client/src/main/java/com/pingcap/tikv/statistics/Bucket.java @@ -56,7 +56,6 @@ public Bucket(Key upperBound) { } @Override - @SuppressWarnings("unchecked") public int compareTo(Bucket b) { return upperBound.compareTo(b.upperBound); } diff --git a/tikv-client/src/main/java/com/pingcap/tikv/types/AbstractDateTimeType.java b/tikv-client/src/main/java/com/pingcap/tikv/types/AbstractDateTimeType.java index f6b00a3774..1dbc16db68 100644 --- a/tikv-client/src/main/java/com/pingcap/tikv/types/AbstractDateTimeType.java +++ b/tikv-client/src/main/java/com/pingcap/tikv/types/AbstractDateTimeType.java @@ -7,6 +7,7 @@ import com.pingcap.tikv.codec.Codec.DateTimeCodec; import com.pingcap.tikv.codec.CodecDataInput; import com.pingcap.tikv.codec.CodecDataOutput; +import com.pingcap.tikv.exception.ConvertNotSupportException; import com.pingcap.tikv.exception.InvalidCodecFormatException; import com.pingcap.tikv.meta.TiColumnInfo.InternalTypeHolder; import org.joda.time.DateTimeZone; @@ -79,4 +80,18 @@ protected void encodeProto(CodecDataOutput cdo, Object value) { public ExprType getProtoExprType() { return ExprType.MysqlTime; } + + java.sql.Timestamp convertToMysqlDateTime(Object value) throws ConvertNotSupportException { + java.sql.Timestamp result; + if (value instanceof String) { + result = java.sql.Timestamp.valueOf((String) value); + } else if (value instanceof java.sql.Date) { + result = new java.sql.Timestamp(((java.sql.Date) value).getTime()); + } else if (value instanceof java.sql.Timestamp) { + result = (java.sql.Timestamp) value; + } else { + throw new ConvertNotSupportException(value.getClass().getName(), this.getClass().getName()); + } + return result; + } } diff --git a/tikv-client/src/main/java/com/pingcap/tikv/types/DateTimeType.java b/tikv-client/src/main/java/com/pingcap/tikv/types/DateTimeType.java index 455a034806..5358067bbe 100644 --- a/tikv-client/src/main/java/com/pingcap/tikv/types/DateTimeType.java +++ b/tikv-client/src/main/java/com/pingcap/tikv/types/DateTimeType.java @@ -53,21 +53,6 @@ protected Object doConvertToTiDBType(Object value) return convertToMysqlDateTime(value); } - private java.sql.Timestamp convertToMysqlDateTime(Object value) - throws ConvertNotSupportException { - java.sql.Timestamp result; - if (value instanceof String) { - result = java.sql.Timestamp.valueOf((String) value); - } else if (value instanceof java.sql.Date) { - result = new java.sql.Timestamp(((java.sql.Date) value).getTime()); - } else if (value instanceof java.sql.Timestamp) { - result = (java.sql.Timestamp) value; - } else { - throw new ConvertNotSupportException(value.getClass().getName(), this.getClass().getName()); - } - return result; - } - /** * Decode timestamp from packed long value In TiDB / MySQL, timestamp type is converted to UTC and * stored diff --git a/tikv-client/src/main/java/com/pingcap/tikv/types/TimestampType.java b/tikv-client/src/main/java/com/pingcap/tikv/types/TimestampType.java index 
9dc78e65b0..6fb0d82a7c 100644 --- a/tikv-client/src/main/java/com/pingcap/tikv/types/TimestampType.java +++ b/tikv-client/src/main/java/com/pingcap/tikv/types/TimestampType.java @@ -72,16 +72,9 @@ private java.sql.Timestamp convertToMysqlLocalTimestamp(Object value) if (value instanceof Long) { throw new ConvertNotSupportException(value.getClass().getName(), this.getClass().getName()); // result = new java.sql.Timestamp((Long) value); - } else if (value instanceof String) { - result = java.sql.Timestamp.valueOf((String) value); - } else if (value instanceof java.sql.Date) { - result = new java.sql.Timestamp(((java.sql.Date) value).getTime()); - } else if (value instanceof java.sql.Timestamp) { - result = (java.sql.Timestamp) value; } else { - throw new ConvertNotSupportException(value.getClass().getName(), this.getClass().getName()); + return convertToMysqlDateTime(value); } - return result; } /** From af92ced25f4585f30bf95ee3af5a6af1c45fccc8 Mon Sep 17 00:00:00 2001 From: Zhexuan Yang Date: Tue, 13 Aug 2019 16:50:59 +0800 Subject: [PATCH 34/62] prohibit agg or groupby pushdown on double read (#1004) --- .../com/pingcap/tikv/meta/TiDAGRequest.java | 107 +++++++++++------- 1 file changed, 64 insertions(+), 43 deletions(-) diff --git a/tikv-client/src/main/java/com/pingcap/tikv/meta/TiDAGRequest.java b/tikv-client/src/main/java/com/pingcap/tikv/meta/TiDAGRequest.java index 31f0a777fd..4dfb5045d5 100644 --- a/tikv-client/src/main/java/com/pingcap/tikv/meta/TiDAGRequest.java +++ b/tikv-client/src/main/java/com/pingcap/tikv/meta/TiDAGRequest.java @@ -25,6 +25,7 @@ import com.google.common.collect.ImmutableList; import com.google.common.collect.ImmutableMap; import com.pingcap.tidb.tipb.*; +import com.pingcap.tidb.tipb.DAGRequest.Builder; import com.pingcap.tikv.codec.KeyUtils; import com.pingcap.tikv.exception.DAGRequestException; import com.pingcap.tikv.exception.TiClientInternalException; @@ -462,23 +463,9 @@ else if (col.getColumnInfo().isPrimaryKey() && tableInfo.isPkHandle()) { } if (!getGroupByItems().isEmpty() || !getAggregates().isEmpty()) { + // only allow table scan or covering index scan push down groupby and agg if (!isIndexDoubleScan || (isGroupByCoveredByIndex() && isAggregateCoveredByIndex())) { - Aggregation.Builder aggregationBuilder = Aggregation.newBuilder(); - getGroupByItems() - .forEach( - tiByItem -> - aggregationBuilder.addGroupBy( - ProtoConverter.toProto(tiByItem.getExpr(), colOffsetInFieldMap))); - getAggregates() - .forEach( - tiExpr -> - aggregationBuilder.addAggFunc( - ProtoConverter.toProto(tiExpr, colOffsetInFieldMap))); - executorBuilder.setTp(ExecType.TypeAggregation); - dagRequestBuilder.addExecutors(executorBuilder.setAggregation(aggregationBuilder)); - executorBuilder.clear(); - addPushDownGroupBys(); - addPushDownAggregates(); + pushDownAggAndGroupBy(dagRequestBuilder, executorBuilder, colOffsetInFieldMap); } else { return dagRequestBuilder; } @@ -486,35 +473,68 @@ else if (col.getColumnInfo().isPrimaryKey() && tableInfo.isPkHandle()) { if (!getOrderByItems().isEmpty()) { if (!isIndexDoubleScan || isOrderByCoveredByIndex()) { - TopN.Builder topNBuilder = TopN.newBuilder(); - getOrderByItems() - .forEach( - tiByItem -> - topNBuilder.addOrderBy( - com.pingcap.tidb.tipb.ByItem.newBuilder() - .setExpr( - ProtoConverter.toProto(tiByItem.getExpr(), colOffsetInFieldMap)) - .setDesc(tiByItem.isDesc()))); - executorBuilder.setTp(ExecType.TypeTopN); - topNBuilder.setLimit(getLimit()); - dagRequestBuilder.addExecutors(executorBuilder.setTopN(topNBuilder)); - 
executorBuilder.clear(); - addPushDownOrderBys(); + // only allow table scan or covering index scan push down orderby + pushDownOrderBy(dagRequestBuilder, executorBuilder, colOffsetInFieldMap); } } else if (getLimit() != 0) { if (!isIndexDoubleScan) { - Limit.Builder limitBuilder = Limit.newBuilder(); - limitBuilder.setLimit(getLimit()); - executorBuilder.setTp(ExecType.TypeLimit); - dagRequestBuilder.addExecutors(executorBuilder.setLimit(limitBuilder)); - executorBuilder.clear(); - addPushDownLimits(); + pushDownLimit(dagRequestBuilder, executorBuilder); } } return dagRequestBuilder; } + private void pushDownLimit( + DAGRequest.Builder dagRequestBuilder, Executor.Builder executorBuilder) { + Limit.Builder limitBuilder = Limit.newBuilder(); + limitBuilder.setLimit(getLimit()); + executorBuilder.setTp(ExecType.TypeLimit); + dagRequestBuilder.addExecutors(executorBuilder.setLimit(limitBuilder)); + executorBuilder.clear(); + addPushDownLimits(); + } + + private void pushDownOrderBy( + DAGRequest.Builder dagRequestBuilder, + Executor.Builder executorBuilder, + Map colOffsetInFieldMap) { + TopN.Builder topNBuilder = TopN.newBuilder(); + getOrderByItems() + .forEach( + tiByItem -> + topNBuilder.addOrderBy( + com.pingcap.tidb.tipb.ByItem.newBuilder() + .setExpr(ProtoConverter.toProto(tiByItem.getExpr(), colOffsetInFieldMap)) + .setDesc(tiByItem.isDesc()))); + executorBuilder.setTp(ExecType.TypeTopN); + topNBuilder.setLimit(getLimit()); + dagRequestBuilder.addExecutors(executorBuilder.setTopN(topNBuilder)); + executorBuilder.clear(); + addPushDownOrderBys(); + } + + private void pushDownAggAndGroupBy( + DAGRequest.Builder dagRequestBuilder, + Executor.Builder executorBuilder, + Map colOffsetInFieldMap) { + Aggregation.Builder aggregationBuilder = Aggregation.newBuilder(); + getGroupByItems() + .forEach( + tiByItem -> + aggregationBuilder.addGroupBy( + ProtoConverter.toProto(tiByItem.getExpr(), colOffsetInFieldMap))); + getAggregates() + .forEach( + tiExpr -> + aggregationBuilder.addAggFunc(ProtoConverter.toProto(tiExpr, colOffsetInFieldMap))); + executorBuilder.setTp(ExecType.TypeAggregation); + dagRequestBuilder.addExecutors(executorBuilder.setAggregation(aggregationBuilder)); + executorBuilder.clear(); + addPushDownGroupBys(); + addPushDownAggregates(); + } + private boolean isExpressionCoveredByIndex(Expression expr) { Set indexColumnRefSet = indexInfo @@ -523,10 +543,11 @@ private boolean isExpressionCoveredByIndex(Expression expr) { .filter(x -> !x.isPrefixIndex()) .map(TiIndexColumn::getName) .collect(Collectors.toSet()); - return PredicateUtils.extractColumnRefFromExpression(expr) - .stream() - .map(ColumnRef::getName) - .allMatch(indexColumnRefSet::contains); + return !isDoubleRead() + && PredicateUtils.extractColumnRefFromExpression(expr) + .stream() + .map(ColumnRef::getName) + .allMatch(indexColumnRefSet::contains); } private boolean isGroupByCoveredByIndex() { @@ -648,7 +669,7 @@ int getTimeZoneOffset() { * @param mode truncate mode * @return a TiDAGRequest */ - public TiDAGRequest setTruncateMode(TiDAGRequest.TruncateMode mode) { + TiDAGRequest setTruncateMode(TiDAGRequest.TruncateMode mode) { flags = requireNonNull(mode, "mode is null").mask(flags); return this; } @@ -700,7 +721,7 @@ public TiDAGRequest addAggregate(Expression expr, DataType targetType) { return this; } - public List getAggregates() { + List getAggregates() { return aggregates.stream().map(p -> p.first).collect(Collectors.toList()); } @@ -909,7 +930,7 @@ public void setIsDoubleRead(boolean isDoubleRead) { * * @return 
boolean */ - public boolean isCoveringIndexScan() { + private boolean isCoveringIndexScan() { return hasIndex() && !isDoubleRead(); } From 56c344132c49deed9d1423144be4758372ee9018 Mon Sep 17 00:00:00 2001 From: Zhexuan Yang Date: Tue, 13 Aug 2019 17:09:42 +0800 Subject: [PATCH 35/62] remove split region code (#1015) --- .../tikv/region/RegionStoreClient.java | 40 ------------------- 1 file changed, 40 deletions(-) diff --git a/tikv-client/src/main/java/com/pingcap/tikv/region/RegionStoreClient.java b/tikv-client/src/main/java/com/pingcap/tikv/region/RegionStoreClient.java index 272482fec8..9f17955679 100644 --- a/tikv-client/src/main/java/com/pingcap/tikv/region/RegionStoreClient.java +++ b/tikv-client/src/main/java/com/pingcap/tikv/region/RegionStoreClient.java @@ -767,44 +767,4 @@ public void onStoreNotMatch(Store store) { + addressStr); } } - - /** - * Send SplitRegion request to tikv split a region at splitKey. splitKey must between current - * region's start key and end key. - * - * @param splitKey is the split point for a specific region. - * @return a split region info. - */ - public TiRegion splitRegion(ByteString splitKey) { - Supplier request = - () -> - SplitRegionRequest.newBuilder() - .setContext( - Context.newBuilder() - .setRegionId(region.getId()) - .setRegionEpoch(region.getRegionEpoch()) - .setPeer(region.getLeader()) - .build()) - .setSplitKey(splitKey) - .build(); - - KVErrorHandler handler = - new KVErrorHandler<>( - regionManager, - this, - region, - resp -> resp.hasRegionError() ? resp.getRegionError() : null); - - SplitRegionResponse resp = - callWithRetry( - ConcreteBackOffer.newGetBackOff(), TikvGrpc.METHOD_SPLIT_REGION, request, handler); - if (resp.hasRegionError()) { - throw new TiClientInternalException( - String.format( - "failed to split region %d at key %s because %s", - region.getId(), splitKey.toString(), resp.getRegionError().toString())); - } - - return new TiRegion(resp.getLeft(), null, conf.getIsolationLevel(), conf.getCommandPriority()); - } } From ab9aeea1bc1c10c37042f9929b7f62a34f186072 Mon Sep 17 00:00:00 2001 From: Liangliang Gu Date: Tue, 13 Aug 2019 17:19:38 +0800 Subject: [PATCH 36/62] add supported scala version (#1013) --- README.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/README.md b/README.md index a09896c122..9392d8c425 100755 --- a/README.md +++ b/README.md @@ -85,6 +85,9 @@ While TiSpark provides downward compatibility for TiDB, it guarantees **restrict ## How to migrate from Spark 2.1 to Spark 2.3/2.4 For users using Spark 2.1 who wish to migrate to latest TiSpark on Spark 2.3/2.4, please download or install Spark 2.3+/2.4+ following instructions on [Apache Spark Site](http://spark.apache.org/downloads.html) and overwrite the old spark version in `$SPARK_HOME`. +## Scala Version +TiSpark currently only supports `scala-2.11`. 
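Editor's note: since the README line above only states the requirement, a quick way to confirm that an existing Spark distribution was built for Scala 2.11 before deploying the TiSpark jar is shown below. This is an editorial example, not text from the patch; it assumes you can open a `spark-shell` against the target installation.

```
// Run inside spark-shell on the target cluster; prints something like
// "version 2.11.12", confirming the distribution is a Scala 2.11 build.
scala.util.Properties.versionString
```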
+ ## TiSpark Architecture ![architecture](./docs/architecture.png) From a42bc883ef88073929e15a2cc33748c266435772 Mon Sep 17 00:00:00 2001 From: birdstorm Date: Tue, 13 Aug 2019 17:49:07 +0800 Subject: [PATCH 37/62] Fix scala compiler version (#1010) --- core/pom.xml | 2 +- pom.xml | 2 +- spark-wrapper/spark-2.3/pom.xml | 2 +- spark-wrapper/spark-2.4/pom.xml | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/core/pom.xml b/core/pom.xml index d384f2d06c..ab8b711daa 100644 --- a/core/pom.xml +++ b/core/pom.xml @@ -261,7 +261,7 @@ org.antipathy - mvn-scalafmt_${scala.version} + mvn-scalafmt_${scala.binary.version} 0.10_1.5.1 diff --git a/pom.xml b/pom.xml index b113c1d930..66ea97bbde 100644 --- a/pom.xml +++ b/pom.xml @@ -67,7 +67,7 @@ 2.3.3 2.4.3 2.11 - 2.11 + 2.11.12 3.0.4 -Dfile.encoding=UTF-8 -Duser.timezone=GMT+8 5.1.44 diff --git a/spark-wrapper/spark-2.3/pom.xml b/spark-wrapper/spark-2.3/pom.xml index be9401c59e..fa12d08fe5 100644 --- a/spark-wrapper/spark-2.3/pom.xml +++ b/spark-wrapper/spark-2.3/pom.xml @@ -101,7 +101,7 @@ org.antipathy - mvn-scalafmt_${scala.version} + mvn-scalafmt_${scala.binary.version} 0.10_1.5.1 diff --git a/spark-wrapper/spark-2.4/pom.xml b/spark-wrapper/spark-2.4/pom.xml index e3dc0830b7..47748fc058 100644 --- a/spark-wrapper/spark-2.4/pom.xml +++ b/spark-wrapper/spark-2.4/pom.xml @@ -101,7 +101,7 @@ org.antipathy - mvn-scalafmt_${scala.version} + mvn-scalafmt_${scala.binary.version} 0.10_1.5.1 From 4f8a6d39b7b5d2d121615e97dc8e04944ced63ad Mon Sep 17 00:00:00 2001 From: Liangliang Gu Date: Wed, 14 Aug 2019 14:51:25 +0800 Subject: [PATCH 38/62] fix reflection bug for hdp release (#1017) (#1018) (cherry picked from commit 118b12ea6051fa1c21db57f1eb30dc21cd420125) --- .../tispark/utils/ReflectionUtil.scala | 64 +++++++++++++------ 1 file changed, 45 insertions(+), 19 deletions(-) diff --git a/core/src/main/scala/com/pingcap/tispark/utils/ReflectionUtil.scala b/core/src/main/scala/com/pingcap/tispark/utils/ReflectionUtil.scala index 0a3a0dd6c6..9e4522cd57 100644 --- a/core/src/main/scala/com/pingcap/tispark/utils/ReflectionUtil.scala +++ b/core/src/main/scala/com/pingcap/tispark/utils/ReflectionUtil.scala @@ -59,33 +59,59 @@ object ReflectionUtil { // isOrderSensitive: Boolean = false): RDD[U] // // Hereby we use reflection to support different Spark versions. 
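Editor's note: the hunk that follows replaces the plain version switch with a "try one signature, fall back to the other" lookup, because vendor builds (e.g. HDP) may report a Spark version whose `mapPartitionsWithIndexInternal` signature does not match the official release. A stripped-down sketch of that fallback pattern, independent of the TiSpark classes, is given below; the names are illustrative only and this is not the patch's own code.

```
// Editorial sketch (not part of the patch) of the fallback lookup the next
// hunk introduces as tryLoadMethod: attempt the first reflective signature
// and, if it is absent, try the alternative before giving up.
import java.lang.reflect.Method

def loadEither(primary: () => Method, fallback: () => Method): Method =
  try primary()
  catch {
    case _: Throwable =>
      try fallback()
      catch {
        case _: Throwable =>
          throw new NoSuchMethodException("neither method signature is available")
      }
  }
```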
- private val mapPartitionsWithIndexInternal: Method = - TiSparkInfo.SPARK_VERSION match { - case "2.3.0" | "2.3.1" => - classOf[RDD[InternalRow]].getDeclaredMethod( - "mapPartitionsWithIndexInternal", - classOf[(Int, Iterator[InternalRow]) => Iterator[UnsafeRow]], - classOf[Boolean], - classOf[ClassTag[UnsafeRow]] - ) - case _ => - // Spark version >= 2.3.2 + private val mapPartitionsWithIndexInternal: Method = TiSparkInfo.SPARK_VERSION match { + case "2.3.0" | "2.3.1" => + tryLoadMethod( + "mapPartitionsWithIndexInternal", + mapPartitionsWithIndexInternalV1, + mapPartitionsWithIndexInternalV2 + ) + case _ => + // Spark version >= 2.3.2 + tryLoadMethod( + "mapPartitionsWithIndexInternal", + mapPartitionsWithIndexInternalV2, + mapPartitionsWithIndexInternalV1 + ) + } + + // Spark HDP Release may not compatible with official Release + // see https://github.com/pingcap/tispark/issues/1006 + private def tryLoadMethod(name: String, f1: () => Method, f2: () => Method): Method = { + try { + f1.apply() + } catch { + case _: Throwable => try { - classOf[RDD[InternalRow]].getDeclaredMethod( - "mapPartitionsWithIndexInternal", - classOf[(Int, Iterator[InternalRow]) => Iterator[UnsafeRow]], - classOf[Boolean], - classOf[Boolean], - classOf[ClassTag[UnsafeRow]] - ) + f2.apply() } catch { case _: Throwable => throw ScalaReflectionException( - "Cannot find reflection of Method mapPartitionsWithIndexInternal, current Spark version is %s" + s"Cannot find reflection of Method $name, current Spark version is %s" .format(TiSparkInfo.SPARK_VERSION) ) } } + } + + // Spark-2.3.0 & Spark-2.3.1 + private def mapPartitionsWithIndexInternalV1(): Method = + classOf[RDD[InternalRow]].getDeclaredMethod( + "mapPartitionsWithIndexInternal", + classOf[(Int, Iterator[InternalRow]) => Iterator[UnsafeRow]], + classOf[Boolean], + classOf[ClassTag[UnsafeRow]] + ) + + // >= Spark-2.3.2 + private def mapPartitionsWithIndexInternalV2(): Method = + classOf[RDD[InternalRow]].getDeclaredMethod( + "mapPartitionsWithIndexInternal", + classOf[(Int, Iterator[InternalRow]) => Iterator[UnsafeRow]], + classOf[Boolean], + classOf[Boolean], + classOf[ClassTag[UnsafeRow]] + ) case class ReflectionMapPartitionWithIndexInternal( rdd: RDD[InternalRow], From f90f961bce4b0285b21fd49e3f2cf949fb033cd6 Mon Sep 17 00:00:00 2001 From: Liangliang Gu Date: Thu, 15 Aug 2019 17:52:01 +0800 Subject: [PATCH 39/62] check by grammarly (#1022) --- README.md | 31 ++++++----- docs/datasource_api_userguide.md | 16 +++--- docs/how_to_use_tidb_as_metastore_db.md | 18 +++---- docs/userguide.md | 50 +++++++++--------- docs/userguide_spark2.1.md | 68 ++++++++++++------------- 5 files changed, 90 insertions(+), 93 deletions(-) diff --git a/README.md b/README.md index 9392d8c425..65c77f1c6c 100755 --- a/README.md +++ b/README.md @@ -41,13 +41,13 @@ TiSpark now supports Spark 2.3.0+/2.4.0+. 
The previous version for Spark 2.1.0+ ``` git clone https://github.com/pingcap/tispark.git ``` -To build all TiSpark modules from sources, please run command under TiSpark root directory: +To build all TiSpark modules from sources, please run the command under TiSpark root directory: ``` mvn clean install -Dmaven.test.skip=true -P spark-2.3 or mvn clean install -Dmaven.test.skip=true -P spark-2.4 ``` -**Please note that after you need to specify major version of Spark according to the Spark version you are using.** +**Please note that you need to specify the major version of Spark according to the Spark version you are using.** Remember to add `-Dmaven.test.skip=true` to skip all the tests if you don't need to run them. @@ -62,7 +62,7 @@ Remember to add `-Dmaven.test.skip=true` to skip all the tests if you don't need ## Maximum TiDB/TiKV/PD version supported by TiSpark -Each latest TiSpark version guarantees *backward compatibility* for TiDB components, i.e., supports TiDB/TiKV/PD until a certain release. Its reason varies, amongst which the most common one is that the new features and bug-fixes provided by TiDB components requires update on API, dependencies, etc. +Each latest TiSpark version guarantees *backward compatibility* for TiDB components, i.e., supports TiDB/TiKV/PD until a certain release. Its reason varies, amongst which the most common one is that the new features and bug-fixes provided by TiDB components require an update on API, dependencies, etc. | TiSpark Version | Maximum TiDB Version | Maximum TiKV Version | Maximum PD Version | | ----- | ------ | ------ | ------ | @@ -93,7 +93,7 @@ TiSpark currently only supports `scala-2.11`. ![architecture](./docs/architecture.png) -- TiSpark integrates with Spark Catalyst Engine deeply. It provides precise control of the computing, which allows Spark read data from TiKV efficiently. It also supports index seek, which improves the performance of the point query execution significantly. +- TiSpark integrates with Spark Catalyst Engine deeply. It provides precise control of computing, which allows Spark to read data from TiKV efficiently. It also supports index seek, which improves the performance of the point query execution significantly. - It utilizes several strategies to push down the computing to reduce the size of dataset handling by Spark SQL, which accelerates the query execution. It also uses the TiDB built-in statistical information for the query plan optimization. @@ -101,7 +101,7 @@ TiSpark currently only supports `scala-2.11`. - In addition, you can deploy and utilize tools from the Spark ecosystem for further data processing and manipulation on TiDB. For example, using TiSpark for data analysis and ETL; retrieving data from TiKV as a machine learning data source; generating reports from the scheduling system and so on. -TiSpark depends on the existence of TiKV clusters and PDs. It also needs to setup and use Spark clustering platform. +TiSpark depends on the existence of TiKV clusters and PDs. It also needs to set up and use Spark clustering platform. A thin layer of TiSpark. Most of the logic is inside tikv-client library. https://github.com/pingcap/tispark/tree/master/tikv-client @@ -137,7 +137,7 @@ spark.sql("select count(*) from lineitem").show spark.sql("select ti_version()").show ``` -## TiDB Data Source API +## TiDB Data Source API When using the TiDB Data Source API, please follow the document for [TiDB Data Source API User Guide](./docs/datasource_api_userguide.md). 
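Editor's note: as a concrete companion to the pointer above, a read through the data source API could look like the sketch below. This is an editorial example rather than text from the patch; the `tidb` format name and the `tidb.*` / PD address option keys follow the user guide and should be treated as assumptions here — adjust addresses and credentials for your cluster.

```
// Editorial example: read a TiDB table via the data source API from
// spark-shell (the `spark` session is provided by the shell).
val tidbOptions = Map(
  "tidb.addr" -> "127.0.0.1",
  "tidb.port" -> "4000",
  "tidb.user" -> "root",
  "tidb.password" -> "",
  "spark.tispark.pd.addresses" -> "127.0.0.1:2379"
)

val customers = spark.read
  .format("tidb")
  .options(tidbOptions)
  .option("database", "tpch_test")
  .option("table", "CUSTOMER")
  .load()

customers.show(5)
```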
## Configuration @@ -157,11 +157,11 @@ Below configurations can be put together with spark-defaults.conf or passed in t | spark.tispark.table.scan_concurrency | 512 | Maximal threads for table scan (shared among tasks inside each JVM) | | spark.tispark.request.command.priority | "Low" | "Low", "Normal", "High" which impacts resource to get in TiKV. Low is recommended for not disturbing OLTP workload | | spark.tispark.coprocess.streaming | false | Whether to use streaming for response fetching (Experimental) | -| spark.tispark.plan.unsupported_pushdown_exprs | "" | A comma separated list of expressions. In case you have very old version of TiKV, you might disable some of the expression push-down if not supported | -| spark.tispark.plan.downgrade.index_threshold | 1000000000 | If index scan ranges on one region exceeds this limit in original request, downgrade this region's request to table scan rather than original planned index scan, by default the downgrade is turned off | +| spark.tispark.plan.unsupported_pushdown_exprs | "" | A comma-separated list of expressions. In case you have a very old version of TiKV, you might disable some of the expression push-down if not supported | +| spark.tispark.plan.downgrade.index_threshold | 1000000000 | If index scan ranges on one region exceed this limit in the original request, downgrade this region's request to table scan rather than original planned index scan, by default the downgrade is turned off | | spark.tispark.show_rowid | false | If to show implicit row Id if exists | | spark.tispark.db_prefix | "" | A string indicating the extra database prefix for all databases in TiDB to distinguish them from Hive databases with the same name | -| spark.tispark.request.isolation.level | "SI" | Isolation level means whether do the resolve lock for the underlying tidb clusters. When you use the "RC", you will get the newest version of record smaller than your tso and ignore the locks. And if you use "SI", you will resolve the locks and get the records according whether resolved lock is committed or aborted | +| spark.tispark.request.isolation.level | "SI" | Isolation level means whether do the resolve lock for the underlying tidb clusters. When you use the "RC", you will get the newest version of record smaller than your tso and ignore the locks. And if you use "SI", you will resolve the locks and get the records according to whether the resolved lock is committed or aborted | ## Log4j Configuration When you start `spark-shell` or `spark-sql` and run `show databases`, you might see the following warnings: @@ -184,23 +184,23 @@ If you want to know how TiSpark could benefit from TiDB's statistic information, ## Compatibility with tidb-3.0 ### View -TiDB starts to support `view` from `tidb-3.0`. +TiDB starts to support `view` from `tidb-3.0`. TiSpark currently **does not support** `view`. Users will not be able to observe or access data through views with TiSpark. ### Table Partition -`tidb-3.0` supports both `Range Partition` and `Hash Partition`. +`tidb-3.0` supports both `Range Partition` and `Hash Partition`. -TiSpark currently **supports** `Range Partition` and `Hash Partition`. Users can select data from `Range Partition` table and `Hash Partition` table through TiSpark. +TiSpark currently **supports** `Range Partition` and `Hash Partition`. Users can select data from `Range Partition` table and `Hash Partition` table through TiSpark. -In most cases TiSpark will use full table scan. 
Only in some cases TiSpark will apply partition pruning (read more [here](./docs/userguide.md). +In most cases, TiSpark will use a full table scan. Only in some cases, TiSpark will apply partition pruning (read more [here](./docs/userguide.md)). ## How to test We use [docker-compose](https://docs.docker.com/compose/) to provide tidb cluster service which allows you to run test across different platforms. It is recommended to install docker in order to test locally, or you can set up your own TiDB cluster locally as you wish. -If you prefer the docker way, you can use `docker-compose up -d` to launch tidb cluster service under tispark home directory. If you want to see tidb cluster's log you can launch via `docker-compose up`. You can use `docker-compose down` to shutdown entire tidb cluster service. All data is stored in `data` directory at the root of this project. Feel free to change it. +If you prefer the docker way, you can use `docker-compose up -d` to launch the tidb cluster service under the tispark home directory. If you want to see the tidb cluster's logs, you can launch it via `docker-compose up`. You can use `docker-compose down` to shut down the entire tidb cluster service. All data is stored in the `data` directory at the root of this project. Feel free to change it. -You can read more about test [here](./core/src/test/Readme.md). +You can read more about the test [here](./core/src/test/Readme.md). ## Follow us Twitter: [@PingCAP](https://twitter.com/PingCAP) @@ -216,4 +216,3 @@ tidb-user@googlegroups.com ## License TiSpark is under the Apache 2.0 license. See the [LICENSE](./LICENSE) file for details. - diff --git a/docs/datasource_api_userguide.md b/docs/datasource_api_userguide.md index 3515a506c6..cf95191b13 100644 --- a/docs/datasource_api_userguide.md +++ b/docs/datasource_api_userguide.md @@ -18,28 +18,28 @@ Query pushdown leverages these performance efficiencies by enabling large and co Pushdown is not possible in all situations. For example, Spark UDFs cannot be pushed down to TiKV. ## Transaction support for Write -Since TiDB is a database that supports transaction, TiDB Spark Connector also support transaction, which means: -1. all data in DataFrame will be written to TiDB successfully, if no conflicts exist -2. no data in DataFrame will be written to TiDB successfully, if conflicts exist -3. no partial changes is visible to other session until commit. +Since TiDB is a database that supports `transaction`, TiDB Spark Connector also supports `transaction`, which means: +1. all data in DataFrame will be written to TiDB successfully if no conflicts exist +2. no data in DataFrame will be written to TiDB successfully if conflicts exist +3. no partial changes are visible to other sessions until commit. ## Replace and insert semantics TiSpark only supports `Append` SaveMode. The behavior is controlled by `replace` option. The default value is false. In addition, if `replace` is true, -data to be inserted will be deduplicate before insertion. +data to be inserted will be deduplicated before insertion. If `replace` is true, then -* if primary key or unique index exists in db, data will be updated +* if the primary key or unique index exists in DB, data will be updated * if no same primary key or unique index exists, data will be inserted. If `replace` is false, then -* if primary key or unique index exists in db, data having conflicts expects an exception. +* if the primary key or unique index exists in DB, conflicting data is expected to raise an exception. * if no same primary key or unique index exists, data will be inserted.
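To make the `replace` semantics above concrete, the hedged Scala sketch below performs an `Append` write through the connector. Only the `replace` option and the `Append` SaveMode are taken from the text above; the `tidb` format name and every other option key and value are illustrative placeholders, so check the option reference in this guide for the exact names your TiSpark version accepts.

```scala
import org.apache.spark.sql.{SaveMode, SparkSession}

// Hedged sketch of a batch write with replace semantics.
// The format name and all option keys/values are placeholders; verify them against this guide.
object ReplaceWriteSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().appName("tidb-batch-write").getOrCreate()
    import spark.implicits._

    // Rows that may collide with existing primary keys or unique indexes.
    val df = Seq((1L, "Apple"), (2L, "Banana")).toDF("id", "name")

    df.write
      .format("tidb")                   // assumed data source name
      .option("tidb.addr", "127.0.0.1") // TiDB address (placeholder)
      .option("tidb.port", "4000")      // TiDB port (placeholder)
      .option("tidb.user", "root")      // user (placeholder)
      .option("tidb.password", "")      // password (placeholder)
      .option("database", "tpch_test")  // target database (placeholder)
      .option("table", "target_table")  // target table (placeholder)
      .option("replace", "true")        // update rows whose primary key or unique index already exists
      .mode(SaveMode.Append)            // only Append is supported, as stated above
      .save()

    spark.stop()
  }
}
```

With `replace` left at its default of false, the same write is expected to fail as soon as a row conflicts with an existing primary key or unique index.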
## Using the Spark Connector With Extensions Enabled The connector adheres to the standard Spark API, but with the addition of TiDB-specific options. -The connector can be used both with or without extensions enabled. Here's examples about how to use it with extensions. +The connector can be used both with or without extensions enabled. Here are examples about how to use it with extensions. See [code examples with extensions](https://github.com/pingcap/tispark-test/blob/master/tispark-examples/src/main/scala/com/pingcap/tispark/examples/TiDataSourceExampleWithExtensions.scala). diff --git a/docs/how_to_use_tidb_as_metastore_db.md b/docs/how_to_use_tidb_as_metastore_db.md index 8333aac9ed..7b29d7ec86 100644 --- a/docs/how_to_use_tidb_as_metastore_db.md +++ b/docs/how_to_use_tidb_as_metastore_db.md @@ -1,8 +1,8 @@ -# Setting TiDB as metastore db +# Setting TiDB as metastore DB -From time to time, users may need run multiple `spark-shell`s at same directory which often leads to some +From time to time, users may need to run multiple `spark-shell`s at the same directory which often leads to some exceptions. Exceptions caused by lock conflicts: you already have a spark-shell running which blocks you run another spark-shell -at same directory. The way to address this need is setting tidb up as metastore db. +at same directory. The way to address this need is setting tidb up as metastore DB. ## Setup TiDB @@ -14,12 +14,12 @@ GRANT ALL PRIVILEGES ON metastore_db.* TO 'hive'@'%'; FLUSH PRIVILEGES; ``` -Above SQLs help you create a user and grant access privileges to tables under `metastore_db`. +Above SQLs help you create a user and grant access privileges to tables under `metastore_db`. ### When you rely on spark itself to initialize metastore -This is actually very dangerous and not recommended. If you rely on spark itself to initialize metastore, -please do following: +This is very dangerous and not recommended. If you rely on spark itself to initialize metastore, +please do the following: 1. Make sure there is no existing metastore. If so, please use official spark schema tools to upgrade or migrate. 2. Fill in root account in hive-site.xml. Let spark use root account to create metastore tables. 3. Then switch back to a normal account without any create table and alter table privileges. @@ -29,9 +29,7 @@ This preventing unexpected schema corruption when code changes. ## Adding hive-site.xml configuration to Spark -Then you can find a sample conf file [hive-site.xml.template](../config/hive-site.xml.template) and +Then you can find a sample conf file [hive-site.xml.template](../config/hive-site.xml.template) and adjust some settings. You also need put the file into `SPARK_HOME/conf`. -After you finish these two steps, you are able to use tidb to store meta info of Spark. - - +After you finish these two steps, you can use tidb to store meta info of Spark. diff --git a/docs/userguide.md b/docs/userguide.md index 43492fe3f8..987778b773 100644 --- a/docs/userguide.md +++ b/docs/userguide.md @@ -5,7 +5,7 @@ TiSpark is a thin layer built for running Apache Spark on top of TiDB/TiKV to answer the complex OLAP queries. It takes advantages of both the Spark platform and the distributed TiKV cluster, at the same time, seamlessly glues to TiDB, the distributed OLTP database, to provide a Hybrid Transactional/Analytical Processing (HTAP) solution to serve as a one-stop solution for online transactions and analysis. -TiSpark depends on the TiKV cluster and the PD cluster. 
It also needs to set up a Spark cluster. This document provides a brief introduction to how to setup and use TiSpark. It requires some basic knowledge of Apache Spark. For more information, please refer to [Spark website](https://spark.apache.org/docs/latest/index.html). +TiSpark depends on the TiKV cluster and the PD cluster. It also needs to set up a Spark cluster. This document provides a brief introduction to how to set up and use TiSpark. It requires some basic knowledge of Apache Spark. For more information, please refer to [Spark website](https://spark.apache.org/docs/latest/index.html). ## Overview @@ -17,10 +17,10 @@ TiSpark is an OLAP solution that runs Spark SQL directly on TiKV, the distribute TiSpark Architecture -+ TiSpark integrates with Spark Catalyst Engine deeply. It provides precise control of the computing, which allows Spark read data from TiKV efficiently. It also supports index seek, which improves the performance of the point query execution significantly. ++ TiSpark integrates with Spark Catalyst Engine deeply. It provides precise control of computing, which allows Spark to read data from TiKV efficiently. It also supports index seek, which improves the performance of the point query execution significantly. + It utilizes several strategies to push down the computing to reduce the size of dataset handling by Spark SQL, which accelerates the query execution. It also uses the TiDB built-in statistical information for the query plan optimization. + From the data integration point of view, TiSpark + TiDB provides a solution runs both transaction and analysis directly on the same platform without building and maintaining any ETLs. It simplifies the system architecture and reduces the cost of maintenance. -+ In addition, you can deploy and utilize tools from the Spark ecosystem for further data processing and manipulation on TiDB. For example, using TiSpark for data analysis and ETL; retrieving data from TiKV as a machine learning data source; generating reports from the scheduling system and so on. ++ Also, you can deploy and utilize tools from the Spark ecosystem for further data processing and manipulation on TiDB. For example, using TiSpark for data analysis and ETL; retrieving data from TiKV as a machine learning data source; generating reports from the scheduling system and so on. ## Environment Setup @@ -36,7 +36,7 @@ TiSpark Architecture #### Configuration of the TiKV cluster For independent deployment of TiKV and TiSpark, it is recommended to refer to the following recommendations - + + Hardware configuration - For general purposes, please refer to the TiDB and TiKV hardware configuration [recommendations](https://github.com/pingcap/docs/blob/master/op-guide/recommendation.md#deployment-recommendations). - If the usage is more focused on the analysis scenarios, you can increase the memory of the TiKV nodes to at least 64G. If using Hard Disk Drive (HDD), it is recommended to use at least 8 disks. @@ -70,9 +70,9 @@ Block-cache-size = "1GB" Scheduler-worker-pool-size = 4 ``` -#### Configuration of the independent deployment of the Spark cluster and the TiSpark cluster +#### Configuration of the independent deployment of the Spark cluster and the TiSpark cluster + - Please refer to the [Spark official website](https://spark.apache.org/docs/latest/hardware-provisioning.html) for the detail hardware recommendations. The following is a short overview of the TiSpark configuration. 
@@ -101,7 +101,7 @@ For example, `10.16.20.1:2379,10.16.20.2:2379,10.16.20.3:2379` when you have mul #### Hybrid deployment configuration for the TiSpark and TiKV cluster For the hybrid deployment of TiSpark and TiKV, add the TiSpark required resources to the TiKV reserved resources, and allocate 25% of the memory for the system. - + ## Deploy TiSpark @@ -115,7 +115,7 @@ Running TiSpark on an existing Spark cluster does not require a reboot of the cl spark-shell --jars $your_path_to/tispark-${name_with_version}.jar ``` -If you want to deploy TiSpark as a default component, simply place the TiSpark jar package into the jars path for each node of the Spark cluster and restart the Spark cluster: +If you want to deploy TiSpark as a default component, simply place the TiSpark jar package into the path of the jar for each node of the Spark cluster and restart the Spark cluster: ``` cp $your_path_to/tispark-${name_with_version}.jar $SPARK_HOME/jars @@ -140,12 +140,12 @@ If you need to use the Hadoop cluster, please choose the corresponding Hadoop ve **Please confirm the Spark version your TiSpark version supports.** -Suppose you already have a Spark binaries, and the current PATH is `SPARKPATH`, please copy the TiSpark jar package to the `$SPARKPATH/jars` directory. +Suppose you already have a Spark binary, and the current PATH is `SPARKPATH`, please copy the TiSpark jar package to the `$SPARKPATH/jars` directory. #### Starting a Master node Execute the following command on the selected Spark Master node: - + ``` cd $SPARKPATH @@ -163,7 +163,7 @@ Similarly, you can start a Spark-Slave node with the following command: ./sbin/start-slave.sh spark://spark-master-hostname:7077 ``` -After the command returns, you can see if the Slave node is joined to the Spark cluster correctly from the panel as well. Repeat the above command at all Slave nodes. After all Slaves are connected to the master, you have a Standalone mode Spark cluster. +After the command returns, you can see if the Slave node is joined to the Spark cluster correctly from the panel as well. Repeat the above command at all Slave nodes. After all the slaves are connected to the master, you have a Standalone mode Spark cluster. #### Spark SQL shell and JDBC Server @@ -174,7 +174,7 @@ Now that TiSpark supports Spark 2.3/2.4, you can use Spark's ThriftServer and Sp Assuming you have successfully started the TiSpark cluster as described above, here's a quick introduction to how to use Spark SQL for OLAP analysis. Here we use a table named `lineitem` in the `tpch` database as an example. -Add +Add ``` spark.tispark.pd.addresses 192.168.1.100:2379 spark.sql.extensions org.apache.spark.sql.TiExtensions @@ -200,7 +200,7 @@ The result is: | 600000000 | +-------------+ ``` - + TiSpark's SQL Interactive shell is almost the same as spark-sql shell. ``` @@ -240,11 +240,11 @@ TiSparkR is a thin layer built for supporting R language with TiSpark Refer to [this document](../R/README.md) for usage. ## TiSpark on PySpark -TiSpark on PySpark is a Python package build to support the Python language with TiSpark. +TiSpark on PySpark is a Python package build to support the Python language with TiSpark. Refer to [this document](../python/README.md) for usage. ## Use TiSpark together with Hive -TiSpark should be ok to use together with Hive. +TiSpark should be ok to use together with Hive. You need to set environment variable HADOOP_CONF_DIR to your Hadoop's configuration folder and copy hive-site.xml to spark/conf folder before starting Spark. 
``` val tisparkDF = spark.sql("select * from tispark_table").toDF @@ -274,39 +274,39 @@ df.write .option("isolationLevel", "NONE") // recommended to set isolationLevel to NONE if you have a large DF to load. .option("user", "root") // TiDB user here .save() -``` +``` It is recommended to set `isolationLevel` to `NONE` to avoid large single transactions which may potentially lead to TiDB OOM. ## Statistics information -TiSpark could use TiDB's statistic information for +TiSpark could use TiDB's statistic information for 1. Determining which index to ues in your query plan with the estimated lowest cost. 2. Small table broadcasting, which enables efficient broadcast join. If you would like TiSpark to use statistic information, first you need to make sure that concerning tables have already been analyzed. Read more about how to analyze tables [here](https://github.com/pingcap/docs/blob/master/sql/statistics.md). -Since TiSpark 2.0, statistics information will be default to auto load. +Since TiSpark 2.0, statistics information will be default to autoload. Note that table statistics will be cached in your spark driver node's memory, so you need to make sure that your memory should be enough for your statistics information. -Currently you could adjust these configs in your spark.conf file. - +Currently, you could adjust these configs in your spark.conf file. + | Property Name | Default | Description | -------- | -----: | :----: | | spark.tispark.statistics.auto_load | true | Whether to load statistics info automatically during database mapping. | ## Reading partition table from TiDB -TiSpark can read range and hash partition table from TiDB. +TiSpark can read range and hash partition table from TiDB. TiSpark decides whether to apply partition pruning according to the type of partition and the partition expression associated with the table. -Currently, TiSpark can partially apply partition pruning on range partition. +Currently, TiSpark can partially apply partition pruning on range partition. The partition pruning can be applied when the partition expression of the range partition is one of the following: * column expression -* year(expr) where expr is a column and its type is datetime or string literal +* year(expr) where expr is a column and its type is datetime or string literal but can be parsed as datetime. -If partition pruning cannot be applied, it is equivalent to doing a table scan over all partitions. +If partition pruning cannot be applied, it is equivalent to doing a table scan over all partitions. ## Common Port numbers used by Spark Cluster @@ -326,7 +326,7 @@ If partition pruning cannot be applied, it is equivalent to doing a table scan o Q: What are the pros/cons of independent deployment as opposed to a shared resource with an existing Spark / Hadoop cluster? -A: You can use the existing Spark cluster without a separate deployment, but if the existing cluster is busy, TiSpark will not be able to achieve the desired speed. +A: You can use the existing Spark cluster without a separate deployment, but if the existing cluster is busy, TiSpark will not be able to achieve the desired speed. Q: Can I mix Spark with TiKV? diff --git a/docs/userguide_spark2.1.md b/docs/userguide_spark2.1.md index 9be6de4b72..b730c1644e 100644 --- a/docs/userguide_spark2.1.md +++ b/docs/userguide_spark2.1.md @@ -5,7 +5,7 @@ TiSpark is a thin layer built for running Apache Spark on top of TiDB/TiKV to answer the complex OLAP queries. 
It takes advantages of both the Spark platform and the distributed TiKV cluster, at the same time, seamlessly glues to TiDB, the distributed OLTP database, to provide a Hybrid Transactional/Analytical Processing (HTAP) solution to serve as a one-stop solution for online transactions and analysis. -TiSpark depends on the TiKV cluster and the PD cluster. It also needs to set up a Spark cluster. This document provides a brief introduction to how to setup and use TiSpark. It requires some basic knowledge of Apache Spark. For more information, please refer to [Spark website](https://spark.apache.org/docs/latest/index.html). +TiSpark depends on the TiKV cluster and the PD cluster. It also needs to set up a Spark cluster. This document provides a brief introduction to how to set up and use TiSpark. It requires some basic knowledge of Apache Spark. For more information, please refer to [Spark website](https://spark.apache.org/docs/latest/index.html). ## Overview @@ -17,10 +17,10 @@ TiSpark is an OLAP solution that runs Spark SQL directly on TiKV, the distribute TiSpark Architecture -+ TiSpark integrates with Spark Catalyst Engine deeply. It provides precise control of the computing, which allows Spark read data from TiKV efficiently. It also supports index seek, which improves the performance of the point query execution significantly. ++ TiSpark integrates with Spark Catalyst Engine deeply. It provides precise control of computing, which allows Spark to read data from TiKV efficiently. It also supports index seek, which improves the performance of the point query execution significantly. + It utilizes several strategies to push down the computing to reduce the size of dataset handling by Spark SQL, which accelerates the query execution. It also uses the TiDB built-in statistical information for the query plan optimization. + From the data integration point of view, TiSpark + TiDB provides a solution runs both transaction and analysis directly on the same platform without building and maintaining any ETLs. It simplifies the system architecture and reduces the cost of maintenance. -+ In addition, you can deploy and utilize tools from the Spark ecosystem for further data processing and manipulation on TiDB. For example, using TiSpark for data analysis and ETL; retrieving data from TiKV as a machine learning data source; generating reports from the scheduling system and so on. ++ Also, you can deploy and utilize tools from the Spark ecosystem for further data processing and manipulation on TiDB. For example, using TiSpark for data analysis and ETL; retrieving data from TiKV as a machine learning data source; generating reports from the scheduling system and so on. ## Environment Setup @@ -36,7 +36,7 @@ TiSpark Architecture #### Configuration of the TiKV cluster For independent deployment of TiKV and TiSpark, it is recommended to refer to the following recommendations - + + Hardware configuration - For general purposes, please refer to the TiDB and TiKV hardware configuration [recommendations](https://github.com/pingcap/docs/blob/master/op-guide/recommendation.md#deployment-recommendations). - If the usage is more focused on the analysis scenarios, you can increase the memory of the TiKV nodes to at least 64G. If using Hard Disk Drive (HDD), it is recommended to use at least 8 disks. 
@@ -70,9 +70,9 @@ Block-cache-size = "1GB" Scheduler-worker-pool-size = 4 ``` -#### Configuration of the independent deployment of the Spark cluster and the TiSpark cluster +#### Configuration of the independent deployment of the Spark cluster and the TiSpark cluster + - Please refer to the [Spark official website](https://spark.apache.org/docs/latest/hardware-provisioning.html) for the detail hardware recommendations. The following is a short overview of the TiSpark configuration. @@ -81,7 +81,7 @@ Generally, it is recommended to allocate 32G memory for Spark. Please reserve at It is recommended to provision at least 8 to 16 cores on per machine for Spark. Initially, you can assign all the CPU cores to Spark. -Please refer to the Spark official configuration website at (https://spark.apache.org/docs/latest/spark-standalone.html). The following is an example based on the spark-env.sh configuration: +Please refer to the Spark official configuration website at (https://spark.apache.org/docs/latest/spark-standalone.html). The following is an example based on spark-env.sh configuration: ``` SPARK_EXECUTOR_MEMORY = 32g @@ -92,7 +92,7 @@ SPARK_WORKER_CORES = 8 #### Hybrid deployment configuration for the TiSpark and TiKV cluster For the hybrid deployment of TiSpark and TiKV, add the TiSpark required resources to the TiKV reserved resources, and allocate 25% of the memory for the system. - + ## Deploy TiSpark @@ -106,7 +106,7 @@ Running TiSpark on an existing Spark cluster does not require a reboot of the cl Spark-shell --jars $ PATH / tispark-${name_with_version}.jar ``` -If you want to deploy TiSpark as a default component, simply place the TiSpark jar package into the jars path for each node of the Spark cluster and restart the Spark cluster: +If you want to deploy TiSpark as a default component, simply place the TiSpark jar package into the path of the jar for each node of the Spark cluster and restart the Spark cluster: ``` $ {SPARK_INSTALL_PATH} / jars @@ -127,13 +127,13 @@ If you do not have a Spark cluster, we recommend using the Spark Standalone mode You can download [Apache Spark](https://spark.apache.org/downloads.html) For the Standalone mode without Hadoop support, use Spark 2.1.x and any version of Pre-build with Apache Hadoop 2.x with Hadoop dependencies. If you need to use the Hadoop cluster, please choose the corresponding Hadoop version. You can also choose to build from the [source code](https://spark.apache.org/docs/2.1.0/building-spark.html) to match the previous version of the official Hadoop 2.6. Please note that TiSpark currently only supports Spark 2.1.x version. - -Suppose you already have a Spark binaries, and the current PATH is `SPARKPATH`, please copy the TiSpark jar package to the `$ {SPARKPATH} / jars` directory. + +Suppose you already have a Spark binary, and the current PATH is `SPARKPATH`, please copy the TiSpark jar package to the `$ {SPARKPATH} / jars` directory. #### Starting a Master node Execute the following command on the selected Spark Master node: - + ``` cd $ SPARKPATH @@ -151,18 +151,18 @@ Similarly, you can start a Spark-Slave node with the following command: ./sbin/start-slave.sh spark: // spark-master-hostname: 7077 ``` -After the command returns, you can see if the Slave node is joined to the Spark cluster correctly from the panel as well. Repeat the above command at all Slave nodes. After all Slaves are connected to the master, you have a Standalone mode Spark cluster. 
+After the command returns, you can see if the Slave node is joined to the Spark cluster correctly from the panel as well. Repeat the above command at all Slave nodes. After all the slaves are connected to the master, you have a Standalone mode Spark cluster. #### Spark SQL shell and JDBC Server -If you want to use JDBC server and interactive SQL shell, please copy `start-tithriftserver.sh stop-tithriftserver.sh` to your Spark's sbin folder and `tispark-sql` to bin folder. +If you want to use JDBC server and interactive SQL shell, please copy `start-tithriftserver.sh stop-tithriftserver.sh` to your Spark's sbin folder and `tispark-sql` to bin folder. To start interactive shell: ``` ./bin/tispark-sql ``` -To use Thrift Server, you can start it similar way as default Spark Thrift Server: +To use Thrift Server, you can start it a similar way as default Spark Thrift Server: ``` ./sbin/start-tithriftserver.sh ``` @@ -185,7 +185,7 @@ And stop it like below: Assuming you have successfully started the TiSpark cluster as described above, here's a quick introduction to how to use Spark SQL for OLAP analysis. Here we use a table named `lineitem` in the `tpch` database as an example. -Add +Add ``` spark.tispark.pd.addresses 192.168.1.100:2379 ``` @@ -213,7 +213,7 @@ The result is: | 600000000 | +-------------+ ``` - + TiSpark's SQL Interactive shell is almost the same as spark-sql shell. ``` @@ -253,11 +253,11 @@ TiSparkR is a thin layer built for supporting R language with TiSpark Refer to [this document](../R/README.md) for usage. ## TiSpark on PySpark -TiSpark on PySpark is a Python package build to support the Python language with TiSpark. +TiSpark on PySpark is a Python package build to support the Python language with TiSpark. Refer to [this document](../python/README.md) for usage. ## Use TiSpark together with Hive -TiSpark should be ok to use together with Hive. +TiSpark should be ok to use together with Hive. You need to set environment variable HADOOP_CONF_DIR to your Hadoop's configuration folder and copy hive-site.xml to spark/conf folder before starting Spark. ``` val tisparkDF = spark.sql("select * from tispark_table").toDF @@ -287,13 +287,13 @@ df.write .option("isolationLevel", "NONE") // recommended to set isolationLevel to NONE if you have a large DF to load. .option("user", "root") // TiDB user here .save() -``` +``` It is recommended to set `isolationLevel` to `NONE` to avoid large single transactions which may potentially lead to TiDB OOM. ## Statistics information -TiSpark could use TiDB's statistic information for +TiSpark could use TiDB's statistic information for -1. Determining which index to ues in your query plan with the estimated lowest cost. +1. Determining which index to use in your query plan with the estimated lowest cost. 2. Small table broadcasting, which enables efficient broadcast join. If you would like TiSpark to use statistic information, first you need to make sure that concerning tables have already been analyzed. Read more about how to analyze tables [here](https://github.com/pingcap/docs/blob/master/sql/statistics.md). @@ -306,25 +306,25 @@ val ti = new TiContext(spark) // You can specify whether to load statistics information automatically during database mapping in your config file described as below. // Statistics information will be loaded automatically by default, which is recommended in most cases. ti.tidbMapDatabase("db_name") - -// Another option is manually load by yourself, yet this is not the recommended way. 
- + +// Another option is to load it manually yourself, yet this is not the recommended way. + // Firstly, get the table that you want to load statistics information from. val table = ti.meta.getTable("db_name", "tb_name").get - + // If you want to load statistics information for all the columns, use ti.statisticsManager.loadStatisticsInfo(table) - + // If you just want to use some of the columns' statistics information, use ti.statisticsManager.loadStatisticsInfo(table, "col1", "col2", "col3") // You could specify required columns by vararg - + // Collect other tables' statistic information... - -// Then you could query as usual, TiSpark will use statistic information collect to optimized index selection and broadcast join. + +// Then you could query as usual; TiSpark will use the collected statistic information to optimize index selection and broadcast join. ``` Note that table statistics will be cached in your spark driver node's memory, so you need to make sure that your memory should be enough for your statistics information. -Currently you could adjust these configs in your spark.conf file. - + +Currently, you could adjust these configs in your spark.conf file. + | Property Name | Default | Description | -------- | -----: | :----: | | spark.tispark.statistics.auto_load | true | Whether to load statistics info automatically during database mapping. | @@ -341,12 +341,12 @@ Currently you could adjust these configs in your spark.conf file. |Block manager port |(random) | spark.blockManager.port | | |Shuffle server |7337 | spark.shuffle.service.port | Optional; only applies if you use the external shuffle service. | |Application web UI |4040| spark.ui.port | if 4040 is used, then 4041 will be used - + ## FAQ Q: What are the pros/cons of independent deployment as opposed to a shared resource with an existing Spark / Hadoop cluster? -A: You can use the existing Spark cluster without a separate deployment, but if the existing cluster is busy, TiSpark will not be able to achieve the desired speed. +A: You can use the existing Spark cluster without a separate deployment, but if the existing cluster is busy, TiSpark will not be able to achieve the desired speed. Q: Can I mix Spark with TiKV? From 73f765e124624b6fd244d1fe9dacbc57907e54bd Mon Sep 17 00:00:00 2001 From: Liangliang Gu Date: Fri, 16 Aug 2019 09:49:57 +0800 Subject: [PATCH 40/62] add benchmark result for batch write (#1025) --- docs/datasource_api_userguide.md | 45 ++++++++++++++++++++++++++++++++ 1 file changed, 45 insertions(+) diff --git a/docs/datasource_api_userguide.md b/docs/datasource_api_userguide.md index cf95191b13..c9e4774c13 100644 --- a/docs/datasource_api_userguide.md +++ b/docs/datasource_api_userguide.md @@ -341,3 +341,48 @@ The full conversion metrics is as follows.
| LONGBLOB | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: | :x: | :x: | :white_check_mark: | :x: | :x: | :x: | | ENUM | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: | :x: | :x: | :x: | | SET | :x: | :x: | :x: | :x: | :x: | :x: | :x: | :x: | :x: | :x: | :x: | + +## Write Benchmark +Tested on 4 machines as follows: + +``` +Intel(R) Xeon(R) CPU E5-2630 v4 @ 2.20GHz * 2 = 40Vu +12 * 16G = 188G +``` + +`FIO` test result: + +``` +WRITE: bw=705MiB/s (740MB/s), 705MiB/s-705MiB/s (740MB/s-740MB/s), io=20.0GiB (21.5GB), run=29034-29034msec +``` + +The table schema is: +``` +CREATE TABLE ORDERS (O_ORDERKEY INTEGER NOT NULL, + O_CUSTKEY INTEGER NOT NULL, + O_ORDERSTATUS CHAR(1) NOT NULL, + O_TOTALPRICE DECIMAL(15,2) NOT NULL, + O_ORDERDATE DATE NOT NULL, + O_ORDERPRIORITY CHAR(15) NOT NULL, + O_CLERK CHAR(15) NOT NULL, + O_SHIPPRIORITY INTEGER NOT NULL, + O_COMMENT VARCHAR(79) NOT NULL); + +``` + +### TiSpark Write Benchmark + +| count(*) | data size | parallel number | prepare(s) | prewrite (s) | commit (s) | total (s) | +| ----------- | --------- | --------------- | ---------- | ------------ | ---------- | --------- | +| 1,500,000 | 165M | 2 | 17 | 68 | 62 | 148 | +| 15,000,000 | 1.7G | 24 | 49 | 157 | 119 | 326 | +| 150,000,000 | 17G | 120 | 630 | 1236 | 1098 | 2964 | + + +## Spark with JDBC Benchmark + +| count(*) | data size | parallel number | spark jdbc write (s) | comments | +| ----------- | --------- | --------------- | -------------------- | ----------------------------------- | +| 1,500,000 | 165M | 24 | 22 | | +| 15,000,000 | 1.7G | 24 | 411 | use 120 parallel will cause KV Busy | +| 150,000,000 | 17G | 24 | 2936 | use 120 parallel will cause KV Busy | From 6f00a5d2f6f5cf7ba87cffb713b041944da46e62 Mon Sep 17 00:00:00 2001 From: Liangliang Gu Date: Fri, 16 Aug 2019 11:20:26 +0800 Subject: [PATCH 41/62] release tispark 2.1.3 (#1026) (#1035) (cherry picked from commit 107eb2b270451e5947f80d63b09833890396b7ff) --- CHANGELOG.md | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 67a4910775..c71dbb11c7 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,6 +1,14 @@ # TiSpark Changelog All notable changes to this project will be documented in this file. 
+## [TiSpark 2.1.3] 2019-08-15 +### Fixes +- Fix cost model in table scan [#1023](https://github.com/pingcap/tispark/pull/1023) +- Fix index scan bug [#1024](https://github.com/pingcap/tispark/pull/1024) +- Prohibit aggregate or group by pushdown on double read [#1027](https://github.com/pingcap/tispark/pull/1027) +- Fix reflection bug for HDP release [#1017](https://github.com/pingcap/tispark/pull/1017) +- Fix scala compiler version [#1019](https://github.com/pingcap/tispark/pull/1019) + ## [TiSpark 2.2.0] ### New Features * Natively support writing data to TiKV using Spark Data Source API From 0f7aec6d185503bfce8407d50c693305966fe44b Mon Sep 17 00:00:00 2001 From: Liangliang Gu Date: Fri, 16 Aug 2019 14:03:41 +0800 Subject: [PATCH 42/62] support setting random seed in daily regression test (#1032) --- .ci/log4j-ci.properties | 1 + .ci/tidb_config-for-daily-test.properties | 2 ++ .../src/test/resources/tidb_config.properties.template | 2 ++ .../scala/org/apache/spark/sql/TiSparkTestSpec.scala | 6 ++++-- .../org/apache/spark/sql/test/SharedSQLContext.scala | 10 +++++++++- .../org/apache/spark/sql/test/TestConstants.scala | 1 + 6 files changed, 19 insertions(+), 3 deletions(-) create mode 100644 .ci/tidb_config-for-daily-test.properties diff --git a/.ci/log4j-ci.properties b/.ci/log4j-ci.properties index cbf0f4a305..ba84ec74d5 100644 --- a/.ci/log4j-ci.properties +++ b/.ci/log4j-ci.properties @@ -25,3 +25,4 @@ log4j.logger.org.apache.hadoop.hive.ql.exec.FunctionRegistry=ERROR # tispark log4j.logger.com.pingcap=ERROR log4j.logger.com.pingcap.tispark.utils.ReflectionUtil=DEBUG +log4j.logger.org.apache.spark.sql.test.SharedSQLContext=DEBUG diff --git a/.ci/tidb_config-for-daily-test.properties b/.ci/tidb_config-for-daily-test.properties new file mode 100644 index 0000000000..45056d6e80 --- /dev/null +++ b/.ci/tidb_config-for-daily-test.properties @@ -0,0 +1,2 @@ +# The seed used to generate test data (0 means random). +test.data.generate.seed=0 diff --git a/core/src/test/resources/tidb_config.properties.template b/core/src/test/resources/tidb_config.properties.template index 2babbfcd7f..34281c4406 100644 --- a/core/src/test/resources/tidb_config.properties.template +++ b/core/src/test/resources/tidb_config.properties.template @@ -21,5 +21,7 @@ # test.data.load=auto # Whether to generate test data. Enabling test data generation may change data of all tests. # test.data.generate=true +# The seed used to generate test data (0 means random). 
+# test.data.generate.seed=1234 # DB prefix for tidb databases in case it conflicts with hive database # spark.tispark.db_prefix=tidb_ diff --git a/core/src/test/scala/org/apache/spark/sql/TiSparkTestSpec.scala b/core/src/test/scala/org/apache/spark/sql/TiSparkTestSpec.scala index cf41dff643..1115c47c4f 100644 --- a/core/src/test/scala/org/apache/spark/sql/TiSparkTestSpec.scala +++ b/core/src/test/scala/org/apache/spark/sql/TiSparkTestSpec.scala @@ -17,13 +17,15 @@ package org.apache.spark.sql +import org.apache.spark.sql.test.SharedSQLContext + import scala.util.Random -trait TiSparkTestSpec { +trait TiSparkTestSpec extends SharedSQLContext { val database: String val testDesc: String // Randomizer for tests - val r: Random = new Random(1234) + val r: Random = new Random(generateDataSeed) def test(): Unit } diff --git a/core/src/test/scala/org/apache/spark/sql/test/SharedSQLContext.scala b/core/src/test/scala/org/apache/spark/sql/test/SharedSQLContext.scala index 8e7deb61f8..65260aa6a6 100644 --- a/core/src/test/scala/org/apache/spark/sql/test/SharedSQLContext.scala +++ b/core/src/test/scala/org/apache/spark/sql/test/SharedSQLContext.scala @@ -103,6 +103,8 @@ trait SharedSQLContext extends SparkFunSuite with Eventually with BeforeAndAfter protected def generateData: Boolean = SharedSQLContext.generateData + protected def generateDataSeed: Long = SharedSQLContext.generateDataSeed + /** * The [[TestSparkSession]] to use for all tests in this suite. */ @@ -163,6 +165,7 @@ object SharedSQLContext extends Logging { protected var tidbPort: Int = _ protected var pdAddresses: String = _ protected var generateData: Boolean = _ + protected var generateDataSeed: Long = _ protected implicit def spark: SparkSession = _spark @@ -391,8 +394,13 @@ object SharedSQLContext extends Logging { generateData = getOrElse(_tidbConf, SHOULD_GENERATE_DATA, "true").toLowerCase.toBoolean + generateDataSeed = getOrElse(_tidbConf, GENERATE_DATA_SEED, "1234").toLong + if (generateDataSeed == 0) { + generateDataSeed = System.currentTimeMillis() + } + if (generateData) { - logger.info("generate data is enabled") + logger.info(s"generate data is enabled and seed is $generateDataSeed") } if (isTidbConfigPropertiesInjectedToSparkEnabled) { diff --git a/core/src/test/scala/org/apache/spark/sql/test/TestConstants.scala b/core/src/test/scala/org/apache/spark/sql/test/TestConstants.scala index d5550593b4..8f2d10bbae 100644 --- a/core/src/test/scala/org/apache/spark/sql/test/TestConstants.scala +++ b/core/src/test/scala/org/apache/spark/sql/test/TestConstants.scala @@ -26,5 +26,6 @@ object TestConstants { val TPCDS_DB_NAME = "tpcds.db" val SHOULD_LOAD_DATA = "test.data.load" val SHOULD_GENERATE_DATA = "test.data.generate" + val GENERATE_DATA_SEED = "test.data.generate.seed" val SHOULD_SKIP_TEST = "test.skip" } From e7998639876d53517d86363fff87e9a561c31376 Mon Sep 17 00:00:00 2001 From: Zhexuan Yang Date: Fri, 16 Aug 2019 15:28:20 +0800 Subject: [PATCH 43/62] Remove create in tisession (#1021) --- .../org/apache/spark/sql/TiContext.scala | 2 +- .../tispark/convert/ToUnsignedSuite.scala | 4 +- .../datasource/BaseDataSourceTest.scala | 7 +- .../datasource/RowIDAllocatorSuite.scala | 7 + .../sql/catalyst/plans/BasePlanTest.scala | 20 -- .../statistics/StatisticsTestSuite.scala | 22 ++ pom.xml | 3 +- tikv-client/README.md | 2 +- .../main/java/com/pingcap/tikv/TiSession.java | 24 +- .../tikv/allocator/RowIDAllocator.java | 18 +- .../com/pingcap/tikv/PDMockServerTest.java | 2 +- .../pingcap/tikv/txn/LockResolverRCTest.java | 98 +++++++ 
.../pingcap/tikv/txn/LockResolverSITest.java | 195 ++++++++++++++ .../pingcap/tikv/txn/LockResolverTest.java | 249 ++---------------- 14 files changed, 376 insertions(+), 277 deletions(-) create mode 100644 tikv-client/src/test/java/com/pingcap/tikv/txn/LockResolverRCTest.java create mode 100644 tikv-client/src/test/java/com/pingcap/tikv/txn/LockResolverSITest.java diff --git a/core/src/main/scala/org/apache/spark/sql/TiContext.scala b/core/src/main/scala/org/apache/spark/sql/TiContext.scala index 539531d56f..4be81bb05e 100644 --- a/core/src/main/scala/org/apache/spark/sql/TiContext.scala +++ b/core/src/main/scala/org/apache/spark/sql/TiContext.scala @@ -43,7 +43,7 @@ class TiContext(val sparkSession: SparkSession, options: Option[TiDBOptions] = N lazy val sqlContext: SQLContext = sparkSession.sqlContext val conf: SparkConf = mergeWithDataSourceConfig(sparkSession.sparkContext.conf, options) val tiConf: TiConfiguration = TiUtil.sparkConfToTiConf(conf) - val tiSession: TiSession = TiSession.create(tiConf) + val tiSession: TiSession = TiSession.getInstance(tiConf) val meta: MetaManager = new MetaManager(tiSession.getCatalog) StatisticsManager.initStatisticsManager(tiSession) diff --git a/core/src/test/scala/com/pingcap/tispark/convert/ToUnsignedSuite.scala b/core/src/test/scala/com/pingcap/tispark/convert/ToUnsignedSuite.scala index c67a411236..6e6b6bcb3a 100644 --- a/core/src/test/scala/com/pingcap/tispark/convert/ToUnsignedSuite.scala +++ b/core/src/test/scala/com/pingcap/tispark/convert/ToUnsignedSuite.scala @@ -1,5 +1,8 @@ package com.pingcap.tispark.convert +import com.pingcap.tikv.TiSession +import com.pingcap.tikv.allocator.RowIDAllocator +import com.pingcap.tispark.TiDBUtils import com.pingcap.tispark.datasource.BaseDataSourceTest import org.apache.spark.sql.Row import org.apache.spark.sql.types._ @@ -13,7 +16,6 @@ import org.apache.spark.sql.types._ * 5. BIGINT UNSIGNED */ class ToUnsignedSuite extends BaseDataSourceTest("test_data_type_convert_to_unsigned") { - private val readSchema = StructType( List( StructField("i", IntegerType), diff --git a/core/src/test/scala/com/pingcap/tispark/datasource/BaseDataSourceTest.scala b/core/src/test/scala/com/pingcap/tispark/datasource/BaseDataSourceTest.scala index e8cddca847..ffa3b86471 100644 --- a/core/src/test/scala/com/pingcap/tispark/datasource/BaseDataSourceTest.scala +++ b/core/src/test/scala/com/pingcap/tispark/datasource/BaseDataSourceTest.scala @@ -2,6 +2,7 @@ package com.pingcap.tispark.datasource import java.util.Objects +import com.pingcap.tikv.TiSession import com.pingcap.tispark.TiConfigConst import org.apache.spark.SparkException import org.apache.spark.rdd.RDD @@ -32,7 +33,11 @@ class BaseDataSourceTest(val table: String, protected def jdbcUpdate(query: String): Unit = tidbStmt.execute(query) - protected def dropTable(): Unit = jdbcUpdate(s"drop table if exists $dbtable") + protected def dropTable(): Unit = { + jdbcUpdate(s"drop table if exists $dbtable") + // If we reuse tiSession, cache in catalog will be outdated after dropping and creating table. 
+ TiSession.clearCache() + } protected def tidbWrite(rows: List[Row], schema: StructType, diff --git a/core/src/test/scala/com/pingcap/tispark/datasource/RowIDAllocatorSuite.scala b/core/src/test/scala/com/pingcap/tispark/datasource/RowIDAllocatorSuite.scala index defd518d0f..b8ed0664e7 100644 --- a/core/src/test/scala/com/pingcap/tispark/datasource/RowIDAllocatorSuite.scala +++ b/core/src/test/scala/com/pingcap/tispark/datasource/RowIDAllocatorSuite.scala @@ -34,13 +34,20 @@ class RowIDAllocatorSuite extends BaseTiSparkTest { val tiDBInfo = ti.tiSession.getCatalog.getDatabase(dbName) val tiTableInfo = ti.tiSession.getCatalog.getTable(dbName, tableName) + // first var allocator = RowIDAllocator.create(tiDBInfo.getId, tiTableInfo.getId, ti.tiSession.getConf, false, 1000) assert(allocator.getEnd - allocator.getStart == 1000) + // second allocator = RowIDAllocator .create(tiDBInfo.getId, tiTableInfo.getId, ti.tiSession.getConf, false, 10000) assert(allocator.getEnd - allocator.getStart == 10000) + + // third + allocator = + RowIDAllocator.create(tiDBInfo.getId, tiTableInfo.getId, ti.tiSession.getConf, false, 1000) + assert(allocator.getEnd - allocator.getStart == 1000) } override def afterAll(): Unit = diff --git a/core/src/test/scala/org/apache/spark/sql/catalyst/plans/BasePlanTest.scala b/core/src/test/scala/org/apache/spark/sql/catalyst/plans/BasePlanTest.scala index e81bfa82cd..4978450396 100644 --- a/core/src/test/scala/org/apache/spark/sql/catalyst/plans/BasePlanTest.scala +++ b/core/src/test/scala/org/apache/spark/sql/catalyst/plans/BasePlanTest.scala @@ -159,24 +159,4 @@ class BasePlanTest extends BaseTiSparkTest { assert(estimatedRowCount === answer) } - test("baseline test") { - val tableName = "full_data_type_table_idx" - val df = spark.sql( - s"select tp_bigint from $tableName where tp_tinyint = 0 and tp_int < 0" - ) - - checkIndex(df, "idx_tp_tinyint_tp_int") - - assertThrows[TestFailedException] { - checkIsTableScan(df, tableName) - } - - checkIsIndexScan(df, tableName) - - assertThrows[TestFailedException] { - checkIsCoveringIndexScan(df, tableName) - } - - checkEstimatedRowCount(df, tableName, 2) - } } diff --git a/core/src/test/scala/org/apache/spark/sql/catalyst/plans/statistics/StatisticsTestSuite.scala b/core/src/test/scala/org/apache/spark/sql/catalyst/plans/statistics/StatisticsTestSuite.scala index 7f5513a99e..1688014e59 100644 --- a/core/src/test/scala/org/apache/spark/sql/catalyst/plans/statistics/StatisticsTestSuite.scala +++ b/core/src/test/scala/org/apache/spark/sql/catalyst/plans/statistics/StatisticsTestSuite.scala @@ -23,6 +23,7 @@ import com.pingcap.tikv.predicates.PredicateUtils.expressionToIndexRanges import com.pingcap.tikv.predicates.TiKVScanAnalyzer import com.pingcap.tispark.statistics.StatisticsManager import org.apache.spark.sql.catalyst.plans.BasePlanTest +import org.scalatest.exceptions.TestFailedException import scala.collection.JavaConverters._ @@ -187,6 +188,27 @@ class StatisticsTestSuite extends BasePlanTest { } } + test("baseline test") { + val tableName = "full_data_type_table_idx" + val df = spark.sql( + s"select tp_bigint from $tableName where tp_tinyint = 0 and tp_int < 0" + ) + + checkIndex(df, "idx_tp_tinyint_tp_int") + + assertThrows[TestFailedException] { + checkIsTableScan(df, tableName) + } + + checkIsIndexScan(df, tableName) + + assertThrows[TestFailedException] { + checkIsCoveringIndexScan(df, tableName) + } + + checkEstimatedRowCount(df, tableName, 2) + } + override def afterAll(): Unit = try { tidbStmt.execute("DROP TABLE IF EXISTS 
`tb_fixed_float`") diff --git a/pom.xml b/pom.xml index 66ea97bbde..a72f5d7511 100644 --- a/pom.xml +++ b/pom.xml @@ -119,8 +119,7 @@ suites true - 1 - true + always 3.0.0-M3 diff --git a/tikv-client/README.md b/tikv-client/README.md index 6472b143f8..e7928c33b0 100644 --- a/tikv-client/README.md +++ b/tikv-client/README.md @@ -29,7 +29,7 @@ It is not recommended to use tikv java client directly; it is better to use toge ```java // Init tidb cluster configuration TiConfiguration conf = TiConfiguration.createDefault("127.0.0.1:2379"); -TiSession session = TiSession.create(conf); +TiSession session = TiSession.getInstance(conf); Catalog cat = session.getCatalog(); TiDBInfo db = cat.getDatabase("tpch_test"); TiTableInfo table = cat.getTable(db, "customer"); diff --git a/tikv-client/src/main/java/com/pingcap/tikv/TiSession.java b/tikv-client/src/main/java/com/pingcap/tikv/TiSession.java index 60f00b17b9..46ee4f6d73 100644 --- a/tikv-client/src/main/java/com/pingcap/tikv/TiSession.java +++ b/tikv-client/src/main/java/com/pingcap/tikv/TiSession.java @@ -29,11 +29,8 @@ import java.util.concurrent.ExecutorService; import java.util.concurrent.Executors; import java.util.function.Function; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; public class TiSession implements AutoCloseable { - private final Logger logger = LoggerFactory.getLogger(TiSession.class); private final TiConfiguration conf; private final ChannelFactory channelFactory; private Function cacheInvalidateCallback; @@ -46,7 +43,7 @@ public class TiSession implements AutoCloseable { private volatile RegionManager regionManager; private volatile RegionStoreClient.RegionStoreClientBuilder clientBuilder; - private static Map sessionCachedMap = new HashMap<>(); + private static final Map sessionCachedMap = new HashMap<>(); // Since we create session as singleton now, configuration change will not // reflect change @@ -180,10 +177,6 @@ public ExecutorService getThreadPoolForTableScan() { return res; } - public static TiSession create(TiConfiguration conf) { - return new TiSession(conf); - } - /** * This is used for setting call back function to invalidate cache information * @@ -193,10 +186,19 @@ public void injectCallBackFunc(Function callBackFunc this.cacheInvalidateCallback = callBackFunc; } + public static void clearCache() { + TiSession.sessionCachedMap.clear(); + } + @Override - public void close() throws Exception { - getThreadPoolForTableScan().shutdownNow(); - getThreadPoolForIndexScan().shutdownNow(); + public synchronized void close() throws Exception { + sessionCachedMap.remove(conf.getPdAddrsString()); + if (tableScanThreadPool != null) { + tableScanThreadPool.shutdownNow(); + } + if (indexScanThreadPool != null) { + indexScanThreadPool.shutdownNow(); + } if (client != null) { getPDClient().close(); } diff --git a/tikv-client/src/main/java/com/pingcap/tikv/allocator/RowIDAllocator.java b/tikv-client/src/main/java/com/pingcap/tikv/allocator/RowIDAllocator.java index f681eea220..325165b016 100644 --- a/tikv-client/src/main/java/com/pingcap/tikv/allocator/RowIDAllocator.java +++ b/tikv-client/src/main/java/com/pingcap/tikv/allocator/RowIDAllocator.java @@ -49,9 +49,9 @@ public static RowIDAllocator create( long dbId, long tableId, TiConfiguration conf, boolean unsigned, long step) { RowIDAllocator allocator = new RowIDAllocator(dbId, step, conf); if (unsigned) { - allocator.initUnsigned(TiSession.create(conf).createSnapshot(), tableId); + allocator.initUnsigned(TiSession.getInstance(conf).createSnapshot(), tableId); } 
else { - allocator.initSigned(TiSession.create(conf).createSnapshot(), tableId); + allocator.initSigned(TiSession.getInstance(conf).createSnapshot(), tableId); } return allocator; } @@ -170,11 +170,15 @@ public long getAutoTableId(long dbId, long tableId, long step, Snapshot snapshot /** read current row id from TiKV according to database id and table id. */ public long getAutoTableId(long dbId, long tableId, Snapshot snapshot) { - ByteString dbKey = MetaCodec.encodeDatabaseID(dbId); - ByteString tblKey = MetaCodec.autoTableIDKey(tableId); - ByteString val = MetaCodec.hashGet(dbKey, tblKey, snapshot); - if (val.isEmpty()) return 0L; - return Long.parseLong(val.toStringUtf8()); + if (isDBExisted(dbId, snapshot) && isTableExisted(dbId, tableId, snapshot)) { + ByteString dbKey = MetaCodec.encodeDatabaseID(dbId); + ByteString tblKey = MetaCodec.autoTableIDKey(tableId); + ByteString val = MetaCodec.hashGet(dbKey, tblKey, snapshot); + if (val.isEmpty()) return 0L; + return Long.parseLong(val.toStringUtf8()); + } + + throw new IllegalArgumentException("table or database is not existed"); } private void initSigned(Snapshot snapshot, long tableId) { diff --git a/tikv-client/src/test/java/com/pingcap/tikv/PDMockServerTest.java b/tikv-client/src/test/java/com/pingcap/tikv/PDMockServerTest.java index 991cd6a14a..6587e3f092 100644 --- a/tikv-client/src/test/java/com/pingcap/tikv/PDMockServerTest.java +++ b/tikv-client/src/test/java/com/pingcap/tikv/PDMockServerTest.java @@ -40,7 +40,7 @@ void setUp(String addr) throws IOException { GrpcUtils.makeMember(2, "http://" + addr + ":" + (pdServer.port + 1)), GrpcUtils.makeMember(3, "http://" + addr + ":" + (pdServer.port + 2)))); TiConfiguration conf = TiConfiguration.createDefault(addr + ":" + pdServer.port); - session = TiSession.create(conf); + session = TiSession.getInstance(conf); } @After diff --git a/tikv-client/src/test/java/com/pingcap/tikv/txn/LockResolverRCTest.java b/tikv-client/src/test/java/com/pingcap/tikv/txn/LockResolverRCTest.java new file mode 100644 index 0000000000..e763592b26 --- /dev/null +++ b/tikv-client/src/test/java/com/pingcap/tikv/txn/LockResolverRCTest.java @@ -0,0 +1,98 @@ +/* + * Copyright 2017 PingCAP, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package com.pingcap.tikv.txn; + +import static junit.framework.TestCase.*; + +import com.google.protobuf.ByteString; +import com.pingcap.tikv.TiConfiguration; +import com.pingcap.tikv.TiSession; +import com.pingcap.tikv.exception.KeyException; +import com.pingcap.tikv.meta.TiTimestamp; +import com.pingcap.tikv.region.RegionStoreClient; +import com.pingcap.tikv.region.TiRegion; +import java.util.Collections; +import org.apache.log4j.Logger; +import org.junit.Before; +import org.junit.Test; +import org.tikv.kvproto.Kvrpcpb.*; + +public class LockResolverRCTest extends LockResolverTest { + private final Logger logger = Logger.getLogger(this.getClass()); + + @Before + public void setUp() { + TiConfiguration conf = TiConfiguration.createDefault("127.0.0.1:2379"); + conf.setIsolationLevel(IsolationLevel.RC); + try { + session = TiSession.getInstance(conf); + pdClient = session.getPDClient(); + this.builder = session.getRegionStoreClientBuilder(); + init = true; + } catch (Exception e) { + logger.warn("TiDB cluster may not be present"); + init = false; + } + } + + @Test + public void getRCTest() { + if (!init) { + skipTest(); + return; + } + putAlphabet(); + prepareAlphabetLocks(); + + versionTest(); + } + + @Test + public void RCTest() { + if (!init) { + skipTest(); + return; + } + TiTimestamp startTs = pdClient.getTimestamp(backOffer); + TiTimestamp endTs = pdClient.getTimestamp(backOffer); + + putKV("a", "a", startTs.getVersion(), endTs.getVersion()); + + startTs = pdClient.getTimestamp(backOffer); + endTs = pdClient.getTimestamp(backOffer); + + lockKey("a", "aa", "a", "aa", false, startTs.getVersion(), endTs.getVersion()); + + TiRegion tiRegion = + session.getRegionManager().getRegionByKey(ByteString.copyFromUtf8(String.valueOf('a'))); + RegionStoreClient client = builder.build(tiRegion); + ByteString v = + client.get( + backOffer, + ByteString.copyFromUtf8(String.valueOf('a')), + pdClient.getTimestamp(backOffer).getVersion()); + assertEquals(v.toStringUtf8(), String.valueOf('a')); + + try { + commit( + startTs.getVersion(), + endTs.getVersion(), + Collections.singletonList(ByteString.copyFromUtf8("a"))); + } catch (KeyException e) { + fail(); + } + } +} diff --git a/tikv-client/src/test/java/com/pingcap/tikv/txn/LockResolverSITest.java b/tikv-client/src/test/java/com/pingcap/tikv/txn/LockResolverSITest.java new file mode 100644 index 0000000000..5b684c3d8c --- /dev/null +++ b/tikv-client/src/test/java/com/pingcap/tikv/txn/LockResolverSITest.java @@ -0,0 +1,195 @@ +/* + * Copyright 2017 PingCAP, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package com.pingcap.tikv.txn; + +import static junit.framework.TestCase.*; + +import com.google.protobuf.ByteString; +import com.pingcap.tikv.TiConfiguration; +import com.pingcap.tikv.TiSession; +import com.pingcap.tikv.exception.KeyException; +import com.pingcap.tikv.meta.TiTimestamp; +import com.pingcap.tikv.region.RegionStoreClient; +import com.pingcap.tikv.region.TiRegion; +import java.util.ArrayList; +import java.util.Collections; +import java.util.List; +import org.apache.log4j.Logger; +import org.junit.Before; +import org.junit.Test; +import org.tikv.kvproto.Kvrpcpb.*; + +public class LockResolverSITest extends LockResolverTest { + private final Logger logger = Logger.getLogger(this.getClass()); + + @Before + public void setUp() { + TiConfiguration conf = TiConfiguration.createDefault("127.0.0.1:2379"); + conf.setIsolationLevel(IsolationLevel.SI); + TiSession.clearCache(); + try { + session = TiSession.getInstance(conf); + pdClient = session.getPDClient(); + this.builder = session.getRegionStoreClientBuilder(); + init = true; + } catch (Exception e) { + logger.warn("TiDB cluster may not be present"); + init = false; + } + } + + @Test + public void getSITest() { + if (!init) { + skipTest(); + return; + } + putAlphabet(); + prepareAlphabetLocks(); + + versionTest(); + } + + @Test + public void cleanLockTest() { + if (!init) { + skipTest(); + return; + } + for (int i = 0; i < 26; i++) { + String k = String.valueOf((char) ('a' + i)); + TiTimestamp startTs = pdClient.getTimestamp(backOffer); + TiTimestamp endTs = pdClient.getTimestamp(backOffer); + lockKey(k, k, k, k, false, startTs.getVersion(), endTs.getVersion()); + } + + List mutations = new ArrayList<>(); + List keys = new ArrayList<>(); + for (int i = 0; i < 26; i++) { + String k = String.valueOf((char) ('a' + i)); + String v = String.valueOf((char) ('a' + i + 1)); + Mutation m = + Mutation.newBuilder() + .setKey(ByteString.copyFromUtf8(k)) + .setOp(Op.Put) + .setValue(ByteString.copyFromUtf8(v)) + .build(); + mutations.add(m); + keys.add(ByteString.copyFromUtf8(k)); + } + + TiTimestamp startTs = pdClient.getTimestamp(backOffer); + TiTimestamp endTs = pdClient.getTimestamp(backOffer); + + boolean res = prewrite(mutations, startTs.getVersion(), mutations.get(0)); + assertTrue(res); + res = commit(startTs.getVersion(), endTs.getVersion(), keys); + assertTrue(res); + + for (int i = 0; i < 26; i++) { + TiRegion tiRegion = + session + .getRegionManager() + .getRegionByKey(ByteString.copyFromUtf8(String.valueOf((char) ('a' + i)))); + RegionStoreClient client = builder.build(tiRegion); + ByteString v = + client.get( + backOffer, + ByteString.copyFromUtf8(String.valueOf((char) ('a' + i))), + pdClient.getTimestamp(backOffer).getVersion()); + assertEquals(v.toStringUtf8(), String.valueOf((char) ('a' + i + 1))); + } + } + + @Test + public void txnStatusTest() { + if (!init) { + skipTest(); + return; + } + TiTimestamp startTs = pdClient.getTimestamp(backOffer); + TiTimestamp endTs = pdClient.getTimestamp(backOffer); + + putKV("a", "a", startTs.getVersion(), endTs.getVersion()); + TiRegion tiRegion = + session.getRegionManager().getRegionByKey(ByteString.copyFromUtf8(String.valueOf('a'))); + RegionStoreClient client = builder.build(tiRegion); + long status = + client.lockResolverClient.getTxnStatus( + backOffer, startTs.getVersion(), ByteString.copyFromUtf8(String.valueOf('a'))); + assertEquals(status, endTs.getVersion()); + + startTs = pdClient.getTimestamp(backOffer); + endTs = pdClient.getTimestamp(backOffer); + + lockKey("a", "a", 
"a", "a", true, startTs.getVersion(), endTs.getVersion()); + tiRegion = + session.getRegionManager().getRegionByKey(ByteString.copyFromUtf8(String.valueOf('a'))); + client = builder.build(tiRegion); + status = + client.lockResolverClient.getTxnStatus( + backOffer, startTs.getVersion(), ByteString.copyFromUtf8(String.valueOf('a'))); + assertEquals(status, endTs.getVersion()); + + startTs = pdClient.getTimestamp(backOffer); + endTs = pdClient.getTimestamp(backOffer); + + lockKey("a", "a", "a", "a", false, startTs.getVersion(), endTs.getVersion()); + tiRegion = + session.getRegionManager().getRegionByKey(ByteString.copyFromUtf8(String.valueOf('a'))); + client = builder.build(tiRegion); + status = + client.lockResolverClient.getTxnStatus( + backOffer, startTs.getVersion(), ByteString.copyFromUtf8(String.valueOf('a'))); + assertNotSame(status, endTs.getVersion()); + } + + @Test + public void SITest() { + if (!init) { + skipTest(); + return; + } + TiTimestamp startTs = pdClient.getTimestamp(backOffer); + TiTimestamp endTs = pdClient.getTimestamp(backOffer); + + putKV("a", "a", startTs.getVersion(), endTs.getVersion()); + + startTs = pdClient.getTimestamp(backOffer); + endTs = pdClient.getTimestamp(backOffer); + + lockKey("a", "aa", "a", "aa", false, startTs.getVersion(), endTs.getVersion()); + + TiRegion tiRegion = + session.getRegionManager().getRegionByKey(ByteString.copyFromUtf8(String.valueOf('a'))); + RegionStoreClient client = builder.build(tiRegion); + ByteString v = + client.get( + backOffer, + ByteString.copyFromUtf8(String.valueOf('a')), + pdClient.getTimestamp(backOffer).getVersion()); + assertEquals(v.toStringUtf8(), String.valueOf('a')); + + try { + commit( + startTs.getVersion(), + endTs.getVersion(), + Collections.singletonList(ByteString.copyFromUtf8("a"))); + fail(); + } catch (KeyException e) { + } + } +} diff --git a/tikv-client/src/test/java/com/pingcap/tikv/txn/LockResolverTest.java b/tikv-client/src/test/java/com/pingcap/tikv/txn/LockResolverTest.java index 005bf469be..296bd2ee41 100644 --- a/tikv-client/src/test/java/com/pingcap/tikv/txn/LockResolverTest.java +++ b/tikv-client/src/test/java/com/pingcap/tikv/txn/LockResolverTest.java @@ -20,7 +20,6 @@ import com.google.protobuf.ByteString; import com.pingcap.tikv.ReadOnlyPDClient; -import com.pingcap.tikv.TiConfiguration; import com.pingcap.tikv.TiSession; import com.pingcap.tikv.exception.KeyException; import com.pingcap.tikv.exception.RegionException; @@ -38,19 +37,22 @@ import java.util.function.Supplier; import org.apache.log4j.Logger; import org.junit.Before; -import org.junit.Test; import org.tikv.kvproto.Kvrpcpb.*; import org.tikv.kvproto.TikvGrpc; -public class LockResolverTest { +public abstract class LockResolverTest { private final Logger logger = Logger.getLogger(this.getClass()); - private TiSession session; + TiSession session; private static final int DefaultTTL = 10; - private BackOffer backOffer = ConcreteBackOffer.newCustomBackOff(1000); - private ReadOnlyPDClient pdClient; - private boolean init; + BackOffer backOffer = ConcreteBackOffer.newCustomBackOff(1000); + ReadOnlyPDClient pdClient; + RegionStoreClient.RegionStoreClientBuilder builder; + boolean init; - private void putKV(String key, String value, long startTS, long commitTS) { + @Before + public abstract void setUp(); + + void putKV(String key, String value, long startTS, long commitTS) { Mutation m = Mutation.newBuilder() .setKey(ByteString.copyFromUtf8(key)) @@ -64,7 +66,7 @@ private void putKV(String key, String value, long startTS, long 
commitTS) { assertTrue(res); } - private boolean prewrite(List mutations, long startTS, Mutation primary) { + boolean prewrite(List mutations, long startTS, Mutation primary) { if (mutations.size() == 0) return true; for (Mutation m : mutations) { @@ -131,7 +133,7 @@ private boolean prewrite(List mutations, long startTS, Mutation primar return true; } - private boolean lockKey( + boolean lockKey( String key, String value, String primaryKey, @@ -171,7 +173,7 @@ private boolean lockKey( return true; } - private boolean commit(long startTS, long commitTS, List keys) { + boolean commit(long startTS, long commitTS, List keys) { if (keys.size() == 0) return true; for (ByteString k : keys) { @@ -208,7 +210,7 @@ private boolean commit(long startTS, long commitTS, List keys) { return true; } - private void putAlphabet() { + void putAlphabet() { for (int i = 0; i < 26; i++) { long startTs = pdClient.getTimestamp(backOffer).getVersion(); long endTs = pdClient.getTimestamp(backOffer).getVersion(); @@ -220,7 +222,7 @@ private void putAlphabet() { versionTest(); } - private void prepareAlphabetLocks() { + void prepareAlphabetLocks() { TiTimestamp startTs = pdClient.getTimestamp(backOffer); TiTimestamp endTs = pdClient.getTimestamp(backOffer); while (startTs == endTs) { @@ -242,40 +244,11 @@ private void prepareAlphabetLocks() { assertTrue(lockKey("d", "dd", "z2", "z2", false, startTs.getVersion(), endTs.getVersion())); } - private RegionStoreClient.RegionStoreClientBuilder builder; - - @Before - public void setUp() { - TiConfiguration conf = TiConfiguration.createDefault("127.0.0.1:2379"); - try { - session = TiSession.create(conf); - pdClient = session.getPDClient(); - this.builder = session.getRegionStoreClientBuilder(); - init = true; - } catch (Exception e) { - logger.warn("TiDB cluster may not be present"); - init = false; - } - } - - private void skipTest() { + void skipTest() { logger.warn("Test skipped due to failure in initializing pd client."); } - @Test - public void getSITest() { - if (!init) { - skipTest(); - return; - } - session.getConf().setIsolationLevel(IsolationLevel.SI); - putAlphabet(); - prepareAlphabetLocks(); - - versionTest(); - } - - private void versionTest() { + void versionTest() { for (int i = 0; i < 26; i++) { TiRegion tiRegion = session @@ -290,192 +263,4 @@ private void versionTest() { assertEquals(v.toStringUtf8(), String.valueOf((char) ('a' + i))); } } - - @Test - public void getRCTest() { - if (!init) { - skipTest(); - return; - } - session.getConf().setIsolationLevel(IsolationLevel.RC); - putAlphabet(); - prepareAlphabetLocks(); - - versionTest(); - } - - @Test - public void cleanLockTest() { - if (!init) { - skipTest(); - return; - } - session.getConf().setIsolationLevel(IsolationLevel.SI); - for (int i = 0; i < 26; i++) { - String k = String.valueOf((char) ('a' + i)); - TiTimestamp startTs = pdClient.getTimestamp(backOffer); - TiTimestamp endTs = pdClient.getTimestamp(backOffer); - lockKey(k, k, k, k, false, startTs.getVersion(), endTs.getVersion()); - } - - List mutations = new ArrayList<>(); - List keys = new ArrayList<>(); - for (int i = 0; i < 26; i++) { - String k = String.valueOf((char) ('a' + i)); - String v = String.valueOf((char) ('a' + i + 1)); - Mutation m = - Mutation.newBuilder() - .setKey(ByteString.copyFromUtf8(k)) - .setOp(Op.Put) - .setValue(ByteString.copyFromUtf8(v)) - .build(); - mutations.add(m); - keys.add(ByteString.copyFromUtf8(k)); - } - - TiTimestamp startTs = pdClient.getTimestamp(backOffer); - TiTimestamp endTs = 
pdClient.getTimestamp(backOffer); - - boolean res = prewrite(mutations, startTs.getVersion(), mutations.get(0)); - assertTrue(res); - res = commit(startTs.getVersion(), endTs.getVersion(), keys); - assertTrue(res); - - for (int i = 0; i < 26; i++) { - TiRegion tiRegion = - session - .getRegionManager() - .getRegionByKey(ByteString.copyFromUtf8(String.valueOf((char) ('a' + i)))); - RegionStoreClient client = builder.build(tiRegion); - ByteString v = - client.get( - backOffer, - ByteString.copyFromUtf8(String.valueOf((char) ('a' + i))), - pdClient.getTimestamp(backOffer).getVersion()); - assertEquals(v.toStringUtf8(), String.valueOf((char) ('a' + i + 1))); - } - - session.getConf().setIsolationLevel(IsolationLevel.RC); - } - - @Test - public void txnStatusTest() { - if (!init) { - skipTest(); - return; - } - session.getConf().setIsolationLevel(IsolationLevel.SI); - TiTimestamp startTs = pdClient.getTimestamp(backOffer); - TiTimestamp endTs = pdClient.getTimestamp(backOffer); - - putKV("a", "a", startTs.getVersion(), endTs.getVersion()); - TiRegion tiRegion = - session.getRegionManager().getRegionByKey(ByteString.copyFromUtf8(String.valueOf('a'))); - RegionStoreClient client = builder.build(tiRegion); - long status = - client.lockResolverClient.getTxnStatus( - backOffer, startTs.getVersion(), ByteString.copyFromUtf8(String.valueOf('a'))); - assertEquals(status, endTs.getVersion()); - - startTs = pdClient.getTimestamp(backOffer); - endTs = pdClient.getTimestamp(backOffer); - - lockKey("a", "a", "a", "a", true, startTs.getVersion(), endTs.getVersion()); - tiRegion = - session.getRegionManager().getRegionByKey(ByteString.copyFromUtf8(String.valueOf('a'))); - client = builder.build(tiRegion); - status = - client.lockResolverClient.getTxnStatus( - backOffer, startTs.getVersion(), ByteString.copyFromUtf8(String.valueOf('a'))); - assertEquals(status, endTs.getVersion()); - - startTs = pdClient.getTimestamp(backOffer); - endTs = pdClient.getTimestamp(backOffer); - - lockKey("a", "a", "a", "a", false, startTs.getVersion(), endTs.getVersion()); - tiRegion = - session.getRegionManager().getRegionByKey(ByteString.copyFromUtf8(String.valueOf('a'))); - client = builder.build(tiRegion); - status = - client.lockResolverClient.getTxnStatus( - backOffer, startTs.getVersion(), ByteString.copyFromUtf8(String.valueOf('a'))); - assertNotSame(status, endTs.getVersion()); - - session.getConf().setIsolationLevel(IsolationLevel.RC); - } - - @Test - public void SITest() { - if (!init) { - skipTest(); - return; - } - session.getConf().setIsolationLevel(IsolationLevel.SI); - TiTimestamp startTs = pdClient.getTimestamp(backOffer); - TiTimestamp endTs = pdClient.getTimestamp(backOffer); - - putKV("a", "a", startTs.getVersion(), endTs.getVersion()); - - startTs = pdClient.getTimestamp(backOffer); - endTs = pdClient.getTimestamp(backOffer); - - lockKey("a", "aa", "a", "aa", false, startTs.getVersion(), endTs.getVersion()); - - TiRegion tiRegion = - session.getRegionManager().getRegionByKey(ByteString.copyFromUtf8(String.valueOf('a'))); - RegionStoreClient client = builder.build(tiRegion); - ByteString v = - client.get( - backOffer, - ByteString.copyFromUtf8(String.valueOf('a')), - pdClient.getTimestamp(backOffer).getVersion()); - assertEquals(v.toStringUtf8(), String.valueOf('a')); - - try { - commit( - startTs.getVersion(), - endTs.getVersion(), - Collections.singletonList(ByteString.copyFromUtf8("a"))); - fail(); - } catch (KeyException e) { - } - session.getConf().setIsolationLevel(IsolationLevel.RC); - } - - @Test - 
public void RCTest() { - if (!init) { - skipTest(); - return; - } - session.getConf().setIsolationLevel(IsolationLevel.RC); - TiTimestamp startTs = pdClient.getTimestamp(backOffer); - TiTimestamp endTs = pdClient.getTimestamp(backOffer); - - putKV("a", "a", startTs.getVersion(), endTs.getVersion()); - - startTs = pdClient.getTimestamp(backOffer); - endTs = pdClient.getTimestamp(backOffer); - - lockKey("a", "aa", "a", "aa", false, startTs.getVersion(), endTs.getVersion()); - - TiRegion tiRegion = - session.getRegionManager().getRegionByKey(ByteString.copyFromUtf8(String.valueOf('a'))); - RegionStoreClient client = builder.build(tiRegion); - ByteString v = - client.get( - backOffer, - ByteString.copyFromUtf8(String.valueOf('a')), - pdClient.getTimestamp(backOffer).getVersion()); - assertEquals(v.toStringUtf8(), String.valueOf('a')); - - try { - commit( - startTs.getVersion(), - endTs.getVersion(), - Collections.singletonList(ByteString.copyFromUtf8("a"))); - } catch (KeyException e) { - fail(); - } - } } From 9551f7d9ae86728c35c10dc89fe13afb6d845837 Mon Sep 17 00:00:00 2001 From: Liangliang Gu Date: Fri, 16 Aug 2019 16:32:02 +0800 Subject: [PATCH 44/62] set tikv region size from 96M to 1M (#1031) --- .ci/integration_test.groovy | 4 ++-- config/tikv.toml | 1 + .../scala/org/apache/spark/sql/test/SharedSQLContext.scala | 1 + 3 files changed, 4 insertions(+), 2 deletions(-) diff --git a/.ci/integration_test.groovy b/.ci/integration_test.groovy index 9177f7f02e..951519ce44 100644 --- a/.ci/integration_test.groovy +++ b/.ci/integration_test.groovy @@ -187,9 +187,9 @@ def call(ghprbActualCommit, ghprbCommentBody, ghprbPullId, ghprbPullTitle, ghprb killall -9 tikv-server || true killall -9 pd-server || true sleep 10 - bin/pd-server --name=pd --data-dir=pd &>pd.log & + bin/pd-server --name=pd --data-dir=pd --config=go/src/github.com/pingcap/tispark/config/pd.toml &>pd.log & sleep 10 - bin/tikv-server --pd=127.0.0.1:2379 -s tikv --addr=0.0.0.0:20160 --advertise-addr=127.0.0.1:20160 &>tikv.log & + bin/tikv-server --pd=127.0.0.1:2379 -s tikv --addr=0.0.0.0:20160 --advertise-addr=127.0.0.1:20160 --config=go/src/github.com/pingcap/tispark/config/tikv.toml &>tikv.log & sleep 10 ps aux | grep '-server' || true curl -s 127.0.0.1:2379/pd/api/v1/status || true diff --git a/config/tikv.toml b/config/tikv.toml index 7355f5e67e..180ee14e33 100644 --- a/config/tikv.toml +++ b/config/tikv.toml @@ -148,6 +148,7 @@ address = "pushgateway:9091" # bit smaller. 
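# The integration-test config below lowers region-split-size from the default 96MB
# to 1MB so that even small test tables are split across many TiKV regions.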
# region-max-size = "144MB" # region-split-size = "96MB" +region-split-size = "1MB" [rocksdb] # Maximum number of concurrent background jobs (compactions and flushes) diff --git a/core/src/test/scala/org/apache/spark/sql/test/SharedSQLContext.scala b/core/src/test/scala/org/apache/spark/sql/test/SharedSQLContext.scala index 65260aa6a6..0604780a36 100644 --- a/core/src/test/scala/org/apache/spark/sql/test/SharedSQLContext.scala +++ b/core/src/test/scala/org/apache/spark/sql/test/SharedSQLContext.scala @@ -415,6 +415,7 @@ object SharedSQLContext extends Logging { sparkConf.set("spark.tispark.write.allow_spark_sql", "true") sparkConf.set("spark.tispark.write.without_lock_table", "true") + sparkConf.set("spark.tispark.tikv.region_split_size_in_mb", "1") if (isHiveEnabled) { // delete meta store directory to avoid multiple derby instances SPARK-10872 From e7d51f50a8aa07471ce4ab256cb939ec4e7247d3 Mon Sep 17 00:00:00 2001 From: Zhexuan Yang Date: Sat, 17 Aug 2019 15:04:34 +0800 Subject: [PATCH 45/62] adding unique indices test for batch write (#1014) --- .../datasource/BaseDataSourceTest.scala | 72 ++++++--- .../BatchWriteUniqueIndexSuite.scala | 80 ++++++++++ ...erationUniqueIndexDataTypeTestAction.scala | 88 +++++++++++ ...rator.scala => ColumnValueGenerator.scala} | 138 +++++++++--------- .../sql/test/generator/IndexColumn.scala | 4 +- .../spark/sql/test/generator/Schema.scala | 1 + .../test/generator/TestDataGenerator.scala | 20 +-- .../tikv/allocator/RowIDAllocator.java | 1 + .../tikv/region/RegionStoreClient.java | 1 + 9 files changed, 309 insertions(+), 96 deletions(-) create mode 100644 core/src/test/scala/org/apache/spark/sql/insertion/BatchWriteUniqueIndexSuite.scala create mode 100644 core/src/test/scala/org/apache/spark/sql/insertion/EnumerationUniqueIndexDataTypeTestAction.scala rename core/src/test/scala/org/apache/spark/sql/test/generator/{ValueGenerator.scala => ColumnValueGenerator.scala} (69%) diff --git a/core/src/test/scala/com/pingcap/tispark/datasource/BaseDataSourceTest.scala b/core/src/test/scala/com/pingcap/tispark/datasource/BaseDataSourceTest.scala index ffa3b86471..42e033979b 100644 --- a/core/src/test/scala/com/pingcap/tispark/datasource/BaseDataSourceTest.scala +++ b/core/src/test/scala/com/pingcap/tispark/datasource/BaseDataSourceTest.scala @@ -39,19 +39,29 @@ class BaseDataSourceTest(val table: String, TiSession.clearCache() } - protected def tidbWrite(rows: List[Row], - schema: StructType, - param: Option[Map[String, String]] = None): Unit = { + protected def dropTable(tblName: String): Unit = { + jdbcUpdate(s"drop table if exists `$database`.`$tblName`") + } + + protected def tidbWriteWithTable(rows: List[Row], + schema: StructType, + tblName: String, + param: Option[Map[String, String]] = None): Unit = { val data: RDD[Row] = sc.makeRDD(rows) val df = sqlContext.createDataFrame(data, schema) df.write .format("tidb") .options(tidbOptions ++ param.getOrElse(Map.empty)) .option("database", database) - .option("table", table) + .option("table", tblName) .mode("append") .save() } + protected def tidbWrite(rows: List[Row], + schema: StructType, + param: Option[Map[String, String]] = None): Unit = { + tidbWriteWithTable(rows, schema, table, param) + } protected def jdbcWrite(rows: List[Row], schema: StructType, @@ -67,23 +77,31 @@ class BaseDataSourceTest(val table: String, .save() } - protected def testTiDBSelect(expectedAnswer: Seq[Row], - sortCol: String = "i", - selectCol: String = null): Unit = { + protected def testTiDBSelectWithTable( + expectedAnswer: Seq[Row], 
+ sortCol: String = "i", + selectCol: String = null, + tableName: String + ) = { // check data source result & expected answer - var df = queryDatasourceTiDB(sortCol) + var df = queryDatasourceTiDBWithTable(sortCol, tableName) if (selectCol != null) { df = df.select(selectCol) } checkAnswer(df, expectedAnswer) // check table scan - var df2 = queryDatasourceTableScan(sortCol) + var df2 = queryDatasourceTableScanWithTable(sortCol, tableName) if (selectCol != null) { df2 = df2.select(selectCol) } checkAnswer(df2, expectedAnswer) } + protected def testTiDBSelect(expectedAnswer: Seq[Row], + sortCol: String = "i", + selectCol: String = null): Unit = { + testTiDBSelectWithTable(expectedAnswer, sortCol, selectCol, table) + } protected def compareTiDBWriteFailureWithJDBC( data: List[Row], @@ -175,17 +193,26 @@ class BaseDataSourceTest(val table: String, } - protected def compareTiDBSelectWithJDBC_V2(sortCol: String = "i"): Unit = { - val sql = s"select * from $dbtable order by $sortCol" + protected def compareTiDBSelectWithJDBCWithTable_V2(tblName: String, + sortCol: String = "i"): Unit = { + val sql = s"select * from `$database`.`$tblName` order by $sortCol" // check jdbc result & data source result val jdbcResult = queryTiDBViaJDBC(sql) - val df = queryDatasourceTiDB(sortCol) + val df = queryDatasourceTiDBWithTable(sortCol, tableName = tblName) val tidbResult = seqRowToList(df.collect(), df.schema) - assert( - compSqlResult(sql, jdbcResult, tidbResult, checkLimit = false) - ) + if (compResult(jdbcResult, tidbResult)) { + assert(true) + } else { + println(s"failed on $tblName") + println(tidbResult) + assert(false) + } + } + + protected def compareTiDBSelectWithJDBC_V2(sortCol: String = "i"): Unit = { + compareTiDBSelectWithJDBCWithTable_V2(table, sortCol) } private def seqRowToList(rows: Seq[Row], schema: StructType): List[List[Any]] = @@ -203,26 +230,33 @@ class BaseDataSourceTest(val table: String, }) .toList - protected def queryDatasourceTableScan(sortCol: String): DataFrame = { + protected def queryDatasourceTableScanWithTable(sortCol: String, tblName: String): DataFrame = { sqlContext.read .format("tidb") .options(tidbOptions) .option("database", database) - .option("table", table) + .option("table", tblName) .option(TiConfigConst.ALLOW_INDEX_READ, "false") .load() .sort(sortCol) } - protected def queryDatasourceTiDB(sortCol: String): DataFrame = + protected def queryDatasourceTableScan(sortCol: String): DataFrame = { + queryDatasourceTableScanWithTable(sortCol, table) + } + + protected def queryDatasourceTiDBWithTable(sortCol: String, tableName: String): DataFrame = sqlContext.read .format("tidb") .options(tidbOptions) .option("database", database) - .option("table", table) + .option("table", tableName) .load() .sort(sortCol) + protected def queryDatasourceTiDB(sortCol: String): DataFrame = + queryDatasourceTiDBWithTable(sortCol, table) + protected def testTiDBSelectFilter(filter: String, expectedAnswer: Seq[Row]): Unit = { val loadedDf = sqlContext.read .format("tidb") diff --git a/core/src/test/scala/org/apache/spark/sql/insertion/BatchWriteUniqueIndexSuite.scala b/core/src/test/scala/org/apache/spark/sql/insertion/BatchWriteUniqueIndexSuite.scala new file mode 100644 index 0000000000..53cb09f5ab --- /dev/null +++ b/core/src/test/scala/org/apache/spark/sql/insertion/BatchWriteUniqueIndexSuite.scala @@ -0,0 +1,80 @@ +package org.apache.spark.sql.insertion + +import com.pingcap.tikv.meta.TiColumnInfo +import com.pingcap.tispark.datasource.BaseDataSourceTest +import 
com.pingcap.tispark.utils.TiUtil +import org.apache.spark.sql.Row +import org.apache.spark.sql.test.generator.DataType.ReflectedDataType +import org.apache.spark.sql.test.generator.Schema +import org.apache.spark.sql.test.generator.TestDataGenerator._ + +class BatchWriteUniqueIndexSuite + extends BaseDataSourceTest("batch_write_insertion_one_unique_index", "batch_write_test_index") + with EnumerationUniqueIndexDataTypeTestAction { + // TODO: support binary insertion. + override val dataTypes: List[ReflectedDataType] = integers ::: decimals ::: doubles ::: charCharset + override val unsignedDataTypes: List[ReflectedDataType] = integers ::: decimals ::: doubles + override val dataTypeTestDir = "batch-write-test-index" + override val database = "batch_write_test_index" + override val testDesc = "Test for single PK column and multiple unique index type" + + override def beforeAll(): Unit = { + super.beforeAll() + tidbStmt.execute(s"drop database if exists $database") + tidbStmt.execute(s"create database $database") + } + + private def tiRowToSparkRow(row: TiRow, tiColsInfos: java.util.List[TiColumnInfo]) = { + val sparkRow = new Array[Any](row.fieldCount()) + for (i <- 0 until row.fieldCount()) { + val colTp = tiColsInfos.get(i).getType + val colVal = row.get(i, colTp) + sparkRow(i) = colVal + } + Row.fromSeq(sparkRow) + } + + private def dropAndCreateTbl(schema: Schema): Unit = { + // drop table if exits + dropTable(schema.tableName) + + // create table in tidb first + jdbcUpdate(schema.toString) + } + + private def insertAndSelect(schema: Schema): Unit = { + val tblName = schema.tableName + + val tiTblInfo = getTableInfo(database, tblName) + val tiColInfos = tiTblInfo.getColumns + // gen data + val rows = + generateRandomRows(schema, rowCount, r).map(row => tiRowToSparkRow(row, tiColInfos)) + // insert data to tikv + tidbWriteWithTable(rows, TiUtil.getSchemaFromTable(tiTblInfo), tblName) + // select data from tikv and compare with tidb + compareTiDBSelectWithJDBCWithTable_V2(tblName = tblName, "col_bigint") + } + + test("test unique indices cases") { + val schemas = genSchema(dataTypes, table) + + schemas.foreach { schema => + dropAndCreateTbl(schema) + } + + schemas.foreach { schema => + insertAndSelect(schema) + } + } + + // this is only for + override def test(): Unit = {} + + override def afterAll(): Unit = + try { + dropTable() + } finally { + super.afterAll() + } +} diff --git a/core/src/test/scala/org/apache/spark/sql/insertion/EnumerationUniqueIndexDataTypeTestAction.scala b/core/src/test/scala/org/apache/spark/sql/insertion/EnumerationUniqueIndexDataTypeTestAction.scala new file mode 100644 index 0000000000..262ca5fe36 --- /dev/null +++ b/core/src/test/scala/org/apache/spark/sql/insertion/EnumerationUniqueIndexDataTypeTestAction.scala @@ -0,0 +1,88 @@ +package org.apache.spark.sql.insertion + +import org.apache.commons.math3.util.Combinations +import org.apache.spark.sql.BaseTestGenerationSpec +import org.apache.spark.sql.test.generator.DataType.{getBaseType, DECIMAL, ReflectedDataType} +import org.apache.spark.sql.test.generator.TestDataGenerator.{getDecimal, getLength, isCharOrBinary, isNumeric, isStringType, isVarString, schemaGenerator} +import org.apache.spark.sql.test.generator._ +import org.apache.spark.sql.types.MultiColumnDataTypeTestSpec + +import scala.util.Random + +trait EnumerationUniqueIndexDataTypeTestAction + extends MultiColumnDataTypeTestSpec + with BaseTestGenerationSpec { + private def genIndex(dataTypes: List[ReflectedDataType], r: Random): List[Index] = { + 
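+    // Enumerates every 1- and 2-column combination of the generated columns as a
+    // candidate unique key: string-typed columns are indexed through a short random
+    // prefix (PrefixColumn), all other types through the whole column (DefaultColumn).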
val size = dataTypes.length + // the first step is generate all possible keys + val keyList = scala.collection.mutable.ListBuffer.empty[Key] + for (i <- 1 until 3) { + val combination = new Combinations(size, i) + //(i, size) + val iterator = combination.iterator() + while (iterator.hasNext) { + val intArray = iterator.next() + val indexColumnList = scala.collection.mutable.ListBuffer.empty[IndexColumn] + // index may have multiple column + for (j <- 0 until intArray.length) { + // we add extra one to the column id since 1 is reserved to primary key + if (isStringType(dataTypes(intArray(j)))) { + indexColumnList += PrefixColumn(intArray(j) + 1, r.nextInt(4) + 2) + } else { + indexColumnList += DefaultColumn(intArray(j) + 1) + } + } + + keyList += Key(indexColumnList.toList) + } + } + + keyList.toList + } + + def genLen(dataType: ReflectedDataType): String = { + val baseType = getBaseType(dataType) + val length = getLength(baseType) + dataType match { + case DECIMAL => s"$length,${getDecimal(baseType)}" + case _ if isVarString(dataType) => s"$length" + case _ if isCharOrBinary(dataType) => "10" + case _ => "" + } + } + + // this only generate schema with one unique index + def genSchema(dataTypes: List[ReflectedDataType], tablePrefix: String): List[Schema] = { + val indices = genIndex(dataTypes, r) + + val dataTypesWithDescription = dataTypes.map { dataType => + val len = genLen(dataType) + if (isNumeric(dataType)) { + (dataType, len, "not null") + } else { + (dataType, len, "") + } + } + + indices.zipWithIndex.map { index => + schemaGenerator( + database, + tablePrefix + index._2, + r, + dataTypesWithDescription, + List(index._1) + ) + } + } + + private def toString(dataTypes: Seq[String]): String = dataTypes.hashCode().toString + + override val rowCount = 10 + + override def getTableName(dataTypes: String*): String = s"test_${toString(dataTypes)}" + + override def getTableNameWithDesc(desc: String, dataTypes: String*): String = + s"test_${desc}_${toString(dataTypes)}" + + override def getIndexName(dataTypes: String*): String = s"idx_${toString(dataTypes)}" +} diff --git a/core/src/test/scala/org/apache/spark/sql/test/generator/ValueGenerator.scala b/core/src/test/scala/org/apache/spark/sql/test/generator/ColumnValueGenerator.scala similarity index 69% rename from core/src/test/scala/org/apache/spark/sql/test/generator/ValueGenerator.scala rename to core/src/test/scala/org/apache/spark/sql/test/generator/ColumnValueGenerator.scala index 3c7a16027e..3043de16c6 100644 --- a/core/src/test/scala/org/apache/spark/sql/test/generator/ValueGenerator.scala +++ b/core/src/test/scala/org/apache/spark/sql/test/generator/ColumnValueGenerator.scala @@ -23,15 +23,15 @@ import org.apache.spark.sql.test.generator.TestDataGenerator.{checkUnique, getLe import scala.collection.mutable import scala.util.Random -case class ValueGenerator(dataType: ReflectedDataType, - M: Long = -1, - D: Int = -1, - nullable: Boolean = true, - isUnsigned: Boolean = false, - noDefault: Boolean = false, - default: Any = null, - isPrimaryKey: Boolean = false, - isUnique: Boolean = false) { +case class ColumnValueGenerator(dataType: ReflectedDataType, + M: Long = -1, + D: Int = -1, + nullable: Boolean = true, + isUnsigned: Boolean = false, + noDefault: Boolean = false, + default: Any = null, + isPrimaryKey: Boolean = false, + isUnique: Boolean = false) { private val flag: Int = { import com.pingcap.tikv.types.DataType._ @@ -137,65 +137,73 @@ case class ValueGenerator(dataType: ReflectedDataType, } } + def randomUnsignedValue(r: 
Random): Any = { + if (!isNumeric(dataType)) { + throw new IllegalArgumentException("unsigned type is not numeric") + } + dataType match { + case BIT => + val bit: Array[Boolean] = new Array[Boolean](tiDataType.getLength.toInt) + bit.map(_ => r.nextBoolean) + case BOOLEAN => r.nextInt(1 << 1) + case TINYINT => r.nextInt(1 << 8) + case SMALLINT => r.nextInt(1 << 16) + case MEDIUMINT => r.nextInt(1 << 24) + case INT => r.nextInt + (1L << 31) + case BIGINT => toUnsignedBigInt(r.nextLong) + case FLOAT => Math.abs(r.nextFloat) + case DOUBLE => Math.abs(r.nextDouble) + case DECIMAL => + val len = getLength(tiDataType) + val decimal = if (tiDataType.isDecimalUnSpecified) 0 else tiDataType.getDecimal + (BigDecimal.apply(Math.abs(r.nextLong()) % Math.pow(10, len)) / BigDecimal.apply( + Math.pow(10, decimal) + )).bigDecimal + } + } + + def randomSignedValue(r: Random): Any = { + dataType match { + case BIT => + val bit: Array[Boolean] = new Array[Boolean](tiDataType.getLength.toInt) + bit.map(_ => r.nextBoolean) + case BOOLEAN => r.nextInt(1 << 1) + case TINYINT => (r.nextInt(1 << 8) - (1 << 7)).longValue() + case SMALLINT => (r.nextInt(1 << 16) - (1 << 15)).longValue() + case MEDIUMINT => (r.nextInt(1 << 24) - (1 << 23)).longValue() + case INT => r.nextInt.longValue + case BIGINT => r.nextLong + case FLOAT => r.nextFloat.doubleValue + case DOUBLE => r.nextDouble + case DECIMAL => + val len = getLength(tiDataType) + val decimal = if (tiDataType.isDecimalUnSpecified) 0 else tiDataType.getDecimal + (BigDecimal.apply(r.nextLong % Math.pow(10, len)) / BigDecimal.apply( + Math.pow(10, decimal) + )).bigDecimal + case VARCHAR => generateRandomString(r, tiDataType.getLength) + case VARBINARY => generateRandomBinary(r, tiDataType.getLength) + case CHAR | TEXT | TINYTEXT | MEDIUMTEXT | LONGTEXT => + generateRandomString(r, getRandomLength(dataType, r)) + case BINARY | BLOB | TINYBLOB | MEDIUMBLOB | LONGBLOB => + generateRandomBinary(r, getRandomLength(dataType, r)) + case DATE => + // start from 1000-01-01 to 9999-01-01 + val milliseconds = -30610253143000L + (Math.abs(r.nextLong) % (9000L * 365 * 24 * 60 * 60 * 1000)) + new java.sql.Date(milliseconds) + case TIMESTAMP => + // start from 1970-01-01 00:00:01 to 2038-01-19 03:14:07 + val milliseconds = Math.abs(r.nextInt * 1000L + 1000L) + Math.abs(r.nextInt(1000)) + new java.sql.Timestamp(milliseconds) + case _ => throw new RuntimeException(s"random $dataType generator not supported yet") + } + } + def randomValue(r: Random): Any = { if (tiDataType.isUnsigned) { - if (!isNumeric(dataType)) { - throw new IllegalArgumentException("unsigned type is not numeric") - } - dataType match { - case BIT => - val bit: Array[Boolean] = new Array[Boolean](tiDataType.getLength.toInt) - bit.map(_ => r.nextBoolean) - case BOOLEAN => r.nextInt(1 << 1) - case TINYINT => r.nextInt(1 << 8) - case SMALLINT => r.nextInt(1 << 16) - case MEDIUMINT => r.nextInt(1 << 24) - case INT => r.nextInt + (1L << 31) - case BIGINT => toUnsignedBigInt(r.nextLong) - case FLOAT => Math.abs(r.nextFloat) - case DOUBLE => Math.abs(r.nextDouble) - case DECIMAL => - val len = getLength(tiDataType) - val decimal = if (tiDataType.isDecimalUnSpecified) 0 else tiDataType.getDecimal - (BigDecimal.apply(Math.abs(r.nextLong()) % Math.pow(10, len)) / BigDecimal.apply( - Math.pow(10, decimal) - )).bigDecimal - } + randomUnsignedValue(r) } else { - dataType match { - case BIT => - val bit: Array[Boolean] = new Array[Boolean](tiDataType.getLength.toInt) - bit.map(_ => r.nextBoolean) - case BOOLEAN => r.nextInt(1 << 
1) - case TINYINT => r.nextInt(1 << 8) - (1 << 7) - case SMALLINT => r.nextInt(1 << 16) - (1 << 15) - case MEDIUMINT => r.nextInt(1 << 24) - (1 << 23) - case INT => r.nextInt - case BIGINT => r.nextLong - case FLOAT => r.nextFloat - case DOUBLE => r.nextDouble - case DECIMAL => - val len = getLength(tiDataType) - val decimal = if (tiDataType.isDecimalUnSpecified) 0 else tiDataType.getDecimal - (BigDecimal.apply(r.nextLong % Math.pow(10, len)) / BigDecimal.apply( - Math.pow(10, decimal) - )).bigDecimal - case VARCHAR => generateRandomString(r, tiDataType.getLength) - case VARBINARY => generateRandomBinary(r, tiDataType.getLength) - case CHAR | TEXT | TINYTEXT | MEDIUMTEXT | LONGTEXT => - generateRandomString(r, getRandomLength(dataType, r)) - case BINARY | BLOB | TINYBLOB | MEDIUMBLOB | LONGBLOB => - generateRandomBinary(r, getRandomLength(dataType, r)) - case DATE => - // start from 1000-01-01 to 9999-01-01 - val milliseconds = -30610253143000L + (Math.abs(r.nextLong) % (9000L * 365 * 24 * 60 * 60 * 1000)) - new java.sql.Date(milliseconds) - case TIMESTAMP => - // start from 1970-01-01 00:00:01 to 2038-01-19 03:14:07 - val milliseconds = Math.abs(r.nextInt * 1000L + 1000L) + Math.abs(r.nextInt(1000)) - new java.sql.Timestamp(milliseconds) - case _ => throw new RuntimeException(s"random $dataType generator not supported yet") - } + randomSignedValue(r) } } diff --git a/core/src/test/scala/org/apache/spark/sql/test/generator/IndexColumn.scala b/core/src/test/scala/org/apache/spark/sql/test/generator/IndexColumn.scala index c629d70728..02249efe32 100644 --- a/core/src/test/scala/org/apache/spark/sql/test/generator/IndexColumn.scala +++ b/core/src/test/scala/org/apache/spark/sql/test/generator/IndexColumn.scala @@ -77,8 +77,8 @@ case class ColumnInfo(columnName: String, } } - val generator: ValueGenerator = - ValueGenerator( + val generator: ColumnValueGenerator = + ColumnValueGenerator( dataType, len, decimal, diff --git a/core/src/test/scala/org/apache/spark/sql/test/generator/Schema.scala b/core/src/test/scala/org/apache/spark/sql/test/generator/Schema.scala index 2081cac437..1ea51ab555 100644 --- a/core/src/test/scala/org/apache/spark/sql/test/generator/Schema.scala +++ b/core/src/test/scala/org/apache/spark/sql/test/generator/Schema.scala @@ -17,6 +17,7 @@ package org.apache.spark.sql.test.generator +import com.pingcap.tispark.utils.{TiConverter, TiUtil} import org.apache.spark.sql.test.generator.DataType._ /** diff --git a/core/src/test/scala/org/apache/spark/sql/test/generator/TestDataGenerator.scala b/core/src/test/scala/org/apache/spark/sql/test/generator/TestDataGenerator.scala index 1622e5de9b..ca0044b07f 100644 --- a/core/src/test/scala/org/apache/spark/sql/test/generator/TestDataGenerator.scala +++ b/core/src/test/scala/org/apache/spark/sql/test/generator/TestDataGenerator.scala @@ -86,8 +86,8 @@ object TestDataGenerator { // def isBits(dataType: DataType): Boolean = bits.contains(dataType) // def isBooleans(dataType: DataType): Boolean = booleans.contains(dataType) // def isIntegers(dataType: DataType): Boolean = integers.contains(dataType) - // def isDecimals(dataType: DataType): Boolean = decimals.contains(dataType) - // def isDoubles(dataType: DataType): Boolean = doubles.contains(dataType) + def isDecimals(dataType: ReflectedDataType): Boolean = decimals.contains(dataType) + def isDoubles(dataType: ReflectedDataType): Boolean = doubles.contains(dataType) // def isTimestamps(dataType: DataType): Boolean = timestamps.contains(dataType) // def isDates(dataType: DataType): 
Boolean = dates.contains(dataType) // def isDurations(dataType: DataType): Boolean = durations.contains(dataType) @@ -270,15 +270,15 @@ object TestDataGenerator { Schema(database, table, columnNames, columnDesc.toMap, idxColumns) } - private def generateRandomValue(row: TiRow, - offset: Int, - r: Random, - valueGenerator: ValueGenerator): Unit = { - val value = valueGenerator.next(r) + private def generateRandomColValue(row: TiRow, + offset: Int, + r: Random, + colValueGenerator: ColumnValueGenerator): Unit = { + val value = colValueGenerator.next(r) if (value == null) { row.setNull(offset) } else { - row.set(offset, valueGenerator.tiDataType, value) + row.set(offset, colValueGenerator.tiDataType, value) } } @@ -319,7 +319,7 @@ object TestDataGenerator { while (true) { for (i <- schema.columnInfo.indices) { val columnInfo = schema.columnInfo(i) - generateRandomValue(row, i, r, columnInfo.generator) + generateRandomColValue(row, i, r, columnInfo.generator) } if (pkOffset.nonEmpty) { val value = pkOffset.map { i => @@ -335,7 +335,7 @@ object TestDataGenerator { throw new RuntimeException("Inaccessible") } - private def generateRandomRows(schema: Schema, n: Long, r: Random): List[TiRow] = { + def generateRandomRows(schema: Schema, n: Long, r: Random): List[TiRow] = { val set: mutable.Set[Any] = mutable.HashSet.empty[Any] // offset of pk columns val pkOffset: List[Int] = { diff --git a/tikv-client/src/main/java/com/pingcap/tikv/allocator/RowIDAllocator.java b/tikv-client/src/main/java/com/pingcap/tikv/allocator/RowIDAllocator.java index 325165b016..b89f533aa9 100644 --- a/tikv-client/src/main/java/com/pingcap/tikv/allocator/RowIDAllocator.java +++ b/tikv-client/src/main/java/com/pingcap/tikv/allocator/RowIDAllocator.java @@ -53,6 +53,7 @@ public static RowIDAllocator create( } else { allocator.initSigned(TiSession.getInstance(conf).createSnapshot(), tableId); } + return allocator; } diff --git a/tikv-client/src/main/java/com/pingcap/tikv/region/RegionStoreClient.java b/tikv-client/src/main/java/com/pingcap/tikv/region/RegionStoreClient.java index 9f17955679..612e387236 100644 --- a/tikv-client/src/main/java/com/pingcap/tikv/region/RegionStoreClient.java +++ b/tikv-client/src/main/java/com/pingcap/tikv/region/RegionStoreClient.java @@ -87,6 +87,7 @@ public TiRegion getRegion() { private boolean checkLockError(BackOffer backOffer, KeyError error) { if (error.hasLocked()) { Lock lock = new Lock(error.getLocked()); + logger.warn("resolving lock"); boolean ok = lockResolverClient.resolveLocks( backOffer, new ArrayList<>(Collections.singletonList(lock))); From 95f96988ac0b9b4aada2501a7db0ad8b7b91c365 Mon Sep 17 00:00:00 2001 From: Liangliang Gu Date: Mon, 19 Aug 2019 13:44:10 +0800 Subject: [PATCH 46/62] use one unique seed (#1043) --- .../apache/spark/sql/TiSparkTestSpec.scala | 2 +- .../spark/sql/test/SharedSQLContext.scala | 19 +++++++++++-------- 2 files changed, 12 insertions(+), 9 deletions(-) diff --git a/core/src/test/scala/org/apache/spark/sql/TiSparkTestSpec.scala b/core/src/test/scala/org/apache/spark/sql/TiSparkTestSpec.scala index 1115c47c4f..a7b840fcc2 100644 --- a/core/src/test/scala/org/apache/spark/sql/TiSparkTestSpec.scala +++ b/core/src/test/scala/org/apache/spark/sql/TiSparkTestSpec.scala @@ -25,7 +25,7 @@ trait TiSparkTestSpec extends SharedSQLContext { val database: String val testDesc: String // Randomizer for tests - val r: Random = new Random(generateDataSeed) + lazy val r: Random = new Random(generateDataSeed) def test(): Unit } diff --git 
a/core/src/test/scala/org/apache/spark/sql/test/SharedSQLContext.scala b/core/src/test/scala/org/apache/spark/sql/test/SharedSQLContext.scala index 0604780a36..70dfbae992 100644 --- a/core/src/test/scala/org/apache/spark/sql/test/SharedSQLContext.scala +++ b/core/src/test/scala/org/apache/spark/sql/test/SharedSQLContext.scala @@ -103,7 +103,7 @@ trait SharedSQLContext extends SparkFunSuite with Eventually with BeforeAndAfter protected def generateData: Boolean = SharedSQLContext.generateData - protected def generateDataSeed: Long = SharedSQLContext.generateDataSeed + protected def generateDataSeed: Long = SharedSQLContext.generateDataSeed.get /** * The [[TestSparkSession]] to use for all tests in this suite. @@ -165,7 +165,7 @@ object SharedSQLContext extends Logging { protected var tidbPort: Int = _ protected var pdAddresses: String = _ protected var generateData: Boolean = _ - protected var generateDataSeed: Long = _ + protected var generateDataSeed: Option[Long] = None protected implicit def spark: SparkSession = _spark @@ -394,13 +394,16 @@ object SharedSQLContext extends Logging { generateData = getOrElse(_tidbConf, SHOULD_GENERATE_DATA, "true").toLowerCase.toBoolean - generateDataSeed = getOrElse(_tidbConf, GENERATE_DATA_SEED, "1234").toLong - if (generateDataSeed == 0) { - generateDataSeed = System.currentTimeMillis() - } + if (generateDataSeed.isEmpty) { + var tmpSeed = getOrElse(_tidbConf, GENERATE_DATA_SEED, "1234").toLong + if (tmpSeed == 0) { + tmpSeed = System.currentTimeMillis() + } + generateDataSeed = Some(tmpSeed) - if (generateData) { - logger.info(s"generate data is enabled and seed is $generateDataSeed") + if (generateData) { + logger.info(s"generate data is enabled and seed is ${generateDataSeed.get}") + } } if (isTidbConfigPropertiesInjectedToSparkEnabled) { From ce9f81879a4014afc492ef70bb58523c5e5decfc Mon Sep 17 00:00:00 2001 From: birdstorm Date: Mon, 19 Aug 2019 14:03:36 +0800 Subject: [PATCH 47/62] remove unused code (#1030) --- .../main/java/com/pingcap/tikv/PDClient.java | 19 ------------------- 1 file changed, 19 deletions(-) diff --git a/tikv-client/src/main/java/com/pingcap/tikv/PDClient.java b/tikv-client/src/main/java/com/pingcap/tikv/PDClient.java index cfdb72aaf9..2b85f8f6cd 100644 --- a/tikv-client/src/main/java/com/pingcap/tikv/PDClient.java +++ b/tikv-client/src/main/java/com/pingcap/tikv/PDClient.java @@ -27,13 +27,11 @@ import com.pingcap.tikv.exception.GrpcException; import com.pingcap.tikv.exception.TiClientInternalException; import com.pingcap.tikv.meta.TiTimestamp; -import com.pingcap.tikv.operation.NoopHandler; import com.pingcap.tikv.operation.PDErrorHandler; import com.pingcap.tikv.pd.PDUtils; import com.pingcap.tikv.region.TiRegion; import com.pingcap.tikv.util.BackOffer; import com.pingcap.tikv.util.ChannelFactory; -import com.pingcap.tikv.util.ConcreteBackOffer; import com.pingcap.tikv.util.FutureObserver; import io.grpc.ManagedChannel; import java.net.URI; @@ -60,23 +58,6 @@ public class PDClient extends AbstractGRPCClient private ScheduledExecutorService service; private List pdAddrs; - /** - * get operator associated with the specific region. - * - * @param regionId is used to locate specific region. - * @return - */ - private GetOperatorResponse getOperator(long regionId) { - Supplier request = - () -> GetOperatorRequest.newBuilder().setHeader(header).setRegionId(regionId).build(); - // get operator no need to handle error and no need back offer. 
- return callWithRetry( - ConcreteBackOffer.newCustomBackOff(0), - PDGrpc.METHOD_GET_OPERATOR, - request, - new NoopHandler<>()); - } - @Override public TiTimestamp getTimestamp(BackOffer backOffer) { Supplier request = () -> tsoReq; From 1cc6dc6aec8917a9d223f9b071a57523cc518352 Mon Sep 17 00:00:00 2001 From: Zhexuan Yang Date: Mon, 19 Aug 2019 18:59:49 +0800 Subject: [PATCH 48/62] adding batch write pk insertion test (#1044) --- .../datasource/BaseDataSourceTest.scala | 1 + ...a => BaseEnumerateDataTypesTestSpec.scala} | 35 +------- .../sql/insertion/BatchWritePkSuite.scala | 80 +++++++++++++++++++ .../BatchWriteUniqueIndexSuite.scala | 2 +- .../EnumeratePKDataTypeTestAction.scala | 24 ++++++ ...umerateUniqueIndexDataTypeTestAction.scala | 40 ++++++++++ 6 files changed, 150 insertions(+), 32 deletions(-) rename core/src/test/scala/org/apache/spark/sql/insertion/{EnumerationUniqueIndexDataTypeTestAction.scala => BaseEnumerateDataTypesTestSpec.scala} (57%) create mode 100644 core/src/test/scala/org/apache/spark/sql/insertion/BatchWritePkSuite.scala create mode 100644 core/src/test/scala/org/apache/spark/sql/insertion/EnumeratePKDataTypeTestAction.scala create mode 100644 core/src/test/scala/org/apache/spark/sql/insertion/EnumerateUniqueIndexDataTypeTestAction.scala diff --git a/core/src/test/scala/com/pingcap/tispark/datasource/BaseDataSourceTest.scala b/core/src/test/scala/com/pingcap/tispark/datasource/BaseDataSourceTest.scala index 42e033979b..d9730352cf 100644 --- a/core/src/test/scala/com/pingcap/tispark/datasource/BaseDataSourceTest.scala +++ b/core/src/test/scala/com/pingcap/tispark/datasource/BaseDataSourceTest.scala @@ -202,6 +202,7 @@ class BaseDataSourceTest(val table: String, val df = queryDatasourceTiDBWithTable(sortCol, tableName = tblName) val tidbResult = seqRowToList(df.collect(), df.schema) + println(s"running test on table $tblName") if (compResult(jdbcResult, tidbResult)) { assert(true) } else { diff --git a/core/src/test/scala/org/apache/spark/sql/insertion/EnumerationUniqueIndexDataTypeTestAction.scala b/core/src/test/scala/org/apache/spark/sql/insertion/BaseEnumerateDataTypesTestSpec.scala similarity index 57% rename from core/src/test/scala/org/apache/spark/sql/insertion/EnumerationUniqueIndexDataTypeTestAction.scala rename to core/src/test/scala/org/apache/spark/sql/insertion/BaseEnumerateDataTypesTestSpec.scala index 262ca5fe36..db399001c1 100644 --- a/core/src/test/scala/org/apache/spark/sql/insertion/EnumerationUniqueIndexDataTypeTestAction.scala +++ b/core/src/test/scala/org/apache/spark/sql/insertion/BaseEnumerateDataTypesTestSpec.scala @@ -1,44 +1,17 @@ package org.apache.spark.sql.insertion -import org.apache.commons.math3.util.Combinations import org.apache.spark.sql.BaseTestGenerationSpec import org.apache.spark.sql.test.generator.DataType.{getBaseType, DECIMAL, ReflectedDataType} -import org.apache.spark.sql.test.generator.TestDataGenerator.{getDecimal, getLength, isCharOrBinary, isNumeric, isStringType, isVarString, schemaGenerator} -import org.apache.spark.sql.test.generator._ +import org.apache.spark.sql.test.generator.TestDataGenerator._ +import org.apache.spark.sql.test.generator.{Index, Schema} import org.apache.spark.sql.types.MultiColumnDataTypeTestSpec import scala.util.Random -trait EnumerationUniqueIndexDataTypeTestAction +trait BaseEnumerateDataTypesTestSpec extends MultiColumnDataTypeTestSpec with BaseTestGenerationSpec { - private def genIndex(dataTypes: List[ReflectedDataType], r: Random): List[Index] = { - val size = dataTypes.length - // 
the first step is generate all possible keys - val keyList = scala.collection.mutable.ListBuffer.empty[Key] - for (i <- 1 until 3) { - val combination = new Combinations(size, i) - //(i, size) - val iterator = combination.iterator() - while (iterator.hasNext) { - val intArray = iterator.next() - val indexColumnList = scala.collection.mutable.ListBuffer.empty[IndexColumn] - // index may have multiple column - for (j <- 0 until intArray.length) { - // we add extra one to the column id since 1 is reserved to primary key - if (isStringType(dataTypes(intArray(j)))) { - indexColumnList += PrefixColumn(intArray(j) + 1, r.nextInt(4) + 2) - } else { - indexColumnList += DefaultColumn(intArray(j) + 1) - } - } - - keyList += Key(indexColumnList.toList) - } - } - - keyList.toList - } + def genIndex(dataTypes: List[ReflectedDataType], r: Random): List[Index] def genLen(dataType: ReflectedDataType): String = { val baseType = getBaseType(dataType) diff --git a/core/src/test/scala/org/apache/spark/sql/insertion/BatchWritePkSuite.scala b/core/src/test/scala/org/apache/spark/sql/insertion/BatchWritePkSuite.scala new file mode 100644 index 0000000000..4500aa1bdd --- /dev/null +++ b/core/src/test/scala/org/apache/spark/sql/insertion/BatchWritePkSuite.scala @@ -0,0 +1,80 @@ +package org.apache.spark.sql.insertion + +import com.pingcap.tikv.meta.TiColumnInfo +import com.pingcap.tispark.datasource.BaseDataSourceTest +import com.pingcap.tispark.utils.TiUtil +import org.apache.spark.sql.Row +import org.apache.spark.sql.test.generator.DataType.ReflectedDataType +import org.apache.spark.sql.test.generator.Schema +import org.apache.spark.sql.test.generator.TestDataGenerator._ + +class BatchWritePkSuite + extends BaseDataSourceTest("batch_write_insertion_pk", "batch_write_test_pk") + with EnumeratePKDataTypeTestAction { + // TODO: support binary insertion. 
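+  // Test flow: genSchema builds one table per listed data type with that column as the
+  // (possibly prefix-indexed) primary key, generateRandomRows produces the rows, they are
+  // written through the TiSpark batch-write datasource (tidbWriteWithTable), and the
+  // result is cross-checked against a JDBC read via compareTiDBSelectWithJDBCWithTable_V2.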
+ override val dataTypes: List[ReflectedDataType] = integers ::: decimals ::: doubles ::: charCharset + override val unsignedDataTypes: List[ReflectedDataType] = integers ::: decimals ::: doubles + override val dataTypeTestDir = "batch-write-test-index" + override val database = "batch_write_test_index" + override val testDesc = "Test for single PK column and multiple unique index type" + + override def beforeAll(): Unit = { + super.beforeAll() + tidbStmt.execute(s"drop database if exists $database") + tidbStmt.execute(s"create database $database") + } + + private def tiRowToSparkRow(row: TiRow, tiColsInfos: java.util.List[TiColumnInfo]) = { + val sparkRow = new Array[Any](row.fieldCount()) + for (i <- 0 until row.fieldCount()) { + val colTp = tiColsInfos.get(i).getType + val colVal = row.get(i, colTp) + sparkRow(i) = colVal + } + Row.fromSeq(sparkRow) + } + + private def dropAndCreateTbl(schema: Schema): Unit = { + // drop table if exits + dropTable(schema.tableName) + + // create table in tidb first + jdbcUpdate(schema.toString) + } + + private def insertAndSelect(schema: Schema): Unit = { + val tblName = schema.tableName + + val tiTblInfo = getTableInfo(database, tblName) + val tiColInfos = tiTblInfo.getColumns + // gen data + val rows = + generateRandomRows(schema, rowCount, r).map(row => tiRowToSparkRow(row, tiColInfos)) + // insert data to tikv + tidbWriteWithTable(rows, TiUtil.getSchemaFromTable(tiTblInfo), tblName) + // select data from tikv and compare with tidb + compareTiDBSelectWithJDBCWithTable_V2(tblName = tblName, "col_bigint") + } + + test("test unique indices cases") { + val schemas = genSchema(dataTypes, table) + + schemas.foreach { schema => + dropAndCreateTbl(schema) + } + + schemas.foreach { schema => + insertAndSelect(schema) + } + } + + // this is only for + override def test(): Unit = {} + + override def afterAll(): Unit = + try { + dropTable() + } finally { + super.afterAll() + } +} diff --git a/core/src/test/scala/org/apache/spark/sql/insertion/BatchWriteUniqueIndexSuite.scala b/core/src/test/scala/org/apache/spark/sql/insertion/BatchWriteUniqueIndexSuite.scala index 53cb09f5ab..756381aeaf 100644 --- a/core/src/test/scala/org/apache/spark/sql/insertion/BatchWriteUniqueIndexSuite.scala +++ b/core/src/test/scala/org/apache/spark/sql/insertion/BatchWriteUniqueIndexSuite.scala @@ -10,7 +10,7 @@ import org.apache.spark.sql.test.generator.TestDataGenerator._ class BatchWriteUniqueIndexSuite extends BaseDataSourceTest("batch_write_insertion_one_unique_index", "batch_write_test_index") - with EnumerationUniqueIndexDataTypeTestAction { + with EnumerateUniqueIndexDataTypeTestAction { // TODO: support binary insertion. 
override val dataTypes: List[ReflectedDataType] = integers ::: decimals ::: doubles ::: charCharset override val unsignedDataTypes: List[ReflectedDataType] = integers ::: decimals ::: doubles diff --git a/core/src/test/scala/org/apache/spark/sql/insertion/EnumeratePKDataTypeTestAction.scala b/core/src/test/scala/org/apache/spark/sql/insertion/EnumeratePKDataTypeTestAction.scala new file mode 100644 index 0000000000..45842f1780 --- /dev/null +++ b/core/src/test/scala/org/apache/spark/sql/insertion/EnumeratePKDataTypeTestAction.scala @@ -0,0 +1,24 @@ +package org.apache.spark.sql.insertion + +import org.apache.spark.sql.test.generator.DataType.ReflectedDataType +import org.apache.spark.sql.test.generator.TestDataGenerator.isStringType +import org.apache.spark.sql.test.generator.{DefaultColumn, Index, PrefixColumn, PrimaryKey} + +import scala.util.Random + +trait EnumeratePKDataTypeTestAction extends BaseEnumerateDataTypesTestSpec { + override def genIndex(dataTypes: List[ReflectedDataType], r: Random): List[Index] = { + val size = dataTypes.length + val keyList = scala.collection.mutable.ListBuffer.empty[PrimaryKey] + for (i <- 0 until size) { + // we add extra one to the column id since 1 is reserved to primary key + val pkCol = if (isStringType(dataTypes(i))) { + PrefixColumn(i + 1, r.nextInt(4) + 2) :: Nil + } else { + DefaultColumn(i + 1) :: Nil + } + keyList += PrimaryKey(pkCol) + } + keyList.toList + } +} diff --git a/core/src/test/scala/org/apache/spark/sql/insertion/EnumerateUniqueIndexDataTypeTestAction.scala b/core/src/test/scala/org/apache/spark/sql/insertion/EnumerateUniqueIndexDataTypeTestAction.scala new file mode 100644 index 0000000000..e761c73a76 --- /dev/null +++ b/core/src/test/scala/org/apache/spark/sql/insertion/EnumerateUniqueIndexDataTypeTestAction.scala @@ -0,0 +1,40 @@ +package org.apache.spark.sql.insertion + +import org.apache.commons.math3.util.Combinations +import org.apache.spark.sql.BaseTestGenerationSpec +import org.apache.spark.sql.test.generator.DataType.{getBaseType, DECIMAL, ReflectedDataType} +import org.apache.spark.sql.test.generator.TestDataGenerator.{getDecimal, getLength, isCharOrBinary, isNumeric, isStringType, isVarString, schemaGenerator} +import org.apache.spark.sql.test.generator._ +import org.apache.spark.sql.types.MultiColumnDataTypeTestSpec + +import scala.util.Random + +trait EnumerateUniqueIndexDataTypeTestAction extends BaseEnumerateDataTypesTestSpec { + override def genIndex(dataTypes: List[ReflectedDataType], r: Random): List[Index] = { + val size = dataTypes.length + // the first step is generate all possible keys + val keyList = scala.collection.mutable.ListBuffer.empty[Key] + for (i <- 1 until 3) { + val combination = new Combinations(size, i) + //(i, size) + val iterator = combination.iterator() + while (iterator.hasNext) { + val intArray = iterator.next() + val indexColumnList = scala.collection.mutable.ListBuffer.empty[IndexColumn] + // index may have multiple column + for (j <- 0 until intArray.length) { + // we add extra one to the column id since 1 is reserved to primary key + if (isStringType(dataTypes(intArray(j)))) { + indexColumnList += PrefixColumn(intArray(j) + 1, r.nextInt(4) + 2) + } else { + indexColumnList += DefaultColumn(intArray(j) + 1) + } + } + + keyList += Key(indexColumnList.toList) + } + } + + keyList.toList + } +} From 0e430a9ad52e9401f49e2af169a5bae7df026431 Mon Sep 17 00:00:00 2001 From: Liangliang Gu Date: Tue, 20 Aug 2019 09:47:15 +0800 Subject: [PATCH 49/62] fix table not found bug in TiSession because of 
synchronization (#1041) --- .../main/java/com/pingcap/tikv/TiSession.java | 27 ++++++++++++------- 1 file changed, 17 insertions(+), 10 deletions(-) diff --git a/tikv-client/src/main/java/com/pingcap/tikv/TiSession.java b/tikv-client/src/main/java/com/pingcap/tikv/TiSession.java index 46ee4f6d73..dfbc2f28a5 100644 --- a/tikv-client/src/main/java/com/pingcap/tikv/TiSession.java +++ b/tikv-client/src/main/java/com/pingcap/tikv/TiSession.java @@ -47,15 +47,17 @@ public class TiSession implements AutoCloseable { // Since we create session as singleton now, configuration change will not // reflect change - public static synchronized TiSession getInstance(TiConfiguration conf) { - String key = conf.getPdAddrsString(); - if (sessionCachedMap.containsKey(key)) { - return sessionCachedMap.get(key); - } + public static TiSession getInstance(TiConfiguration conf) { + synchronized (sessionCachedMap) { + String key = conf.getPdAddrsString(); + if (sessionCachedMap.containsKey(key)) { + return sessionCachedMap.get(key); + } - TiSession newSession = new TiSession(conf); - sessionCachedMap.put(key, newSession); - return newSession; + TiSession newSession = new TiSession(conf); + sessionCachedMap.put(key, newSession); + return newSession; + } } private TiSession(TiConfiguration conf) { @@ -187,12 +189,17 @@ public void injectCallBackFunc(Function callBackFunc } public static void clearCache() { - TiSession.sessionCachedMap.clear(); + synchronized (sessionCachedMap) { + sessionCachedMap.clear(); + } } @Override public synchronized void close() throws Exception { - sessionCachedMap.remove(conf.getPdAddrsString()); + synchronized (sessionCachedMap) { + sessionCachedMap.remove(conf.getPdAddrsString()); + } + if (tableScanThreadPool != null) { tableScanThreadPool.shutdownNow(); } From deac3fe60436ccc56aa3941d9a1300189773c316 Mon Sep 17 00:00:00 2001 From: Zhexuan Yang Date: Tue, 20 Aug 2019 17:05:31 +0800 Subject: [PATCH 50/62] fix test failure (#1051) --- .../apache/spark/sql/test/generator/ColumnValueGenerator.scala | 2 +- .../apache/spark/sql/test/generator/TestDataGenerator.scala | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/core/src/test/scala/org/apache/spark/sql/test/generator/ColumnValueGenerator.scala b/core/src/test/scala/org/apache/spark/sql/test/generator/ColumnValueGenerator.scala index 3043de16c6..9cf36ecbb7 100644 --- a/core/src/test/scala/org/apache/spark/sql/test/generator/ColumnValueGenerator.scala +++ b/core/src/test/scala/org/apache/spark/sql/test/generator/ColumnValueGenerator.scala @@ -106,7 +106,7 @@ case class ColumnValueGenerator(dataType: ReflectedDataType, private val specialBound: List[Any] = { val list: List[Any] = dataType match { - case BIT => List(Array[Byte]()) + case BIT => List.empty[Array[Byte]] case TINYINT | SMALLINT | MEDIUMINT | INT | BIGINT if !tiDataType.isUnsigned => List(-1L) case TIMESTAMP => List(new java.sql.Timestamp(1000)) case _ if isCharCharset(dataType) => List("") diff --git a/core/src/test/scala/org/apache/spark/sql/test/generator/TestDataGenerator.scala b/core/src/test/scala/org/apache/spark/sql/test/generator/TestDataGenerator.scala index ca0044b07f..874dd10226 100644 --- a/core/src/test/scala/org/apache/spark/sql/test/generator/TestDataGenerator.scala +++ b/core/src/test/scala/org/apache/spark/sql/test/generator/TestDataGenerator.scala @@ -20,12 +20,12 @@ package org.apache.spark.sql.test.generator import com.pingcap.tikv.row.ObjectRowImpl import org.apache.spark.SparkFunSuite import org.apache.spark.sql.test.generator.DataType._ 
+import org.slf4j.LoggerFactory import scala.collection.mutable import scala.util.Random object TestDataGenerator { - type TiRow = com.pingcap.tikv.row.Row val bits = List(BIT) @@ -284,6 +284,7 @@ object TestDataGenerator { def hash(value: Any): String = value match { case null => "null" + case b: Array[boolean] => b.mkString("[", ",", "]") case b: Array[Byte] => b.mkString("[", ",", "]") case t: java.sql.Timestamp => // timestamp was indexed as Integer when treated as unique key From bb7c6467a80343510cd5c0c7643bf41808109c52 Mon Sep 17 00:00:00 2001 From: Liangliang Gu Date: Wed, 21 Aug 2019 10:15:48 +0800 Subject: [PATCH 51/62] fix reflection bug: pass in different arguments for different version of same function (#1037) (#1052) (cherry picked from commit a5462c2dc107e4107abb6a2f59eb19565df8c28b) --- .../tispark/utils/ReflectionUtil.scala | 164 ++++++++++-------- 1 file changed, 96 insertions(+), 68 deletions(-) diff --git a/core/src/main/scala/com/pingcap/tispark/utils/ReflectionUtil.scala b/core/src/main/scala/com/pingcap/tispark/utils/ReflectionUtil.scala index 9e4522cd57..46bab8145a 100644 --- a/core/src/main/scala/com/pingcap/tispark/utils/ReflectionUtil.scala +++ b/core/src/main/scala/com/pingcap/tispark/utils/ReflectionUtil.scala @@ -59,86 +59,114 @@ object ReflectionUtil { // isOrderSensitive: Boolean = false): RDD[U] // // Hereby we use reflection to support different Spark versions. - private val mapPartitionsWithIndexInternal: Method = TiSparkInfo.SPARK_VERSION match { - case "2.3.0" | "2.3.1" => - tryLoadMethod( - "mapPartitionsWithIndexInternal", - mapPartitionsWithIndexInternalV1, - mapPartitionsWithIndexInternalV2 - ) - case _ => - // Spark version >= 2.3.2 - tryLoadMethod( - "mapPartitionsWithIndexInternal", - mapPartitionsWithIndexInternalV2, - mapPartitionsWithIndexInternalV1 - ) - } + case class ReflectionMapPartitionWithIndexInternal( + rdd: RDD[InternalRow], + internalRowToUnsafeRowWithIndex: (Int, Iterator[InternalRow]) => Iterator[UnsafeRow] + ) { + // Spark HDP Release may not compatible with official Release + // see https://github.com/pingcap/tispark/issues/1006 + def invoke(): RDD[InternalRow] = { + val (version, method) = TiSparkInfo.SPARK_VERSION match { + case "2.3.0" | "2.3.1" => + try { + reflectMapPartitionsWithIndexInternalV1(rdd, internalRowToUnsafeRowWithIndex) + } catch { + case _: Throwable => + try { + reflectMapPartitionsWithIndexInternalV2(rdd, internalRowToUnsafeRowWithIndex) + } catch { + case _: Throwable => + throw ScalaReflectionException( + s"Cannot find reflection of Method mapPartitionsWithIndexInternal, current Spark version is %s" + .format(TiSparkInfo.SPARK_VERSION) + ) + } + } + + case _ => + try { + reflectMapPartitionsWithIndexInternalV2(rdd, internalRowToUnsafeRowWithIndex) + } catch { + case _: Throwable => + try { + reflectMapPartitionsWithIndexInternalV1(rdd, internalRowToUnsafeRowWithIndex) + } catch { + case _: Throwable => + throw ScalaReflectionException( + s"Cannot find reflection of Method mapPartitionsWithIndexInternal, current Spark version is %s" + .format(TiSparkInfo.SPARK_VERSION) + ) + } + } + } - // Spark HDP Release may not compatible with official Release - // see https://github.com/pingcap/tispark/issues/1006 - private def tryLoadMethod(name: String, f1: () => Method, f2: () => Method): Method = { - try { - f1.apply() - } catch { - case _: Throwable => - try { - f2.apply() - } catch { - case _: Throwable => - throw ScalaReflectionException( - s"Cannot find reflection of Method $name, current Spark version is %s" - 
.format(TiSparkInfo.SPARK_VERSION) - ) - } + invokeMapPartitionsWithIndexInternal(version, method, rdd, internalRowToUnsafeRowWithIndex) } } // Spark-2.3.0 & Spark-2.3.1 - private def mapPartitionsWithIndexInternalV1(): Method = - classOf[RDD[InternalRow]].getDeclaredMethod( - "mapPartitionsWithIndexInternal", - classOf[(Int, Iterator[InternalRow]) => Iterator[UnsafeRow]], - classOf[Boolean], - classOf[ClassTag[UnsafeRow]] + private def reflectMapPartitionsWithIndexInternalV1( + rdd: RDD[InternalRow], + internalRowToUnsafeRowWithIndex: (Int, Iterator[InternalRow]) => Iterator[UnsafeRow] + ): (String, Method) = { + ( + "v1", + classOf[RDD[InternalRow]].getDeclaredMethod( + "mapPartitionsWithIndexInternal", + classOf[(Int, Iterator[InternalRow]) => Iterator[UnsafeRow]], + classOf[Boolean], + classOf[ClassTag[UnsafeRow]] + ) ) + } // >= Spark-2.3.2 - private def mapPartitionsWithIndexInternalV2(): Method = - classOf[RDD[InternalRow]].getDeclaredMethod( - "mapPartitionsWithIndexInternal", - classOf[(Int, Iterator[InternalRow]) => Iterator[UnsafeRow]], - classOf[Boolean], - classOf[Boolean], - classOf[ClassTag[UnsafeRow]] + private def reflectMapPartitionsWithIndexInternalV2( + rdd: RDD[InternalRow], + internalRowToUnsafeRowWithIndex: (Int, Iterator[InternalRow]) => Iterator[UnsafeRow] + ): (String, Method) = { + ( + "v2", + classOf[RDD[InternalRow]].getDeclaredMethod( + "mapPartitionsWithIndexInternal", + classOf[(Int, Iterator[InternalRow]) => Iterator[UnsafeRow]], + classOf[Boolean], + classOf[Boolean], + classOf[ClassTag[UnsafeRow]] + ) ) + } - case class ReflectionMapPartitionWithIndexInternal( + private def invokeMapPartitionsWithIndexInternal( + version: String, + method: Method, rdd: RDD[InternalRow], internalRowToUnsafeRowWithIndex: (Int, Iterator[InternalRow]) => Iterator[UnsafeRow] - ) { - def invoke(): RDD[InternalRow] = - TiSparkInfo.SPARK_VERSION match { - case "2.3.0" | "2.3.1" => - mapPartitionsWithIndexInternal - .invoke( - rdd, - internalRowToUnsafeRowWithIndex, - Boolean.box(false), - ClassTag.apply(classOf[UnsafeRow]) - ) - .asInstanceOf[RDD[InternalRow]] - case _ => - mapPartitionsWithIndexInternal - .invoke( - rdd, - internalRowToUnsafeRowWithIndex, - Boolean.box(false), - Boolean.box(false), - ClassTag.apply(classOf[UnsafeRow]) - ) - .asInstanceOf[RDD[InternalRow]] - } + ): RDD[InternalRow] = { + version match { + case "v1" => + // Spark-2.3.0 & Spark-2.3.1 + method + .invoke( + rdd, + internalRowToUnsafeRowWithIndex, + Boolean.box(false), + ClassTag.apply(classOf[UnsafeRow]) + ) + .asInstanceOf[RDD[InternalRow]] + + case _ => + // >= Spark-2.3.2 + method + .invoke( + rdd, + internalRowToUnsafeRowWithIndex, + Boolean.box(false), + Boolean.box(false), + ClassTag.apply(classOf[UnsafeRow]) + ) + .asInstanceOf[RDD[InternalRow]] + } } lazy val classLoader: URLClassLoader = { From 7e3b92d16afeff1f7dad11880711254f5fc915de Mon Sep 17 00:00:00 2001 From: Zhexuan Yang Date: Wed, 21 Aug 2019 15:08:32 +0800 Subject: [PATCH 52/62] Adding pk and unique index test for batch write (#1049) --- .../BaseEnumerateDataTypesTestSpec.scala | 16 ++-- .../BatchWritePKAndUniqueIndexSuite.scala | 82 +++++++++++++++++++ .../sql/insertion/BatchWritePkSuite.scala | 9 +- .../BatchWriteUniqueIndexSuite.scala | 5 +- ...tePKAndUniqueIndexDataTypeTestAction.scala | 68 +++++++++++++++ .../EnumeratePKDataTypeTestAction.scala | 6 +- ...umerateUniqueIndexDataTypeTestAction.scala | 6 +- .../spark/sql/types/DataTypeTestDir.scala | 5 ++ ...enerateMultiColumnDataTypeTestAction.scala | 3 +- 
.../types/MultiColumnDataTypeTestSpec.scala | 18 ---- .../RunMultiColumnDataTypeTestAction.scala | 2 +- .../pk/MultiColumnPKDataTypeSuites.scala | 2 +- 12 files changed, 180 insertions(+), 42 deletions(-) create mode 100644 core/src/test/scala/org/apache/spark/sql/insertion/BatchWritePKAndUniqueIndexSuite.scala create mode 100644 core/src/test/scala/org/apache/spark/sql/insertion/EnumeratePKAndUniqueIndexDataTypeTestAction.scala create mode 100644 core/src/test/scala/org/apache/spark/sql/types/DataTypeTestDir.scala diff --git a/core/src/test/scala/org/apache/spark/sql/insertion/BaseEnumerateDataTypesTestSpec.scala b/core/src/test/scala/org/apache/spark/sql/insertion/BaseEnumerateDataTypesTestSpec.scala index db399001c1..a6ac12801b 100644 --- a/core/src/test/scala/org/apache/spark/sql/insertion/BaseEnumerateDataTypesTestSpec.scala +++ b/core/src/test/scala/org/apache/spark/sql/insertion/BaseEnumerateDataTypesTestSpec.scala @@ -11,7 +11,7 @@ import scala.util.Random trait BaseEnumerateDataTypesTestSpec extends MultiColumnDataTypeTestSpec with BaseTestGenerationSpec { - def genIndex(dataTypes: List[ReflectedDataType], r: Random): List[Index] + def genIndex(dataTypes: List[ReflectedDataType], r: Random): List[List[Index]] def genLen(dataType: ReflectedDataType): String = { val baseType = getBaseType(dataType) @@ -40,22 +40,24 @@ trait BaseEnumerateDataTypesTestSpec indices.zipWithIndex.map { index => schemaGenerator( database, + // table name tablePrefix + index._2, r, dataTypesWithDescription, - List(index._1) + // constraint + index._1 ) } } private def toString(dataTypes: Seq[String]): String = dataTypes.hashCode().toString - override val rowCount = 10 + override val rowCount = 50 - override def getTableName(dataTypes: String*): String = s"test_${toString(dataTypes)}" + // we are not using below function, we probably need decouple the logic. + override def getTableName(dataTypes: String*): String = ??? - override def getTableNameWithDesc(desc: String, dataTypes: String*): String = - s"test_${desc}_${toString(dataTypes)}" + override def getTableNameWithDesc(desc: String, dataTypes: String*): String = ??? - override def getIndexName(dataTypes: String*): String = s"idx_${toString(dataTypes)}" + override def getIndexName(dataTypes: String*): String = ??? } diff --git a/core/src/test/scala/org/apache/spark/sql/insertion/BatchWritePKAndUniqueIndexSuite.scala b/core/src/test/scala/org/apache/spark/sql/insertion/BatchWritePKAndUniqueIndexSuite.scala new file mode 100644 index 0000000000..74d56307b2 --- /dev/null +++ b/core/src/test/scala/org/apache/spark/sql/insertion/BatchWritePKAndUniqueIndexSuite.scala @@ -0,0 +1,82 @@ +package org.apache.spark.sql.insertion + +import com.pingcap.tikv.meta.TiColumnInfo +import com.pingcap.tispark.datasource.BaseDataSourceTest +import com.pingcap.tispark.utils.TiUtil +import org.apache.spark.sql.Row +import org.apache.spark.sql.test.generator.DataType.ReflectedDataType +import org.apache.spark.sql.test.generator.Schema +import org.apache.spark.sql.test.generator.TestDataGenerator._ + +class BatchWritePKAndUniqueIndexSuite + extends BaseDataSourceTest( + "batch_write_insertion_pk_and_one_unique_index", + "batch_write_test_index" + ) + with EnumerateUniqueIndexDataTypeTestAction { + // TODO: support binary insertion. 
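The enumeration traits used by this suite and the ones below share one rule for turning a column into an index member: string-typed columns are indexed by a short random prefix, everything else by the full column, and column ids are offset by one because id 1 is reserved for the primary key. A condensed sketch of that rule, using the generator types imported by those traits (the `pickIndexColumn` helper itself is only illustrative):

```scala
import scala.util.Random
import org.apache.spark.sql.test.generator.{DefaultColumn, IndexColumn, PrefixColumn}
import org.apache.spark.sql.test.generator.DataType.ReflectedDataType
import org.apache.spark.sql.test.generator.TestDataGenerator.isStringType

// Mirrors the per-column choice made in genPk/genUniqueIndex later in this patch.
def pickIndexColumn(i: Int, dataType: ReflectedDataType, r: Random): IndexColumn =
  if (isStringType(dataType)) PrefixColumn(i + 1, r.nextInt(4) + 2) // prefix length 2 to 5
  else DefaultColumn(i + 1)
```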
+ override val dataTypes: List[ReflectedDataType] = integers ::: decimals ::: doubles ::: charCharset + override val unsignedDataTypes: List[ReflectedDataType] = integers ::: decimals ::: doubles + override val database = "batch_write_test_pk_and_index" + override val testDesc = "Test for pk and unique index type in batch-write insertion" + + override def beforeAll(): Unit = { + super.beforeAll() + tidbStmt.execute(s"drop database if exists $database") + tidbStmt.execute(s"create database $database") + } + + private def tiRowToSparkRow(row: TiRow, tiColsInfos: java.util.List[TiColumnInfo]) = { + val sparkRow = new Array[Any](row.fieldCount()) + for (i <- 0 until row.fieldCount()) { + val colTp = tiColsInfos.get(i).getType + val colVal = row.get(i, colTp) + sparkRow(i) = colVal + } + Row.fromSeq(sparkRow) + } + + private def dropAndCreateTbl(schema: Schema): Unit = { + // drop table if exits + dropTable(schema.tableName) + + // create table in tidb first + jdbcUpdate(schema.toString) + } + + private def insertAndSelect(schema: Schema): Unit = { + val tblName = schema.tableName + + val tiTblInfo = getTableInfo(database, tblName) + val tiColInfos = tiTblInfo.getColumns + // gen data + val rows = + generateRandomRows(schema, rowCount, r).map(row => tiRowToSparkRow(row, tiColInfos)) + // insert data to tikv + tidbWriteWithTable(rows, TiUtil.getSchemaFromTable(tiTblInfo), tblName) + // select data from tikv and compare with tidb + compareTiDBSelectWithJDBCWithTable_V2(tblName = tblName, "col_bigint") + } + + test("test pk and unique indices cases") { + val schemas = genSchema(dataTypes, table) + + schemas.foreach { schema => + dropAndCreateTbl(schema) + } + + schemas.foreach { schema => + insertAndSelect(schema) + } + } + + // this is only for mute the warning + override def test(): Unit = {} + + override def afterAll(): Unit = + try { + dropTable() + } finally { + super.afterAll() + } +} diff --git a/core/src/test/scala/org/apache/spark/sql/insertion/BatchWritePkSuite.scala b/core/src/test/scala/org/apache/spark/sql/insertion/BatchWritePkSuite.scala index 4500aa1bdd..50b4c0a7db 100644 --- a/core/src/test/scala/org/apache/spark/sql/insertion/BatchWritePkSuite.scala +++ b/core/src/test/scala/org/apache/spark/sql/insertion/BatchWritePkSuite.scala @@ -14,9 +14,8 @@ class BatchWritePkSuite // TODO: support binary insertion. 
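This primary-key-only suite relies on the PK enumeration trait defined later in this patch, which emits one single-element constraint set per candidate column, so every generated table carries exactly one primary key. Roughly, for one numeric and one string-typed column (illustrative values; the prefix length is random at run time and the exact mixin is not shown in this hunk):

```scala
import org.apache.spark.sql.test.generator.{DefaultColumn, Index, PrefixColumn, PrimaryKey}

// Each inner list is the full constraint set handed to schemaGenerator for one table.
val constraintSets: List[List[Index]] = List(
  List(PrimaryKey(DefaultColumn(1) :: Nil)),  // table 0: plain PK on the first (numeric) column
  List(PrimaryKey(PrefixColumn(2, 3) :: Nil)) // table 1: prefix PK on the second (string) column
)
```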
override val dataTypes: List[ReflectedDataType] = integers ::: decimals ::: doubles ::: charCharset override val unsignedDataTypes: List[ReflectedDataType] = integers ::: decimals ::: doubles - override val dataTypeTestDir = "batch-write-test-index" - override val database = "batch_write_test_index" - override val testDesc = "Test for single PK column and multiple unique index type" + override val database = "batch_write_test_pk" + override val testDesc = "Test for single PK column in batch-write insertion" override def beforeAll(): Unit = { super.beforeAll() @@ -56,7 +55,7 @@ class BatchWritePkSuite compareTiDBSelectWithJDBCWithTable_V2(tblName = tblName, "col_bigint") } - test("test unique indices cases") { + test("test pk cases") { val schemas = genSchema(dataTypes, table) schemas.foreach { schema => @@ -68,7 +67,7 @@ class BatchWritePkSuite } } - // this is only for + // this is only for mute the warning override def test(): Unit = {} override def afterAll(): Unit = diff --git a/core/src/test/scala/org/apache/spark/sql/insertion/BatchWriteUniqueIndexSuite.scala b/core/src/test/scala/org/apache/spark/sql/insertion/BatchWriteUniqueIndexSuite.scala index 756381aeaf..5106e67e64 100644 --- a/core/src/test/scala/org/apache/spark/sql/insertion/BatchWriteUniqueIndexSuite.scala +++ b/core/src/test/scala/org/apache/spark/sql/insertion/BatchWriteUniqueIndexSuite.scala @@ -14,9 +14,8 @@ class BatchWriteUniqueIndexSuite // TODO: support binary insertion. override val dataTypes: List[ReflectedDataType] = integers ::: decimals ::: doubles ::: charCharset override val unsignedDataTypes: List[ReflectedDataType] = integers ::: decimals ::: doubles - override val dataTypeTestDir = "batch-write-test-index" override val database = "batch_write_test_index" - override val testDesc = "Test for single PK column and multiple unique index type" + override val testDesc = "Test for single and multiple unique index type in batch-write insertion" override def beforeAll(): Unit = { super.beforeAll() @@ -68,7 +67,7 @@ class BatchWriteUniqueIndexSuite } } - // this is only for + // this is only for mute the warning override def test(): Unit = {} override def afterAll(): Unit = diff --git a/core/src/test/scala/org/apache/spark/sql/insertion/EnumeratePKAndUniqueIndexDataTypeTestAction.scala b/core/src/test/scala/org/apache/spark/sql/insertion/EnumeratePKAndUniqueIndexDataTypeTestAction.scala new file mode 100644 index 0000000000..0851909d7e --- /dev/null +++ b/core/src/test/scala/org/apache/spark/sql/insertion/EnumeratePKAndUniqueIndexDataTypeTestAction.scala @@ -0,0 +1,68 @@ +package org.apache.spark.sql.insertion + +import org.apache.commons.math3.util.Combinations +import org.apache.spark.sql.test.generator.DataType.ReflectedDataType +import org.apache.spark.sql.test.generator.{DefaultColumn, Index, IndexColumn, Key, PrefixColumn, PrimaryKey} +import org.apache.spark.sql.test.generator.TestDataGenerator.isStringType + +import scala.util.Random + +trait EnumeratePKAndUniqueIndexDataTypeTestAction extends BaseEnumerateDataTypesTestSpec { + private def genPk(dataTypes: List[ReflectedDataType], r: Random): List[Index] = { + val size = dataTypes.length + val keyList = scala.collection.mutable.ListBuffer.empty[PrimaryKey] + for (i <- 0 until size) { + // we add extra one to the column id since 1 is reserved to primary key + val pkCol = if (isStringType(dataTypes(i))) { + PrefixColumn(i + 1, r.nextInt(4) + 2) :: Nil + } else { + DefaultColumn(i + 1) :: Nil + } + keyList += PrimaryKey(pkCol) + } + keyList.toList + } + + 
private def genUniqueIndex(dataTypes: List[ReflectedDataType], r: Random): List[Index] = { + val size = dataTypes.length + // the first step is generate all possible keys + val keyList = scala.collection.mutable.ListBuffer.empty[Key] + for (i <- 1 until 3) { + val combination = new Combinations(size, i) + //(i, size) + val iterator = combination.iterator() + while (iterator.hasNext) { + val intArray = iterator.next() + val indexColumnList = scala.collection.mutable.ListBuffer.empty[IndexColumn] + // index may have multiple column + for (j <- 0 until intArray.length) { + // we add extra one to the column id since 1 is reserved to primary key + if (isStringType(dataTypes(intArray(j)))) { + indexColumnList += PrefixColumn(intArray(j) + 1, r.nextInt(4) + 2) + } else { + indexColumnList += DefaultColumn(intArray(j) + 1) + } + } + + keyList += Key(indexColumnList.toList) + } + } + + keyList.toList + } + + override def genIndex(dataTypes: List[ReflectedDataType], r: Random): List[List[Index]] = { + val pkIdxList = genPk(dataTypes, r) + val uniqueIdxList = genUniqueIndex(dataTypes, r) + val constraints = scala.collection.mutable.ListBuffer.empty[List[Index]] + for (i <- pkIdxList.indices) { + val tmpIdxList = scala.collection.mutable.ListBuffer.empty[Index] + for (j <- uniqueIdxList.indices) { + tmpIdxList += pkIdxList(i) + tmpIdxList += uniqueIdxList(j) + } + constraints += tmpIdxList.toList + } + constraints.toList + } +} diff --git a/core/src/test/scala/org/apache/spark/sql/insertion/EnumeratePKDataTypeTestAction.scala b/core/src/test/scala/org/apache/spark/sql/insertion/EnumeratePKDataTypeTestAction.scala index 45842f1780..77dd7e5be2 100644 --- a/core/src/test/scala/org/apache/spark/sql/insertion/EnumeratePKDataTypeTestAction.scala +++ b/core/src/test/scala/org/apache/spark/sql/insertion/EnumeratePKDataTypeTestAction.scala @@ -7,9 +7,9 @@ import org.apache.spark.sql.test.generator.{DefaultColumn, Index, PrefixColumn, import scala.util.Random trait EnumeratePKDataTypeTestAction extends BaseEnumerateDataTypesTestSpec { - override def genIndex(dataTypes: List[ReflectedDataType], r: Random): List[Index] = { + override def genIndex(dataTypes: List[ReflectedDataType], r: Random): List[List[Index]] = { val size = dataTypes.length - val keyList = scala.collection.mutable.ListBuffer.empty[PrimaryKey] + val keyList = scala.collection.mutable.ListBuffer.empty[List[PrimaryKey]] for (i <- 0 until size) { // we add extra one to the column id since 1 is reserved to primary key val pkCol = if (isStringType(dataTypes(i))) { @@ -17,7 +17,7 @@ trait EnumeratePKDataTypeTestAction extends BaseEnumerateDataTypesTestSpec { } else { DefaultColumn(i + 1) :: Nil } - keyList += PrimaryKey(pkCol) + keyList += PrimaryKey(pkCol) :: Nil } keyList.toList } diff --git a/core/src/test/scala/org/apache/spark/sql/insertion/EnumerateUniqueIndexDataTypeTestAction.scala b/core/src/test/scala/org/apache/spark/sql/insertion/EnumerateUniqueIndexDataTypeTestAction.scala index e761c73a76..274a03f8fc 100644 --- a/core/src/test/scala/org/apache/spark/sql/insertion/EnumerateUniqueIndexDataTypeTestAction.scala +++ b/core/src/test/scala/org/apache/spark/sql/insertion/EnumerateUniqueIndexDataTypeTestAction.scala @@ -10,10 +10,10 @@ import org.apache.spark.sql.types.MultiColumnDataTypeTestSpec import scala.util.Random trait EnumerateUniqueIndexDataTypeTestAction extends BaseEnumerateDataTypesTestSpec { - override def genIndex(dataTypes: List[ReflectedDataType], r: Random): List[Index] = { + override def genIndex(dataTypes: 
List[ReflectedDataType], r: Random): List[List[Index]] = { val size = dataTypes.length // the first step is generate all possible keys - val keyList = scala.collection.mutable.ListBuffer.empty[Key] + val keyList = scala.collection.mutable.ListBuffer.empty[List[Key]] for (i <- 1 until 3) { val combination = new Combinations(size, i) //(i, size) @@ -31,7 +31,7 @@ trait EnumerateUniqueIndexDataTypeTestAction extends BaseEnumerateDataTypesTestS } } - keyList += Key(indexColumnList.toList) + keyList += Key(indexColumnList.toList) :: Nil } } diff --git a/core/src/test/scala/org/apache/spark/sql/types/DataTypeTestDir.scala b/core/src/test/scala/org/apache/spark/sql/types/DataTypeTestDir.scala new file mode 100644 index 0000000000..088c4383fe --- /dev/null +++ b/core/src/test/scala/org/apache/spark/sql/types/DataTypeTestDir.scala @@ -0,0 +1,5 @@ +package org.apache.spark.sql.types + +trait DataTypeTestDir { + val dataTypeTestDir: String +} diff --git a/core/src/test/scala/org/apache/spark/sql/types/GenerateMultiColumnDataTypeTestAction.scala b/core/src/test/scala/org/apache/spark/sql/types/GenerateMultiColumnDataTypeTestAction.scala index c967e9f6b2..74b0568f24 100644 --- a/core/src/test/scala/org/apache/spark/sql/types/GenerateMultiColumnDataTypeTestAction.scala +++ b/core/src/test/scala/org/apache/spark/sql/types/GenerateMultiColumnDataTypeTestAction.scala @@ -24,7 +24,8 @@ import org.apache.spark.sql.test.generator.TestDataGenerator.{getDecimal, getLen trait GenerateMultiColumnDataTypeTestAction extends MultiColumnDataTypeTestSpec - with BaseTestGenerationSpec { + with BaseTestGenerationSpec + with DataTypeTestDir { override val rowCount = 50 diff --git a/core/src/test/scala/org/apache/spark/sql/types/MultiColumnDataTypeTestSpec.scala b/core/src/test/scala/org/apache/spark/sql/types/MultiColumnDataTypeTestSpec.scala index 033f88bc97..98d9f1461a 100644 --- a/core/src/test/scala/org/apache/spark/sql/types/MultiColumnDataTypeTestSpec.scala +++ b/core/src/test/scala/org/apache/spark/sql/types/MultiColumnDataTypeTestSpec.scala @@ -1,20 +1,3 @@ -/* - * - * Copyright 2019 PingCAP, Inc. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * See the License for the specific language governing permissions and - * limitations under the License. 
- * - */ - package org.apache.spark.sql.types import org.apache.spark.sql.TiSparkTestSpec @@ -23,7 +6,6 @@ import org.apache.spark.sql.test.generator.DataType.ReflectedDataType trait MultiColumnDataTypeTestSpec extends TiSparkTestSpec { val dataTypes: List[ReflectedDataType] val unsignedDataTypes: List[ReflectedDataType] - val dataTypeTestDir: String val extraDesc = "unsigned" } diff --git a/core/src/test/scala/org/apache/spark/sql/types/RunMultiColumnDataTypeTestAction.scala b/core/src/test/scala/org/apache/spark/sql/types/RunMultiColumnDataTypeTestAction.scala index 52b328f7e7..cb52a9991e 100644 --- a/core/src/test/scala/org/apache/spark/sql/types/RunMultiColumnDataTypeTestAction.scala +++ b/core/src/test/scala/org/apache/spark/sql/types/RunMultiColumnDataTypeTestAction.scala @@ -19,7 +19,7 @@ package org.apache.spark.sql.types import org.apache.spark.sql.test.generator.DataType.ReflectedDataType -trait RunMultiColumnDataTypeTestAction extends MultiColumnDataTypeTestSpec { +trait RunMultiColumnDataTypeTestAction extends MultiColumnDataTypeTestSpec with DataTypeTestDir { def startTest(dataTypes: List[ReflectedDataType]): Unit = ??? diff --git a/core/src/test/scala/org/apache/spark/sql/types/pk/MultiColumnPKDataTypeSuites.scala b/core/src/test/scala/org/apache/spark/sql/types/pk/MultiColumnPKDataTypeSuites.scala index af5296a3be..9dc6d94a61 100644 --- a/core/src/test/scala/org/apache/spark/sql/types/pk/MultiColumnPKDataTypeSuites.scala +++ b/core/src/test/scala/org/apache/spark/sql/types/pk/MultiColumnPKDataTypeSuites.scala @@ -19,7 +19,7 @@ package org.apache.spark.sql.types.pk import org.apache.spark.sql.test.generator.DataType.{getTypeName, BIGINT, INT, ReflectedDataType} import org.apache.spark.sql.test.generator.TestDataGenerator._ -import org.apache.spark.sql.types.{MultiColumnDataTypeTest, RunMultiColumnDataTypeTestAction} +import org.apache.spark.sql.types.{DataTypeTestDir, MultiColumnDataTypeTest, RunMultiColumnDataTypeTestAction} trait MultiColumnPKDataTypeSuites extends MultiColumnDataTypeTest From a1e6b79bcba8196c6e2c8295486a6e2f84c1099e Mon Sep 17 00:00:00 2001 From: Liangliang Gu Date: Wed, 21 Aug 2019 16:11:53 +0800 Subject: [PATCH 53/62] fix distinct without alias bug: disable pushdown aggregate with alias (#1054) --- .../org/apache/spark/sql/TiStrategy.scala | 4 +++- .../apache/spark/sql/BaseTiSparkTest.scala | 2 +- .../org/apache/spark/sql/IssueTestSuite.scala | 19 +++++++++++++++++++ 3 files changed, 23 insertions(+), 2 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/sql/TiStrategy.scala b/core/src/main/scala/org/apache/spark/sql/TiStrategy.scala index 7afcb81f08..a26e25455a 100644 --- a/core/src/main/scala/org/apache/spark/sql/TiStrategy.scala +++ b/core/src/main/scala/org/apache/spark/sql/TiStrategy.scala @@ -468,7 +468,9 @@ case class TiStrategy(getOrCreateTiContext: SparkSession => TiContext)(sparkSess filters.forall(TiUtil.isSupportedFilter(_, source, blacklist)) && groupingExpressions.forall(TiUtil.isSupportedGroupingExpr(_, source, blacklist)) && aggregateExpressions.forall(TiUtil.isSupportedAggregate(_, source, blacklist)) && - !aggregateExpressions.exists(_.isDistinct) + !aggregateExpressions.exists(_.isDistinct) && + // TODO: This is a temporary fix for the issue: https://github.com/pingcap/tispark/issues/1039 + !groupingExpressions.exists(_.isInstanceOf[Alias]) // We do through similar logic with original Spark as in SparkStrategies.scala // Difference is we need to test if a sub-plan can be consumed all together by TiKV diff --git 
a/core/src/test/scala/org/apache/spark/sql/BaseTiSparkTest.scala b/core/src/test/scala/org/apache/spark/sql/BaseTiSparkTest.scala index b5e3eba1e1..d53bf756cf 100644 --- a/core/src/test/scala/org/apache/spark/sql/BaseTiSparkTest.scala +++ b/core/src/test/scala/org/apache/spark/sql/BaseTiSparkTest.scala @@ -210,7 +210,7 @@ class BaseTiSparkTest extends QueryTest with SharedSQLContext { protected def judge(str: String, skipped: Boolean = false, checkLimit: Boolean = true): Unit = runTest(str, skipped = skipped, skipJDBC = true, checkLimit = checkLimit) - private def compSparkWithTiDB(sql: String, checkLimit: Boolean = true): Boolean = + protected def compSparkWithTiDB(sql: String, checkLimit: Boolean = true): Boolean = compSqlResult(sql, queryViaTiSpark(sql), queryTiDBViaJDBC(sql), checkLimit) protected def checkSparkResult(sql: String, diff --git a/core/src/test/scala/org/apache/spark/sql/IssueTestSuite.scala b/core/src/test/scala/org/apache/spark/sql/IssueTestSuite.scala index 69e67ac31b..1c4c99fb7d 100644 --- a/core/src/test/scala/org/apache/spark/sql/IssueTestSuite.scala +++ b/core/src/test/scala/org/apache/spark/sql/IssueTestSuite.scala @@ -19,6 +19,25 @@ import com.pingcap.tispark.TiConfigConst import org.apache.spark.sql.functions.{col, sum} class IssueTestSuite extends BaseTiSparkTest { + + // https://github.com/pingcap/tispark/issues/1039 + test("Distinct without alias throws NullPointerException") { + tidbStmt.execute("drop table if exists t_distinct_alias") + tidbStmt.execute("create table t_distinct_alias(c1 bigint);") + tidbStmt.execute("insert into t_distinct_alias values (2), (3), (2);") + + val sqls = "select distinct(c1) as d, 1 as w from t_distinct_alias" :: + "select c1 as d, 1 as w from t_distinct_alias group by c1" :: + "select c1, 1 as w from t_distinct_alias group by c1" :: + "select distinct(c1), 1 as w from t_distinct_alias" :: + Nil + + for (sql <- sqls) { + explainTestAndCollect(sql) + compSparkWithTiDB(sql) + } + } + test("cannot resolve column name when specifying table.column") { spark.sql("select full_data_type_table.id_dt from full_data_type_table").explain(true) judge("select full_data_type_table.id_dt from full_data_type_table") From fdb938e14cc2b30b3cf71561b75e3fda9922075c Mon Sep 17 00:00:00 2001 From: Zhexuan Yang Date: Thu, 22 Aug 2019 11:41:38 +0800 Subject: [PATCH 54/62] improve the doc (#1053) --- README.md | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index 65c77f1c6c..8cc6fd8175 100755 --- a/README.md +++ b/README.md @@ -164,13 +164,16 @@ Below configurations can be put together with spark-defaults.conf or passed in t | spark.tispark.request.isolation.level | "SI" | Isolation level means whether do the resolve lock for the underlying tidb clusters. When you use the "RC", you will get the newest version of record smaller than your tso and ignore the locks. 
And if you use "SI", you will resolve the locks and get the records according to whether the resolved lock is committed or aborted | ## Log4j Configuration -When you start `spark-shell` or `spark-sql` and run `show databases`, you might see the following warnings: +When you start `spark-shell` or `spark-sql` and run query, you might see the following warnings: ``` -Failed to get database default, returning NoSuchObjectException -Failed to get database global_temp, returning NoSuchObjectException +Failed to get database ****, returning NoSuchObjectException +Failed to get database ****, returning NoSuchObjectException ``` +where `****` is the name of database. -This is due to spark trying to load two nonexistent databases (`default` and `global_temp`) in its catalog. In order to mute these warnings, please append the following text to `${SPARK_HOME}/conf/log4j.properties`. +This is due to spark cannot find `****` in its own catalog. The two warning messages are benign, you can just ignore them. + +If you want to get rid of them, you can append the following text to `${SPARK_HOME}/conf/log4j.properties`. ``` # tispark disable "WARN ObjectStore:568 - Failed to get database" log4j.logger.org.apache.hadoop.hive.metastore.ObjectStore=ERROR From 60eec591c793d577499e26cc0d0291cbd8612ef7 Mon Sep 17 00:00:00 2001 From: birdstorm Date: Fri, 23 Aug 2019 13:09:27 +0800 Subject: [PATCH 55/62] Refactor RegionStoreClient logic (#989) --- .../datasource/BaseDataSourceTest.scala | 24 +- .../apache/spark/sql/BaseTiSparkTest.scala | 24 +- .../org/apache/spark/sql/IssueTestSuite.scala | 4 +- .../org/apache/spark/sql/QueryTest.scala | 23 ++ .../spark/sql/types/BaseDataTypeTest.scala | 2 +- .../com/pingcap/tikv/AbstractGRPCClient.java | 13 + .../main/java/com/pingcap/tikv/KVClient.java | 244 +++++++++++++ .../main/java/com/pingcap/tikv/PDClient.java | 20 +- .../main/java/com/pingcap/tikv/Snapshot.java | 39 ++- .../com/pingcap/tikv/codec/MetaCodec.java | 5 +- .../pingcap/tikv/exception/KeyException.java | 6 + .../tikv/operation/KVErrorHandler.java | 45 ++- .../iterator/ConcreteScanIterator.java | 144 ++++++++ .../tikv/operation/iterator/ScanIterator.java | 151 +++----- .../region/AbstractRegionStoreClient.java | 112 ++++++ .../tikv/region/RegionStoreClient.java | 331 +++++------------- .../tikv/streaming/StreamingResponse.java | 1 - .../pingcap/tikv/txn/LockResolverClient.java | 103 ++---- .../pingcap/tikv/util/ConcreteBackOffer.java | 11 + .../java/com/pingcap/tikv/PDClientTest.java | 2 +- .../pingcap/tikv/txn/LockResolverRCTest.java | 31 +- .../pingcap/tikv/txn/LockResolverSITest.java | 49 ++- .../pingcap/tikv/txn/LockResolverTest.java | 52 ++- 23 files changed, 933 insertions(+), 503 deletions(-) create mode 100644 tikv-client/src/main/java/com/pingcap/tikv/KVClient.java create mode 100644 tikv-client/src/main/java/com/pingcap/tikv/operation/iterator/ConcreteScanIterator.java create mode 100644 tikv-client/src/main/java/com/pingcap/tikv/region/AbstractRegionStoreClient.java diff --git a/core/src/test/scala/com/pingcap/tispark/datasource/BaseDataSourceTest.scala b/core/src/test/scala/com/pingcap/tispark/datasource/BaseDataSourceTest.scala index d9730352cf..4a154ac475 100644 --- a/core/src/test/scala/com/pingcap/tispark/datasource/BaseDataSourceTest.scala +++ b/core/src/test/scala/com/pingcap/tispark/datasource/BaseDataSourceTest.scala @@ -6,6 +6,7 @@ import com.pingcap.tikv.TiSession import com.pingcap.tispark.TiConfigConst import org.apache.spark.SparkException import org.apache.spark.rdd.RDD +import 
org.apache.spark.sql.catalyst.analysis.NoSuchTableException import org.apache.spark.sql.types.StructType import org.apache.spark.sql.{BaseTiSparkTest, DataFrame, Row} @@ -82,7 +83,7 @@ class BaseDataSourceTest(val table: String, sortCol: String = "i", selectCol: String = null, tableName: String - ) = { + ): Unit = { // check data source result & expected answer var df = queryDatasourceTiDBWithTable(sortCol, tableName) if (selectCol != null) { @@ -173,7 +174,14 @@ class BaseDataSourceTest(val table: String, val answer = seqRowToList(expectedAnswer, schema) val jdbcResult = queryTiDBViaJDBC(sql) - val df = queryDatasourceTiDB(sortCol) + val df = try { + queryDatasourceTiDB(sortCol) + } catch { + case e: NoSuchTableException => + logger.warn("query via datasource api fails", e) + spark.sql("show tables").show + throw e + } val tidbResult = seqRowToList(df.collect(), df.schema) // check tidb result & expected answer @@ -202,13 +210,11 @@ class BaseDataSourceTest(val table: String, val df = queryDatasourceTiDBWithTable(sortCol, tableName = tblName) val tidbResult = seqRowToList(df.collect(), df.schema) - println(s"running test on table $tblName") - if (compResult(jdbcResult, tidbResult)) { - assert(true) - } else { - println(s"failed on $tblName") - println(tidbResult) - assert(false) + if (!compResult(jdbcResult, tidbResult)) { + logger.error(s"""Failed on $tblName\n + |DataSourceAPI result: ${listToString(jdbcResult)}\n + |TiDB via JDBC result: ${listToString(tidbResult)}""".stripMargin) + fail() } } diff --git a/core/src/test/scala/org/apache/spark/sql/BaseTiSparkTest.scala b/core/src/test/scala/org/apache/spark/sql/BaseTiSparkTest.scala index d53bf756cf..3b4b2656d7 100644 --- a/core/src/test/scala/org/apache/spark/sql/BaseTiSparkTest.scala +++ b/core/src/test/scala/org/apache/spark/sql/BaseTiSparkTest.scala @@ -124,6 +124,7 @@ class BaseTiSparkTest extends QueryTest with SharedSQLContext { protected def loadTestData(databases: Seq[String] = defaultTestDatabases): Unit = try { + ti.meta.reloadAllMeta() tableNames = Seq.empty[String] for (dbName <- databases) { setCurrentDatabase(dbName) @@ -395,29 +396,6 @@ class BaseTiSparkTest extends QueryTest with SharedSQLContext { } } - private def listToString(result: List[List[Any]]): String = - if (result == null) s"[len: null] = null" - else if (result.isEmpty) s"[len: 0] = Empty" - else s"[len: ${result.length}] = ${result.map(mapStringList).mkString(",")}" - - private def mapStringList(result: List[Any]): String = - if (result == null) "null" else "List(" + result.map(mapString).mkString(",") + ")" - - private def mapString(result: Any): String = - if (result == null) "null" - else - result match { - case _: Array[Byte] => - var str = "[" - for (s <- result.asInstanceOf[Array[Byte]]) { - str += " " + s.toString - } - str += " ]" - str - case _ => - result.toString - } - protected def explainTestAndCollect(sql: String): Unit = { val df = spark.sql(sql) df.explain diff --git a/core/src/test/scala/org/apache/spark/sql/IssueTestSuite.scala b/core/src/test/scala/org/apache/spark/sql/IssueTestSuite.scala index 1c4c99fb7d..d1d138cf44 100644 --- a/core/src/test/scala/org/apache/spark/sql/IssueTestSuite.scala +++ b/core/src/test/scala/org/apache/spark/sql/IssueTestSuite.scala @@ -245,7 +245,7 @@ class IssueTestSuite extends BaseTiSparkTest { tidbStmt.execute("insert into t values(1)") tidbStmt.execute("insert into t values(2)") tidbStmt.execute("insert into t values(4)") - ti.meta.reloadAllMeta() + loadTestData() runTest("select count(c1) from t") 
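    // The remaining cases cover count(...) over arithmetic expressions; loadTestData() above
    // reloads TiSpark's cached metadata so the freshly created table t is visible to the comparison.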
runTest("select count(c1 + 1) from t") runTest("select count(1 + c1) from t") @@ -253,7 +253,7 @@ class IssueTestSuite extends BaseTiSparkTest { tidbStmt.execute("create table t(c1 int not null, c2 int not null)") tidbStmt.execute("insert into t values(1, 4)") tidbStmt.execute("insert into t values(2, 2)") - ti.meta.reloadAllMeta() + loadTestData() runTest("select count(c1 + c2) from t") } diff --git a/core/src/test/scala/org/apache/spark/sql/QueryTest.scala b/core/src/test/scala/org/apache/spark/sql/QueryTest.scala index 8b139a0ca8..12663189f2 100644 --- a/core/src/test/scala/org/apache/spark/sql/QueryTest.scala +++ b/core/src/test/scala/org/apache/spark/sql/QueryTest.scala @@ -196,6 +196,29 @@ abstract class QueryTest extends SparkFunSuite { } } + def listToString(result: List[List[Any]]): String = + if (result == null) s"[len: null] = null" + else if (result.isEmpty) s"[len: 0] = Empty" + else s"[len: ${result.length}] = ${result.map(mapStringList).mkString(",")}" + + private def mapStringList(result: List[Any]): String = + if (result == null) "null" else "List(" + result.map(mapString).mkString(",") + ")" + + private def mapString(result: Any): String = + if (result == null) "null" + else + result match { + case _: Array[Byte] => + var str = "[" + for (s <- result.asInstanceOf[Array[Byte]]) { + str += " " + s.toString + } + str += " ]" + str + case _ => + result.toString + } + protected def toOutput(value: Any, colType: String): Any = value match { case _: BigDecimal => value.asInstanceOf[BigDecimal].setScale(2, BigDecimal.RoundingMode.HALF_UP) diff --git a/core/src/test/scala/org/apache/spark/sql/types/BaseDataTypeTest.scala b/core/src/test/scala/org/apache/spark/sql/types/BaseDataTypeTest.scala index cca945b84f..14ab2ad745 100644 --- a/core/src/test/scala/org/apache/spark/sql/types/BaseDataTypeTest.scala +++ b/core/src/test/scala/org/apache/spark/sql/types/BaseDataTypeTest.scala @@ -35,7 +35,7 @@ trait BaseDataTypeTest extends BaseTiSparkTest { setCurrentDatabase(dbName) val tblName = generator.getTableNameWithDesc(desc, dataType) val query = s"select ${generator.getColumnName(dataType)} from $tblName" - println(query) + logger.info(query) runTest(query) } diff --git a/tikv-client/src/main/java/com/pingcap/tikv/AbstractGRPCClient.java b/tikv-client/src/main/java/com/pingcap/tikv/AbstractGRPCClient.java index a56716010f..2e27778600 100644 --- a/tikv-client/src/main/java/com/pingcap/tikv/AbstractGRPCClient.java +++ b/tikv-client/src/main/java/com/pingcap/tikv/AbstractGRPCClient.java @@ -37,12 +37,25 @@ public abstract class AbstractGRPCClient< protected final Logger logger = Logger.getLogger(this.getClass()); protected TiConfiguration conf; protected final ChannelFactory channelFactory; + protected BlockingStubT blockingStub; + protected StubT asyncStub; protected AbstractGRPCClient(TiConfiguration conf, ChannelFactory channelFactory) { this.conf = conf; this.channelFactory = channelFactory; } + protected AbstractGRPCClient( + TiConfiguration conf, + ChannelFactory channelFactory, + BlockingStubT blockingStub, + StubT asyncStub) { + this.conf = conf; + this.channelFactory = channelFactory; + this.blockingStub = blockingStub; + this.asyncStub = asyncStub; + } + public TiConfiguration getConf() { return conf; } diff --git a/tikv-client/src/main/java/com/pingcap/tikv/KVClient.java b/tikv-client/src/main/java/com/pingcap/tikv/KVClient.java new file mode 100644 index 0000000000..9abb1a61f2 --- /dev/null +++ b/tikv-client/src/main/java/com/pingcap/tikv/KVClient.java @@ -0,0 +1,244 @@ +/* + * 
+ * Copyright 2019 PingCAP, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * See the License for the specific language governing permissions and + * limitations under the License. + * + */ + +package com.pingcap.tikv; + +import com.google.protobuf.ByteString; +import com.pingcap.tikv.exception.GrpcException; +import com.pingcap.tikv.exception.TiKVException; +import com.pingcap.tikv.operation.iterator.ConcreteScanIterator; +import com.pingcap.tikv.region.RegionStoreClient; +import com.pingcap.tikv.region.RegionStoreClient.*; +import com.pingcap.tikv.region.TiRegion; +import com.pingcap.tikv.util.BackOffFunction; +import com.pingcap.tikv.util.BackOffer; +import com.pingcap.tikv.util.ConcreteBackOffer; +import java.util.*; +import java.util.concurrent.*; +import java.util.stream.Collectors; +import org.apache.log4j.Logger; +import org.tikv.kvproto.Kvrpcpb; +import org.tikv.kvproto.Kvrpcpb.KvPair; + +public class KVClient implements AutoCloseable { + private final RegionStoreClientBuilder clientBuilder; + private final TiConfiguration conf; + private final ExecutorService executorService; + private static final Logger logger = Logger.getLogger(KVClient.class); + + private static final int BATCH_GET_SIZE = 16 * 1024; + + public KVClient(TiConfiguration conf, RegionStoreClientBuilder clientBuilder) { + Objects.requireNonNull(conf, "conf is null"); + Objects.requireNonNull(clientBuilder, "clientBuilder is null"); + this.conf = conf; + this.clientBuilder = clientBuilder; + // TODO: ExecutorService executors = + // Executors.newFixedThreadPool(conf.getKVClientConcurrency()); + executorService = Executors.newFixedThreadPool(20); + } + + @Override + public void close() { + if (executorService != null) { + executorService.shutdownNow(); + } + } + + /** + * Get a key-value pair from TiKV if key exists + * + * @param key key + * @return a ByteString value if key exists, ByteString.EMPTY if key does not exist + */ + public ByteString get(ByteString key, long version) throws GrpcException { + BackOffer backOffer = ConcreteBackOffer.newGetBackOff(); + while (true) { + RegionStoreClient client = clientBuilder.build(key); + try { + return client.get(backOffer, key, version); + } catch (final TiKVException e) { + backOffer.doBackOff(BackOffFunction.BackOffFuncType.BoRegionMiss, e); + } + } + } + + /** + * Get a set of key-value pair by keys from TiKV + * + * @param keys keys + */ + public List batchGet(List keys, long version) throws GrpcException { + return batchGet(ConcreteBackOffer.newBatchGetMaxBackOff(), keys, version); + } + + private List batchGet(BackOffer backOffer, List keys, long version) { + Set set = new HashSet<>(keys); + return batchGet(backOffer, set, version); + } + + private List batchGet(BackOffer backOffer, Set keys, long version) { + Map> groupKeys = groupKeysByRegion(keys); + List batches = new ArrayList<>(); + + for (Map.Entry> entry : groupKeys.entrySet()) { + appendBatches(batches, entry.getKey(), entry.getValue(), BATCH_GET_SIZE); + } + return sendBatchGet(backOffer, batches, version); + } + + /** + * Scan key-value pairs from TiKV in range [startKey, endKey) + * + * @param startKey start key, inclusive + * @param endKey end key, exclusive + * 
@return list of key-value pairs in range + */ + public List scan(ByteString startKey, ByteString endKey, long version) + throws GrpcException { + Iterator iterator = + scanIterator(conf, clientBuilder, startKey, endKey, version); + List result = new ArrayList<>(); + iterator.forEachRemaining(result::add); + return result; + } + + /** + * Scan key-value pairs from TiKV in range [startKey, ♾), maximum to `limit` pairs + * + * @param startKey start key, inclusive + * @param limit limit of kv pairs + * @return list of key-value pairs in range + */ + public List scan(ByteString startKey, long version, int limit) + throws GrpcException { + Iterator iterator = scanIterator(conf, clientBuilder, startKey, version, limit); + List result = new ArrayList<>(); + iterator.forEachRemaining(result::add); + return result; + } + + public List scan(ByteString startKey, long version) throws GrpcException { + return scan(startKey, version, Integer.MAX_VALUE); + } + + /** A Batch containing the region and a list of keys to send */ + private static final class Batch { + private final TiRegion region; + private final List keys; + + Batch(TiRegion region, List keys) { + this.region = region; + this.keys = keys; + } + } + + /** + * Append batch to list and split them according to batch limit + * + * @param batches a grouped batch + * @param region region + * @param keys keys + * @param limit batch max limit + */ + private void appendBatches( + List batches, TiRegion region, List keys, int limit) { + List tmpKeys = new ArrayList<>(); + for (int i = 0; i < keys.size(); i++) { + if (i >= limit) { + batches.add(new Batch(region, tmpKeys)); + tmpKeys.clear(); + } + tmpKeys.add(keys.get(i)); + } + if (!tmpKeys.isEmpty()) { + batches.add(new Batch(region, tmpKeys)); + } + } + + /** + * Group by list of keys according to its region + * + * @param keys keys + * @return a mapping of keys and their region + */ + private Map> groupKeysByRegion(Set keys) { + return keys.stream() + .collect(Collectors.groupingBy(clientBuilder.getRegionManager()::getRegionByKey)); + } + + /** + * Send batchPut request concurrently + * + * @param backOffer current backOffer + * @param batches list of batch to send + */ + private List sendBatchGet(BackOffer backOffer, List batches, long version) { + ExecutorCompletionService> completionService = + new ExecutorCompletionService<>(executorService); + for (Batch batch : batches) { + completionService.submit( + () -> { + RegionStoreClient client = clientBuilder.build(batch.region); + BackOffer singleBatchBackOffer = ConcreteBackOffer.create(backOffer); + List keys = batch.keys; + try { + return client.batchGet(singleBatchBackOffer, keys, version); + } catch (final TiKVException e) { + // TODO: any elegant way to re-split the ranges if fails? + singleBatchBackOffer.doBackOff(BackOffFunction.BackOffFuncType.BoRegionMiss, e); + logger.warn("ReSplitting ranges for BatchPutRequest"); + // recursive calls + return batchGet(singleBatchBackOffer, batch.keys, version); + } + }); + } + try { + List result = new ArrayList<>(); + for (int i = 0; i < batches.size(); i++) { + result.addAll( + completionService.take().get(BackOffer.BATCH_GET_MAX_BACKOFF, TimeUnit.SECONDS)); + } + return result; + } catch (InterruptedException e) { + Thread.currentThread().interrupt(); + throw new TiKVException("Current thread interrupted.", e); + } catch (TimeoutException e) { + throw new TiKVException("TimeOut Exceeded for current operation. 
", e); + } catch (ExecutionException e) { + throw new TiKVException("Execution exception met.", e); + } + } + + private Iterator scanIterator( + TiConfiguration conf, + RegionStoreClientBuilder builder, + ByteString startKey, + ByteString endKey, + long version) { + return new ConcreteScanIterator(conf, builder, startKey, endKey, version); + } + + private Iterator scanIterator( + TiConfiguration conf, + RegionStoreClientBuilder builder, + ByteString startKey, + long version, + int limit) { + return new ConcreteScanIterator(conf, builder, startKey, version, limit); + } +} diff --git a/tikv-client/src/main/java/com/pingcap/tikv/PDClient.java b/tikv-client/src/main/java/com/pingcap/tikv/PDClient.java index 2b85f8f6cd..c17de7ad4e 100644 --- a/tikv-client/src/main/java/com/pingcap/tikv/PDClient.java +++ b/tikv-client/src/main/java/com/pingcap/tikv/PDClient.java @@ -199,7 +199,7 @@ LeaderWrapper getLeaderWrapper() { return leaderWrapper; } - class LeaderWrapper { + static class LeaderWrapper { private final String leaderInfo; private final PDBlockingStub blockingStub; private final PDStub asyncStub; @@ -329,6 +329,9 @@ protected PDStub getAsyncStub() { private PDClient(TiConfiguration conf, ChannelFactory channelFactory) { super(conf, channelFactory); + initCluster(); + this.blockingStub = getBlockingStub(); + this.asyncStub = getAsyncStub(); } private void initCluster() { @@ -364,19 +367,6 @@ private void initCluster() { } static PDClient createRaw(TiConfiguration conf, ChannelFactory channelFactory) { - PDClient client = null; - try { - client = new PDClient(conf, channelFactory); - client.initCluster(); - } catch (Exception e) { - if (client != null) { - try { - client.close(); - } catch (InterruptedException ignore) { - } - } - throw e; - } - return client; + return new PDClient(conf, channelFactory); } } diff --git a/tikv-client/src/main/java/com/pingcap/tikv/Snapshot.java b/tikv-client/src/main/java/com/pingcap/tikv/Snapshot.java index c2d5934413..a51a9c1f27 100644 --- a/tikv-client/src/main/java/com/pingcap/tikv/Snapshot.java +++ b/tikv-client/src/main/java/com/pingcap/tikv/Snapshot.java @@ -19,13 +19,12 @@ import static com.pingcap.tikv.operation.iterator.CoprocessIterator.getRowIterator; import com.google.protobuf.ByteString; +import com.pingcap.tikv.key.Key; import com.pingcap.tikv.meta.TiDAGRequest; import com.pingcap.tikv.meta.TiTimestamp; +import com.pingcap.tikv.operation.iterator.ConcreteScanIterator; import com.pingcap.tikv.operation.iterator.IndexScanIterator; -import com.pingcap.tikv.operation.iterator.ScanIterator; -import com.pingcap.tikv.region.RegionStoreClient; import com.pingcap.tikv.row.Row; -import com.pingcap.tikv.util.ConcreteBackOffer; import com.pingcap.tikv.util.RangeSplitter; import com.pingcap.tikv.util.RangeSplitter.RegionTask; import java.util.Iterator; @@ -63,9 +62,8 @@ public byte[] get(byte[] key) { } public ByteString get(ByteString key) { - RegionStoreClient client = session.getRegionStoreClientBuilder().build(key); - // TODO: Need to deal with lock error after grpc stable - return client.get(ConcreteBackOffer.newGetBackOff(), key, timestamp.getVersion()); + return new KVClient(session.getConf(), session.getRegionStoreClientBuilder()) + .get(key, timestamp.getVersion()); } /** @@ -110,8 +108,35 @@ public Iterator indexHandleRead(TiDAGRequest dagRequest, List return getHandleIterator(dagRequest, tasks, session); } + /** + * scan all keys after startKey, inclusive + * + * @param startKey start of keys + * @return iterator of kvPair + */ public Iterator 
scan(ByteString startKey) { - return new ScanIterator(startKey, session, timestamp.getVersion()); + return new ConcreteScanIterator( + session.getConf(), + session.getRegionStoreClientBuilder(), + startKey, + timestamp.getVersion(), + Integer.MAX_VALUE); + } + + /** + * scan all keys with prefix + * + * @param prefix prefix of keys + * @return iterator of kvPair + */ + public Iterator scanPrefix(ByteString prefix) { + ByteString nextPrefix = Key.toRawKey(prefix).nextPrefix().toByteString(); + return new ConcreteScanIterator( + session.getConf(), + session.getRegionStoreClientBuilder(), + prefix, + nextPrefix, + timestamp.getVersion()); } public TiConfiguration getConf() { diff --git a/tikv-client/src/main/java/com/pingcap/tikv/codec/MetaCodec.java b/tikv-client/src/main/java/com/pingcap/tikv/codec/MetaCodec.java index 0d9f4c963f..48f3c94c9b 100644 --- a/tikv-client/src/main/java/com/pingcap/tikv/codec/MetaCodec.java +++ b/tikv-client/src/main/java/com/pingcap/tikv/codec/MetaCodec.java @@ -98,16 +98,13 @@ public static List> hashGetFields( MetaCodec.encodeHashDataKeyPrefix(cdo, key.toByteArray()); ByteString encodedKey = cdo.toByteString(); - Iterator iterator = snapshot.scan(encodedKey); + Iterator iterator = snapshot.scanPrefix(encodedKey); List> fields = new ArrayList<>(); while (iterator.hasNext()) { Kvrpcpb.KvPair kv = iterator.next(); if (kv == null || kv.getKey() == null) { continue; } - if (!KeyUtils.hasPrefix(kv.getKey(), encodedKey)) { - break; - } fields.add(Pair.create(MetaCodec.decodeHashDataKey(kv.getKey()).second, kv.getValue())); } diff --git a/tikv-client/src/main/java/com/pingcap/tikv/exception/KeyException.java b/tikv-client/src/main/java/com/pingcap/tikv/exception/KeyException.java index 789fc7c2c2..dc33c08425 100644 --- a/tikv-client/src/main/java/com/pingcap/tikv/exception/KeyException.java +++ b/tikv-client/src/main/java/com/pingcap/tikv/exception/KeyException.java @@ -20,6 +20,7 @@ public class KeyException extends TiKVException { private static final long serialVersionUID = 6649195220216182286L; + private Kvrpcpb.KeyError keyError; public KeyException(String errMsg) { super(errMsg); @@ -27,5 +28,10 @@ public KeyException(String errMsg) { public KeyException(Kvrpcpb.KeyError keyErr) { super(String.format("Key exception occurred and the reason is %s", keyErr.toString())); + this.keyError = keyErr; + } + + public Kvrpcpb.KeyError getKeyError() { + return keyError; } } diff --git a/tikv-client/src/main/java/com/pingcap/tikv/operation/KVErrorHandler.java b/tikv-client/src/main/java/com/pingcap/tikv/operation/KVErrorHandler.java index fbc9caaa84..8f00ac48dd 100644 --- a/tikv-client/src/main/java/com/pingcap/tikv/operation/KVErrorHandler.java +++ b/tikv-client/src/main/java/com/pingcap/tikv/operation/KVErrorHandler.java @@ -17,41 +17,57 @@ package com.pingcap.tikv.operation; +import static com.pingcap.tikv.txn.LockResolverClient.extractLockFromKeyErr; +import static com.pingcap.tikv.util.BackOffFunction.BackOffFuncType.BoTxnLockFast; + import com.google.protobuf.ByteString; import com.pingcap.tikv.codec.KeyUtils; import com.pingcap.tikv.event.CacheInvalidateEvent; import com.pingcap.tikv.exception.GrpcException; +import com.pingcap.tikv.exception.KeyException; import com.pingcap.tikv.region.RegionErrorReceiver; import com.pingcap.tikv.region.RegionManager; import com.pingcap.tikv.region.TiRegion; +import com.pingcap.tikv.txn.Lock; +import com.pingcap.tikv.txn.LockResolverClient; import com.pingcap.tikv.util.BackOffFunction; import com.pingcap.tikv.util.BackOffer; import 
io.grpc.Status; import io.grpc.StatusRuntimeException; +import java.util.ArrayList; +import java.util.Collections; import java.util.function.Function; import org.apache.log4j.Logger; import org.tikv.kvproto.Errorpb; +import org.tikv.kvproto.Kvrpcpb; // TODO: consider refactor to Builder mode +// TODO: KVErrorHandler should resolve locks if it could. public class KVErrorHandler implements ErrorHandler { private static final Logger logger = Logger.getLogger(KVErrorHandler.class); - private static final int NO_LEADER_STORE_ID = - 0; // if there's currently no leader of a store, store id is set to 0 + // if a store does not have leader currently, store id is set to 0 + private static final int NO_LEADER_STORE_ID = 0; private final Function getRegionError; + private final Function getKeyError; private final Function cacheInvalidateCallBack; private final RegionManager regionManager; private final RegionErrorReceiver recv; + private final LockResolverClient lockResolverClient; private final TiRegion ctxRegion; public KVErrorHandler( RegionManager regionManager, RegionErrorReceiver recv, + LockResolverClient lockResolverClient, TiRegion ctxRegion, - Function getRegionError) { + Function getRegionError, + Function getKeyError) { this.ctxRegion = ctxRegion; this.recv = recv; + this.lockResolverClient = lockResolverClient; this.regionManager = regionManager; this.getRegionError = getRegionError; + this.getKeyError = getKeyError; this.cacheInvalidateCallBack = regionManager != null ? regionManager.getCacheInvalidateCallback() : null; } @@ -117,6 +133,19 @@ private void notifyStoreCacheInvalidate(long storeId) { } } + private boolean checkLockError(BackOffer backOffer, Lock lock) { + logger.warn("resolving lock"); + boolean ok = + lockResolverClient.resolveLocks( + backOffer, new ArrayList<>(Collections.singletonList(lock))); + if (!ok) { + // if not resolve all locks, we wait and retry + backOffer.doBackOff(BoTxnLockFast, new KeyException(lock.toString())); + return true; + } + return false; + } + // Referenced from TiDB // store/tikv/region_request.go - onRegionError @Override @@ -233,6 +262,16 @@ public boolean handleResponseError(BackOffer backOffer, RespT resp) { invalidateRegionStoreCache(ctxRegion); } + // Key error handling logic + Kvrpcpb.KeyError keyError = getKeyError.apply(resp); + if (keyError != null) { + try { + Lock lock = extractLockFromKeyErr(keyError); + checkLockError(backOffer, lock); + } catch (KeyException e) { + logger.warn("Unable to handle KeyExceptions other than LockException", e); + } + } return false; } diff --git a/tikv-client/src/main/java/com/pingcap/tikv/operation/iterator/ConcreteScanIterator.java b/tikv-client/src/main/java/com/pingcap/tikv/operation/iterator/ConcreteScanIterator.java new file mode 100644 index 0000000000..36988f15d8 --- /dev/null +++ b/tikv-client/src/main/java/com/pingcap/tikv/operation/iterator/ConcreteScanIterator.java @@ -0,0 +1,144 @@ +/* + * + * Copyright 2019 PingCAP, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * See the License for the specific language governing permissions and + * limitations under the License. 
+ * + */ + +package com.pingcap.tikv.operation.iterator; + +import static java.util.Objects.requireNonNull; + +import com.google.protobuf.ByteString; +import com.pingcap.tikv.TiConfiguration; +import com.pingcap.tikv.exception.GrpcException; +import com.pingcap.tikv.exception.KeyException; +import com.pingcap.tikv.key.Key; +import com.pingcap.tikv.region.RegionStoreClient; +import com.pingcap.tikv.region.RegionStoreClient.RegionStoreClientBuilder; +import com.pingcap.tikv.region.TiRegion; +import com.pingcap.tikv.util.BackOffer; +import com.pingcap.tikv.util.ConcreteBackOffer; +import com.pingcap.tikv.util.Pair; +import org.apache.log4j.Logger; +import org.tikv.kvproto.Kvrpcpb; +import org.tikv.kvproto.Kvrpcpb.KvPair; +import org.tikv.kvproto.Metapb; + +public class ConcreteScanIterator extends ScanIterator { + private final long version; + private final Logger logger = Logger.getLogger(ConcreteScanIterator.class); + + public ConcreteScanIterator( + TiConfiguration conf, + RegionStoreClientBuilder builder, + ByteString startKey, + long version, + int limit) { + // Passing endKey as ByteString.EMPTY means that endKey is +INF by default, + this(conf, builder, startKey, ByteString.EMPTY, version, limit); + } + + public ConcreteScanIterator( + TiConfiguration conf, + RegionStoreClientBuilder builder, + ByteString startKey, + ByteString endKey, + long version) { + // Passing endKey as ByteString.EMPTY means that endKey is +INF by default, + this(conf, builder, startKey, endKey, version, Integer.MAX_VALUE); + } + + private ConcreteScanIterator( + TiConfiguration conf, + RegionStoreClientBuilder builder, + ByteString startKey, + ByteString endKey, + long version, + int limit) { + super(conf, builder, startKey, endKey, limit); + this.version = version; + } + + TiRegion loadCurrentRegionToCache() throws GrpcException { + TiRegion region; + try (RegionStoreClient client = builder.build(startKey)) { + region = client.getRegion(); + BackOffer backOffer = ConcreteBackOffer.newScannerNextMaxBackOff(); + currentCache = client.scan(backOffer, startKey, version); + return region; + } + } + + private ByteString resolveCurrentLock(Kvrpcpb.KvPair current) { + logger.warn(String.format("resolve current key error %s", current.getError().toString())); + Pair pair = + builder.getRegionManager().getRegionStorePairByKey(startKey); + TiRegion region = pair.first; + Metapb.Store store = pair.second; + BackOffer backOffer = ConcreteBackOffer.newGetBackOff(); + try (RegionStoreClient client = builder.build(region, store)) { + return client.get(backOffer, current.getKey(), version); + } catch (Exception e) { + throw new KeyException(current.getError()); + } + } + + @Override + public boolean hasNext() { + Kvrpcpb.KvPair current; + // continue when cache is empty but not null + do { + current = getCurrent(); + if (isCacheDrained() && cacheLoadFails()) { + endOfScan = true; + return false; + } + } while (currentCache != null && current == null); + // for last batch to be processed, we have to check if + return !processingLastBatch + || current == null + || (hasEndKey && Key.toRawKey(current.getKey()).compareTo(endKey) < 0); + } + + @Override + public KvPair next() { + --limit; + KvPair current = currentCache.get(index++); + + requireNonNull(current, "current kv pair cannot be null"); + if (current.hasError()) { + ByteString val = resolveCurrentLock(current); + current = KvPair.newBuilder().setKey(current.getKey()).setValue(val).build(); + } + + return current; + } + + /** + * Cache is drained when - no data extracted - 
scan limit was not defined - have read the last + * index of cache - index not initialized + * + * @return whether cache is drained + */ + private boolean isCacheDrained() { + return currentCache == null || limit <= 0 || index >= currentCache.size() || index == -1; + } + + private Kvrpcpb.KvPair getCurrent() { + if (isCacheDrained()) { + return null; + } + return currentCache.get(index); + } +} diff --git a/tikv-client/src/main/java/com/pingcap/tikv/operation/iterator/ScanIterator.java b/tikv-client/src/main/java/com/pingcap/tikv/operation/iterator/ScanIterator.java index d60b431d84..1851e3608c 100644 --- a/tikv-client/src/main/java/com/pingcap/tikv/operation/iterator/ScanIterator.java +++ b/tikv-client/src/main/java/com/pingcap/tikv/operation/iterator/ScanIterator.java @@ -18,133 +18,98 @@ import static java.util.Objects.requireNonNull; import com.google.protobuf.ByteString; -import com.pingcap.tikv.TiSession; -import com.pingcap.tikv.exception.KeyException; +import com.pingcap.tikv.TiConfiguration; +import com.pingcap.tikv.exception.GrpcException; import com.pingcap.tikv.exception.TiClientInternalException; import com.pingcap.tikv.key.Key; -import com.pingcap.tikv.region.RegionManager; -import com.pingcap.tikv.region.RegionStoreClient; +import com.pingcap.tikv.region.RegionStoreClient.RegionStoreClientBuilder; import com.pingcap.tikv.region.TiRegion; -import com.pingcap.tikv.util.BackOffer; -import com.pingcap.tikv.util.ConcreteBackOffer; -import com.pingcap.tikv.util.Pair; import java.util.Iterator; import java.util.List; -import java.util.Objects; -import org.apache.log4j.Logger; import org.tikv.kvproto.Kvrpcpb; -import org.tikv.kvproto.Kvrpcpb.KvPair; -import org.tikv.kvproto.Metapb; -public class ScanIterator implements Iterator { - private final Logger logger = Logger.getLogger(ScanIterator.class); - protected final TiSession session; - private final RegionManager regionCache; - protected final long version; - - private List currentCache; +public abstract class ScanIterator implements Iterator { + protected final TiConfiguration conf; + protected final RegionStoreClientBuilder builder; + protected List currentCache; protected ByteString startKey; protected int index = -1; - private boolean endOfScan = false; + protected int limit; + protected boolean endOfScan = false; + + protected Key endKey; + protected boolean hasEndKey; + protected boolean processingLastBatch = false; - public ScanIterator(ByteString startKey, TiSession session, long version) { + ScanIterator( + TiConfiguration conf, + RegionStoreClientBuilder builder, + ByteString startKey, + ByteString endKey, + int limit) { this.startKey = requireNonNull(startKey, "start key is null"); if (startKey.isEmpty()) { throw new IllegalArgumentException("start key cannot be empty"); } - this.session = session; - this.regionCache = session.getRegionManager(); - this.version = version; + this.endKey = Key.toRawKey(requireNonNull(endKey, "end key is null")); + this.hasEndKey = !endKey.equals(ByteString.EMPTY); + this.limit = limit; + this.conf = conf; + this.builder = builder; } - // return false if current cache is not loaded or empty - private boolean loadCache() { - if (endOfScan) { - return false; + /** + * Load current region to cache, returns the region if loaded. 
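The new ConcreteScanIterator above, together with the reworked ScanIterator base it extends, is easiest to read from a call site. A short usage sketch, not part of the patch, assuming a TiSession and its RegionStoreClientBuilder as the tests in this series obtain them, plus a snapshot version taken from PD:

    // Sketch only: scan ["a", "z") at a snapshot version. The iterator loads one
    // region batch at a time and resolves any lock it meets through a point get.
    TiSession session = TiSession.getInstance(conf);
    RegionStoreClientBuilder builder = session.getRegionStoreClientBuilder();
    long version = session.getTimestamp().getVersion();
    ConcreteScanIterator iter =
        new ConcreteScanIterator(
            conf, builder, ByteString.copyFromUtf8("a"), ByteString.copyFromUtf8("z"), version);
    while (iter.hasNext()) {
      Kvrpcpb.KvPair pair = iter.next();
      System.out.println(pair.getKey().toStringUtf8() + " -> " + pair.getValue().toStringUtf8());
    }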
+ * + * @return TiRegion of current data loaded to cache + * @throws GrpcException if scan still fails after backoff + */ + abstract TiRegion loadCurrentRegionToCache() throws GrpcException; + + // return true if current cache is not loaded or empty + boolean cacheLoadFails() { + if (endOfScan || processingLastBatch) { + return true; } - if (startKey.isEmpty()) { - return false; + if (startKey == null || startKey.isEmpty()) { + return true; } - Pair pair = regionCache.getRegionStorePairByKey(startKey); - TiRegion region = pair.first; - Metapb.Store store = pair.second; - try (RegionStoreClient client = session.getRegionStoreClientBuilder().build(region, store)) { - BackOffer backOffer = ConcreteBackOffer.newScannerNextMaxBackOff(); - currentCache = client.scan(backOffer, startKey, version); + try { + TiRegion region = loadCurrentRegionToCache(); + ByteString curRegionEndKey = region.getEndKey(); // currentCache is null means no keys found, whereas currentCache is empty means no values // found. The difference lies in whether to continue scanning, because chances are that // an empty region exists due to deletion, region split, e.t.c. // See https://github.com/pingcap/tispark/issues/393 for details if (currentCache == null) { - return false; + return true; } index = 0; + Key lastKey = Key.EMPTY; // Session should be single-threaded itself // so that we don't worry about conf change in the middle // of a transaction. Otherwise below code might lose data - if (currentCache.size() < session.getConf().getScanBatchSize()) { - // Current region done, start new batch from next region - startKey = region.getEndKey(); + if (currentCache.size() < conf.getScanBatchSize()) { + startKey = curRegionEndKey; + } else if (currentCache.size() > conf.getScanBatchSize()) { + throw new IndexOutOfBoundsException( + "current cache size = " + + currentCache.size() + + ", larger than " + + conf.getScanBatchSize()); } else { // Start new scan from exact next key in current region - Key lastKey = Key.toRawKey(currentCache.get(currentCache.size() - 1).getKey()); + lastKey = Key.toRawKey(currentCache.get(currentCache.size() - 1).getKey()); startKey = lastKey.next().toByteString(); } + // notify last batch if lastKey is greater than or equal to endKey + if (hasEndKey && lastKey.compareTo(endKey) >= 0) { + processingLastBatch = true; + startKey = null; + } } catch (Exception e) { throw new TiClientInternalException("Error scanning data from region.", e); } - return true; - } - - private boolean isCacheDrained() { - return currentCache == null || index >= currentCache.size() || index == -1; - } - - @Override - public boolean hasNext() { - if (isCacheDrained() && !loadCache()) { - endOfScan = true; - return false; - } - return true; - } - - private Kvrpcpb.KvPair getCurrent() { - if (isCacheDrained()) { - return null; - } - return currentCache.get(index++); - } - - private ByteString resolveCurrentLock(Kvrpcpb.KvPair current) { - logger.warn(String.format("resolve current key error %s", current.getError().toString())); - Pair pair = regionCache.getRegionStorePairByKey(startKey); - TiRegion region = pair.first; - Metapb.Store store = pair.second; - BackOffer backOffer = ConcreteBackOffer.newGetBackOff(); - try (RegionStoreClient client = session.getRegionStoreClientBuilder().build(region, store)) { - return client.get(backOffer, current.getKey(), version); - } catch (Exception e) { - throw new KeyException(current.getError()); - } - } - - @Override - public Kvrpcpb.KvPair next() { - Kvrpcpb.KvPair current; - // continue 
when cache is empty but not null - for (current = getCurrent(); currentCache != null && current == null; current = getCurrent()) { - if (!loadCache()) { - return null; - } - } - - Objects.requireNonNull(current, "current kv pair cannot be null"); - if (current.hasError()) { - ByteString val = resolveCurrentLock(current); - current = KvPair.newBuilder().setKey(current.getKey()).setValue(val).build(); - } - - return current; + return false; } } diff --git a/tikv-client/src/main/java/com/pingcap/tikv/region/AbstractRegionStoreClient.java b/tikv-client/src/main/java/com/pingcap/tikv/region/AbstractRegionStoreClient.java new file mode 100644 index 0000000000..b366ec8bd0 --- /dev/null +++ b/tikv-client/src/main/java/com/pingcap/tikv/region/AbstractRegionStoreClient.java @@ -0,0 +1,112 @@ +/* + * + * Copyright 2019 PingCAP, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * See the License for the specific language governing permissions and + * limitations under the License. + * + */ + +package com.pingcap.tikv.region; + +import static com.google.common.base.Preconditions.checkArgument; +import static com.google.common.base.Preconditions.checkNotNull; + +import com.pingcap.tikv.AbstractGRPCClient; +import com.pingcap.tikv.TiConfiguration; +import com.pingcap.tikv.exception.GrpcException; +import com.pingcap.tikv.util.ChannelFactory; +import io.grpc.ManagedChannel; +import org.tikv.kvproto.Metapb; +import org.tikv.kvproto.TikvGrpc; + +public abstract class AbstractRegionStoreClient + extends AbstractGRPCClient + implements RegionErrorReceiver { + + protected TiRegion region; + protected final RegionManager regionManager; + + protected AbstractRegionStoreClient( + TiConfiguration conf, + TiRegion region, + ChannelFactory channelFactory, + TikvGrpc.TikvBlockingStub blockingStub, + TikvGrpc.TikvStub asyncStub, + RegionManager regionManager) { + super(conf, channelFactory, blockingStub, asyncStub); + checkNotNull(region, "Region is empty"); + checkNotNull(region.getLeader(), "Leader Peer is null"); + checkArgument(region.getLeader() != null, "Leader Peer is null"); + this.region = region; + this.regionManager = regionManager; + } + + public TiRegion getRegion() { + return region; + } + + @Override + protected TikvGrpc.TikvBlockingStub getBlockingStub() { + return blockingStub.withDeadlineAfter(getConf().getTimeout(), getConf().getTimeoutUnit()); + } + + @Override + protected TikvGrpc.TikvStub getAsyncStub() { + return asyncStub.withDeadlineAfter(getConf().getTimeout(), getConf().getTimeoutUnit()); + } + + @Override + public void close() throws GrpcException {} + + /** + * onNotLeader deals with NotLeaderError and returns whether re-splitting key range is needed + * + * @param newStore the new store presented by NotLeader Error + * @return false when re-split is needed. + */ + @Override + public boolean onNotLeader(Metapb.Store newStore) { + if (logger.isDebugEnabled()) { + logger.debug(region + ", new leader = " + newStore.getId()); + } + TiRegion cachedRegion = regionManager.getRegionById(region.getId()); + // When switch leader fails or the region changed its key range, + // it would be necessary to re-split task's key range for new region. 
+ if (!region.getStartKey().equals(cachedRegion.getStartKey()) + || !region.getEndKey().equals(cachedRegion.getEndKey())) { + return false; + } + region = cachedRegion; + String addressStr = regionManager.getStoreById(region.getLeader().getStoreId()).getAddress(); + ManagedChannel channel = channelFactory.getChannel(addressStr); + blockingStub = TikvGrpc.newBlockingStub(channel); + asyncStub = TikvGrpc.newStub(channel); + return true; + } + + @Override + public void onStoreNotMatch(Metapb.Store store) { + String addressStr = store.getAddress(); + ManagedChannel channel = channelFactory.getChannel(addressStr); + blockingStub = TikvGrpc.newBlockingStub(channel); + asyncStub = TikvGrpc.newStub(channel); + if (logger.isDebugEnabled() && region.getLeader().getStoreId() != store.getId()) { + logger.debug( + "store_not_match may occur? " + + region + + ", original store = " + + store.getId() + + " address = " + + addressStr); + } + } +} diff --git a/tikv-client/src/main/java/com/pingcap/tikv/region/RegionStoreClient.java b/tikv-client/src/main/java/com/pingcap/tikv/region/RegionStoreClient.java index 612e387236..a432bba1eb 100644 --- a/tikv-client/src/main/java/com/pingcap/tikv/region/RegionStoreClient.java +++ b/tikv-client/src/main/java/com/pingcap/tikv/region/RegionStoreClient.java @@ -17,9 +17,8 @@ package com.pingcap.tikv.region; -import static com.google.common.base.Preconditions.checkArgument; -import static com.google.common.base.Preconditions.checkNotNull; import static com.pingcap.tikv.region.RegionStoreClient.RequestTypes.REQ_TYPE_DAG; +import static com.pingcap.tikv.txn.LockResolverClient.extractLockFromKeyErr; import static com.pingcap.tikv.util.BackOffFunction.BackOffFuncType.*; import com.google.common.annotations.VisibleForTesting; @@ -27,7 +26,6 @@ import com.google.protobuf.InvalidProtocolBufferException; import com.pingcap.tidb.tipb.DAGRequest; import com.pingcap.tidb.tipb.SelectResponse; -import com.pingcap.tikv.AbstractGRPCClient; import com.pingcap.tikv.TiConfiguration; import com.pingcap.tikv.exception.*; import com.pingcap.tikv.operation.KVErrorHandler; @@ -38,7 +36,6 @@ import io.grpc.ManagedChannel; import java.util.*; import java.util.function.Supplier; -import java.util.stream.Collectors; import org.apache.log4j.Logger; import org.tikv.kvproto.Coprocessor; import org.tikv.kvproto.Coprocessor.KeyRange; @@ -51,9 +48,14 @@ import org.tikv.kvproto.TikvGrpc.TikvBlockingStub; import org.tikv.kvproto.TikvGrpc.TikvStub; -// RegionStore itself is not thread-safe -public class RegionStoreClient extends AbstractGRPCClient - implements RegionErrorReceiver { +// Note that RegionStoreClient itself is not thread-safe +// TODO: +// 1. RegionStoreClient will be inaccessible directly. +// 2. All apis of RegionStoreClient would not provide retry aside from callWithRetry, +// if a request needs to be retried because of an un-retryable cause, e.g., keys +// need to be re-split across regions/stores, region info outdated, e.t.c., you should +// retry it in an upper client logic (KVClient, TxnClient, e.t.c.) 
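The TODO above moves retry responsibility out of RegionStoreClient: apart from callWithRetry, a request that fails for a reason that cannot be retried in place (stale region info, keys spanning regions) is expected to be retried by an upper-level client. That KVClient/TxnClient does not exist in this series, so the helper below is a hypothetical sketch of the pattern, assembled only from calls that appear in these patches:

    // Hypothetical upper-layer retry (not part of this patch): rebuild the store
    // client from fresh region info whenever a RegionException bubbles up.
    ByteString getWithRegionRetry(
        RegionStoreClientBuilder builder, BackOffer backOffer, ByteString key, long version)
        throws GrpcException {
      while (true) {
        try (RegionStoreClient client = builder.build(key)) { // re-resolves the region by key
          return client.get(backOffer, key, version);
        } catch (RegionException e) {
          // region info was stale; back off, then retry against the re-resolved region
          backOffer.doBackOff(BoRegionMiss, e);
        }
      }
    }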
+public class RegionStoreClient extends AbstractRegionStoreClient { public enum RequestTypes { REQ_TYPE_SELECT(101), REQ_TYPE_INDEX(102), @@ -73,73 +75,42 @@ public int getValue() { } private static final Logger logger = Logger.getLogger(RegionStoreClient.class); - private TiRegion region; - private final RegionManager regionManager; @VisibleForTesting public final LockResolverClient lockResolverClient; - private TikvBlockingStub blockingStub; - private TikvStub asyncStub; - - public TiRegion getRegion() { - return region; - } - - private boolean checkLockError(BackOffer backOffer, KeyError error) { - if (error.hasLocked()) { - Lock lock = new Lock(error.getLocked()); - logger.warn("resolving lock"); - boolean ok = - lockResolverClient.resolveLocks( - backOffer, new ArrayList<>(Collections.singletonList(lock))); - if (!ok) { - // if not resolve all locks, we wait and retry - backOffer.doBackOff(BoTxnLockFast, new KeyException((error.getLocked().toString()))); - } - return false; - } else { - // retry or abort - // this should trigger Spark to retry the txn - throw new KeyException(error); - } - } /** * Fetch a value according to a key * - * @param backOffer - * @param key - * @param version - * @return + * @param backOffer backOffer + * @param key key to fetch + * @param version key version + * @return value * @throws TiClientInternalException * @throws KeyException */ public ByteString get(BackOffer backOffer, ByteString key, long version) throws TiClientInternalException, KeyException { - while (true) { - // we should refresh region - region = regionManager.getRegionByKey(key); - - Supplier factory = - () -> - GetRequest.newBuilder() - .setContext(region.getContext()) - .setKey(key) - .setVersion(version) - .build(); + Supplier factory = + () -> + GetRequest.newBuilder() + .setContext(region.getContext()) + .setKey(key) + .setVersion(version) + .build(); - KVErrorHandler handler = - new KVErrorHandler<>( - regionManager, - this, - region, - resp -> resp.hasRegionError() ? resp.getRegionError() : null); + KVErrorHandler handler = + new KVErrorHandler<>( + regionManager, + this, + lockResolverClient, + region, + resp -> resp.hasRegionError() ? resp.getRegionError() : null, + resp -> resp.hasError() ? 
resp.getError() : null); - GetResponse resp = callWithRetry(backOffer, TikvGrpc.METHOD_KV_GET, factory, handler); + GetResponse resp = callWithRetry(backOffer, TikvGrpc.METHOD_KV_GET, factory, handler); - if (isGetSuccess(backOffer, resp)) { - return resp.getValue(); - } - } + handleGetResponse(backOffer, resp); + return resp.getValue(); } /** @@ -151,67 +122,42 @@ public ByteString get(BackOffer backOffer, ByteString key, long version) * @throws TiClientInternalException * @throws KeyException */ - private boolean isGetSuccess(BackOffer backOffer, GetResponse resp) + private void handleGetResponse(BackOffer backOffer, GetResponse resp) throws TiClientInternalException, KeyException { if (resp == null) { this.regionManager.onRequestFail(region); throw new TiClientInternalException("GetResponse failed without a cause"); } if (resp.hasRegionError()) { - backOffer.doBackOff(BoRegionMiss, new RegionException(resp.getRegionError())); - return false; + throw new RegionException(resp.getRegionError()); } - if (resp.hasError()) { - return checkLockError(backOffer, resp.getError()); + throw new KeyException(resp.getError()); } - return true; } - public List batchGet(BackOffer backOffer, List keys, long version) { - List result = new ArrayList<>(); - while (true) { - // re-split keys - Map> map = - keys.stream().collect(Collectors.groupingBy(regionManager::getRegionByKey)); - boolean ok = true; - for (Map.Entry> entry : map.entrySet()) { - TiRegion newRegion = entry.getKey(); - if (!newRegion.equals(region)) { - RegionStoreClient newRegionStoreClient = - new RegionStoreClientBuilder(conf, this.channelFactory, this.regionManager) - .build(newRegion); - result.addAll(newRegionStoreClient.batchGet(backOffer, entry.getValue(), version)); - } else { - Supplier request = - () -> - BatchGetRequest.newBuilder() - .setContext(region.getContext()) - .addAllKeys(entry.getValue()) - .setVersion(version) - .build(); - KVErrorHandler handler = - new KVErrorHandler<>( - regionManager, - this, - region, - resp -> resp.hasRegionError() ? resp.getRegionError() : null); - BatchGetResponse resp = - callWithRetry(backOffer, TikvGrpc.METHOD_KV_BATCH_GET, request, handler); - if (isBatchGetSuccess(backOffer, resp)) { - result.addAll(resp.getPairsList()); - } else { - ok = false; - } - } - } - if (ok) { - return result; - } - } + public List batchGet(BackOffer backOffer, Iterable keys, long version) { + Supplier request = + () -> + BatchGetRequest.newBuilder() + .setContext(region.getContext()) + .addAllKeys(keys) + .setVersion(version) + .build(); + KVErrorHandler handler = + new KVErrorHandler<>( + regionManager, + this, + lockResolverClient, + region, + resp -> resp.hasRegionError() ? 
resp.getRegionError() : null, + resp -> null); + BatchGetResponse resp = + callWithRetry(backOffer, TikvGrpc.METHOD_KV_BATCH_GET, request, handler); + return handleBatchGetResponse(backOffer, resp); } - private boolean isBatchGetSuccess(BackOffer bo, BatchGetResponse resp) { + private List handleBatchGetResponse(BackOffer bo, BatchGetResponse resp) { if (resp == null) { this.regionManager.onRequestFail(region); throw new TiClientInternalException("BatchGetResponse failed without a cause"); @@ -235,12 +181,13 @@ private boolean isBatchGetSuccess(BackOffer bo, BatchGetResponse resp) { if (!locks.isEmpty()) { boolean ok = lockResolverClient.resolveLocks(bo, locks); if (!ok) { - // if not resolve all locks, we wait and retry - bo.doBackOff(BoTxnLockFast, new KeyException((resp.getPairsList().get(0).getError()))); - return false; + // resolveLocks already retried, just throw error to upper logic. + throw new TiKVException("locks not resolved, retry"); } + + // FIXME: we should retry } - return true; + return resp.getPairsList(); } public List scan( @@ -263,8 +210,10 @@ public List scan( new KVErrorHandler<>( regionManager, this, + lockResolverClient, region, - resp -> resp.hasRegionError() ? resp.getRegionError() : null); + resp -> resp.hasRegionError() ? resp.getRegionError() : null, + resp -> null); ScanResponse resp = callWithRetry(backOffer, TikvGrpc.METHOD_KV_SCAN, request, handler); if (isScanSuccess(backOffer, resp)) { return doScan(resp); @@ -305,33 +254,6 @@ private List doScan(ScanResponse resp) { return Collections.unmodifiableList(newKvPairs); } - private Lock extractLockFromKeyErr(KeyError keyError) { - if (keyError.hasLocked()) { - return new Lock(keyError.getLocked()); - } - - if (keyError.hasConflict()) { - WriteConflict conflict = keyError.getConflict(); - throw new KeyException( - String.format( - "scan meet key conflict on primary key %s at commit ts %s", - conflict.getPrimary(), conflict.getConflictTs())); - } - - if (!keyError.getRetryable().isEmpty()) { - throw new KeyException( - String.format("tikv restart txn %s", keyError.getRetryableBytes().toStringUtf8())); - } - - if (!keyError.getAbort().isEmpty()) { - throw new KeyException( - String.format("tikv abort txn %s", keyError.getAbortBytes().toStringUtf8())); - } - - throw new KeyException( - String.format("unexpected key error meets and it is %s", keyError.toString())); - } - public List scan(BackOffer backOffer, ByteString startKey, long version) { return scan(backOffer, startKey, version, false); } @@ -394,8 +316,10 @@ public void prewrite( new KVErrorHandler<>( regionManager, this, + lockResolverClient, region, - resp -> resp.hasRegionError() ? resp.getRegionError() : null); + resp -> resp.hasRegionError() ? resp.getRegionError() : null, + resp -> null); PrewriteResponse resp = callWithRetry(bo, TikvGrpc.METHOD_KV_PREWRITE, factory, handler); if (isPrewriteSuccess(bo, resp)) { return; @@ -452,26 +376,24 @@ private boolean isPrewriteSuccess(BackOffer backOffer, PrewriteResponse resp) public void commit( BackOffer backOffer, Iterable keys, long startVersion, long commitVersion) throws KeyException { - while (true) { - Supplier factory = - () -> - CommitRequest.newBuilder() - .setStartVersion(startVersion) - .setCommitVersion(commitVersion) - .addAllKeys(keys) - .setContext(region.getContext()) - .build(); - KVErrorHandler handler = - new KVErrorHandler<>( - regionManager, - this, - region, - resp -> resp.hasRegionError() ? 
resp.getRegionError() : null); - CommitResponse resp = callWithRetry(backOffer, TikvGrpc.METHOD_KV_COMMIT, factory, handler); - if (isCommitSuccess(backOffer, resp)) { - break; - } - } + Supplier factory = + () -> + CommitRequest.newBuilder() + .setStartVersion(startVersion) + .setCommitVersion(commitVersion) + .addAllKeys(keys) + .setContext(region.getContext()) + .build(); + KVErrorHandler handler = + new KVErrorHandler<>( + regionManager, + this, + lockResolverClient, + region, + resp -> resp.hasRegionError() ? resp.getRegionError() : null, + resp -> resp.hasError() ? resp.getError() : null); + CommitResponse resp = callWithRetry(backOffer, TikvGrpc.METHOD_KV_COMMIT, factory, handler); + handleCommitResponse(backOffer, resp); } /** @@ -484,7 +406,7 @@ public void commit( * @throws RegionException * @throws KeyException */ - private boolean isCommitSuccess(BackOffer backOffer, CommitResponse resp) + private void handleCommitResponse(BackOffer backOffer, CommitResponse resp) throws TiClientInternalException, RegionException, KeyException { if (resp == null) { this.regionManager.onRequestFail(region); @@ -498,9 +420,8 @@ private boolean isCommitSuccess(BackOffer backOffer, CommitResponse resp) } // If we find locks, we first resolve and let its caller retry. if (resp.hasError()) { - return checkLockError(backOffer, resp.getError()); + throw new KeyException(resp.getError()); } - return true; } /** @@ -533,8 +454,10 @@ public List coprocess( new KVErrorHandler<>( regionManager, this, + lockResolverClient, region, - resp -> resp.hasRegionError() ? resp.getRegionError() : null); + resp -> resp.hasRegionError() ? resp.getRegionError() : null, + resp -> null); Coprocessor.Response resp = callWithRetry(backOffer, TikvGrpc.METHOD_COPROCESSOR, reqToSend, handler); return handleCopResponse(backOffer, resp, ranges, responseQueue); @@ -633,9 +556,10 @@ public Iterator coprocessStreaming(DAGRequest req, List( regionManager, this, + lockResolverClient, region, - StreamingResponse::getFirstRegionError // TODO: handle all errors in streaming response - ); + StreamingResponse::getFirstRegionError, // TODO: handle all errors in streaming response + resp -> null); StreamingResponse responseIterator = this.callServerStreamingWithRetry( @@ -661,7 +585,7 @@ public RegionStoreClientBuilder( this.regionManager = regionManager; } - public RegionStoreClient build(TiRegion region, Store store) { + public RegionStoreClient build(TiRegion region, Store store) throws GrpcException { Objects.requireNonNull(region, "region is null"); Objects.requireNonNull(store, "store is null"); @@ -678,12 +602,12 @@ public RegionStoreClient build(TiRegion region, Store store) { conf, region, channelFactory, blockingStub, asyncStub, regionManager); } - public RegionStoreClient build(ByteString key) { + public RegionStoreClient build(ByteString key) throws GrpcException { Pair pair = regionManager.getRegionStorePairByKey(key); return build(pair.first, pair.second); } - public RegionStoreClient build(TiRegion region) { + public RegionStoreClient build(TiRegion region) throws GrpcException { Store store = regionManager.getStoreById(region.getLeader().getStoreId()); return build(region, store); } @@ -700,72 +624,9 @@ private RegionStoreClient( TikvBlockingStub blockingStub, TikvStub asyncStub, RegionManager regionManager) { - super(conf, channelFactory); - checkNotNull(region, "Region is empty"); - checkNotNull(region.getLeader(), "Leader Peer is null"); - checkArgument(region.getLeader() != null, "Leader Peer is null"); - 
this.regionManager = regionManager; - this.region = region; - this.blockingStub = blockingStub; - this.asyncStub = asyncStub; + super(conf, region, channelFactory, blockingStub, asyncStub, regionManager); this.lockResolverClient = new LockResolverClient( - conf, this.blockingStub, this.asyncStub, channelFactory, regionManager); - } - - @Override - protected TikvBlockingStub getBlockingStub() { - return blockingStub.withDeadlineAfter(getConf().getTimeout(), getConf().getTimeoutUnit()); - } - - @Override - protected TikvStub getAsyncStub() { - return asyncStub.withDeadlineAfter(getConf().getTimeout(), getConf().getTimeoutUnit()); - } - - @Override - public void close() throws Exception {} - - /** - * onNotLeader deals with NotLeaderError and returns whether re-splitting key range is needed - * - * @param newStore the new store presented by NotLeader Error - * @return false when re-split is needed. - */ - @Override - public boolean onNotLeader(Store newStore) { - if (logger.isDebugEnabled()) { - logger.debug(region + ", new leader = " + newStore.getId()); - } - TiRegion cachedRegion = regionManager.getRegionById(region.getId()); - // When switch leader fails or the region changed its key range, - // it would be necessary to re-split task's key range for new region. - if (!region.getStartKey().equals(cachedRegion.getStartKey()) - || !region.getEndKey().equals(cachedRegion.getEndKey())) { - return false; - } - region = cachedRegion; - String addressStr = regionManager.getStoreById(region.getLeader().getStoreId()).getAddress(); - ManagedChannel channel = channelFactory.getChannel(addressStr); - blockingStub = TikvGrpc.newBlockingStub(channel); - asyncStub = TikvGrpc.newStub(channel); - return true; - } - - @Override - public void onStoreNotMatch(Store store) { - String addressStr = store.getAddress(); - ManagedChannel channel = channelFactory.getChannel(addressStr); - blockingStub = TikvGrpc.newBlockingStub(channel); - asyncStub = TikvGrpc.newStub(channel); - if (logger.isDebugEnabled() && region.getLeader().getStoreId() != store.getId()) { - logger.debug( - "store_not_match may occur? 
" - + region - + ", original store = " - + store.getId() - + " address = " - + addressStr); - } + conf, region, this.blockingStub, this.asyncStub, channelFactory, regionManager); } } diff --git a/tikv-client/src/main/java/com/pingcap/tikv/streaming/StreamingResponse.java b/tikv-client/src/main/java/com/pingcap/tikv/streaming/StreamingResponse.java index bf2af25bf4..e791afdccf 100644 --- a/tikv-client/src/main/java/com/pingcap/tikv/streaming/StreamingResponse.java +++ b/tikv-client/src/main/java/com/pingcap/tikv/streaming/StreamingResponse.java @@ -58,7 +58,6 @@ public Errorpb.Error getFirstRegionError() { return response.getRegionError(); } } - return null; } diff --git a/tikv-client/src/main/java/com/pingcap/tikv/txn/LockResolverClient.java b/tikv-client/src/main/java/com/pingcap/tikv/txn/LockResolverClient.java index a03f62d1e8..af439c4095 100644 --- a/tikv-client/src/main/java/com/pingcap/tikv/txn/LockResolverClient.java +++ b/tikv-client/src/main/java/com/pingcap/tikv/txn/LockResolverClient.java @@ -20,36 +20,33 @@ import static com.pingcap.tikv.util.BackOffFunction.BackOffFuncType.BoRegionMiss; import com.google.protobuf.ByteString; -import com.pingcap.tikv.AbstractGRPCClient; import com.pingcap.tikv.TiConfiguration; import com.pingcap.tikv.exception.KeyException; import com.pingcap.tikv.exception.RegionException; import com.pingcap.tikv.operation.KVErrorHandler; -import com.pingcap.tikv.region.RegionErrorReceiver; +import com.pingcap.tikv.region.AbstractRegionStoreClient; import com.pingcap.tikv.region.RegionManager; import com.pingcap.tikv.region.TiRegion; import com.pingcap.tikv.region.TiRegion.RegionVerID; import com.pingcap.tikv.util.BackOffer; import com.pingcap.tikv.util.ChannelFactory; import com.pingcap.tikv.util.TsoUtils; -import io.grpc.ManagedChannel; import java.util.*; import java.util.concurrent.locks.ReadWriteLock; import java.util.concurrent.locks.ReentrantReadWriteLock; import java.util.function.Supplier; import org.apache.log4j.Logger; +import org.tikv.kvproto.Kvrpcpb; import org.tikv.kvproto.Kvrpcpb.CleanupRequest; import org.tikv.kvproto.Kvrpcpb.CleanupResponse; import org.tikv.kvproto.Kvrpcpb.ResolveLockRequest; import org.tikv.kvproto.Kvrpcpb.ResolveLockResponse; -import org.tikv.kvproto.Metapb.Store; import org.tikv.kvproto.TikvGrpc; import org.tikv.kvproto.TikvGrpc.TikvBlockingStub; import org.tikv.kvproto.TikvGrpc.TikvStub; // LockResolver resolves locks and also caches resolved txn status. -public class LockResolverClient extends AbstractGRPCClient - implements RegionErrorReceiver { +public class LockResolverClient extends AbstractRegionStoreClient { // ResolvedCacheSize is max number of cached txn status. 
private static final long RESOLVED_TXN_CACHE_SIZE = 2048; // By default, locks after 3000ms is considered unusual (the client created the @@ -70,24 +67,18 @@ public class LockResolverClient extends AbstractGRPCClient resolved; // the list is chain of txn for O(1) lru cache private final Queue recentResolved; - private TikvBlockingStub blockingStub; - private TikvStub asyncStub; - private TiRegion region; - private final RegionManager regionManager; public LockResolverClient( TiConfiguration conf, + TiRegion region, TikvBlockingStub blockingStub, TikvStub asyncStub, ChannelFactory channelFactory, RegionManager regionManager) { - super(conf, channelFactory); + super(conf, region, channelFactory, blockingStub, asyncStub, regionManager); resolved = new HashMap<>(); recentResolved = new LinkedList<>(); readWriteLock = new ReentrantReadWriteLock(); - this.blockingStub = blockingStub; - this.regionManager = regionManager; - this.asyncStub = asyncStub; } private void saveResolved(long txnID, long status) { @@ -139,9 +130,10 @@ public Long getTxnStatus(BackOffer bo, Long txnID, ByteString primary) { new KVErrorHandler<>( regionManager, this, + this, region, - resp -> resp.hasRegionError() ? resp.getRegionError() : null); - + resp -> resp.hasRegionError() ? resp.getRegionError() : null, + resp -> resp.hasError() ? resp.getError() : null); CleanupResponse resp = callWithRetry(bo, TikvGrpc.METHOD_KV_CLEANUP, factory, handler); status = 0L; @@ -164,6 +156,33 @@ public Long getTxnStatus(BackOffer bo, Long txnID, ByteString primary) { } } + public static Lock extractLockFromKeyErr(Kvrpcpb.KeyError keyError) { + if (keyError.hasLocked()) { + return new Lock(keyError.getLocked()); + } + + if (keyError.hasConflict()) { + Kvrpcpb.WriteConflict conflict = keyError.getConflict(); + throw new KeyException( + String.format( + "scan meet key conflict on primary key %s at commit ts %s", + conflict.getPrimary(), conflict.getConflictTs())); + } + + if (!keyError.getRetryable().isEmpty()) { + throw new KeyException( + String.format("tikv restart txn %s", keyError.getRetryableBytes().toStringUtf8())); + } + + if (!keyError.getAbort().isEmpty()) { + throw new KeyException( + String.format("tikv abort txn %s", keyError.getAbortBytes().toStringUtf8())); + } + + throw new KeyException( + String.format("unexpected key error meets and it is %s", keyError.toString())); + } + // ResolveLocks tries to resolve Locks. The resolving process is in 3 steps: // 1) Use the `lockTTL` to pick up all expired locks. Only locks that are old // enough are considered orphan locks and will be handled later. If all locks @@ -236,9 +255,10 @@ private void resolveLock(BackOffer bo, Lock lock, long txnStatus, Set( regionManager, this, + this, region, - resp -> resp.hasRegionError() ? resp.getRegionError() : null); - + resp -> resp.hasRegionError() ? resp.getRegionError() : null, + resp -> resp.hasError() ? resp.getError() : null); ResolveLockResponse resp = callWithRetry(bo, TikvGrpc.METHOD_KV_RESOLVE_LOCK, factory, handler); @@ -257,51 +277,4 @@ private void resolveLock(BackOffer bo, Lock lock, long txnStatus, Set(); } + private ConcreteBackOffer(ConcreteBackOffer source) { + this.maxSleep = source.maxSleep; + this.totalSleep = source.totalSleep; + this.errors = source.errors; + this.backOffFunctionMap = source.backOffFunctionMap; + } + /** * Creates a back off func which implements exponential back off with optional jitters according * to different back off strategies. 
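As a rough illustration of the exponential back off with optional jitters mentioned above, here is a JDK-only sketch of the equal-jitter strategy from the article referenced just below; the actual BackOffFunction in tikv-client may use different constants and a different strategy per back off type:

    // Illustrative only: half deterministic exponential growth, half random jitter.
    long backoffMillis(int attempt, long baseMs, long capMs) {
      long expo = Math.min(capMs, baseMs * (1L << Math.min(attempt, 30)));
      return expo / 2 + java.util.concurrent.ThreadLocalRandom.current().nextLong(expo / 2 + 1);
    }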
See http://www.awsarchitectureblog.com/2015/03/backoff.html diff --git a/tikv-client/src/test/java/com/pingcap/tikv/PDClientTest.java b/tikv-client/src/test/java/com/pingcap/tikv/PDClientTest.java index f67f5710ac..a290b0b053 100644 --- a/tikv-client/src/test/java/com/pingcap/tikv/PDClientTest.java +++ b/tikv-client/src/test/java/com/pingcap/tikv/PDClientTest.java @@ -258,7 +258,7 @@ public void testRetryPolicy() throws Exception { () -> client.getStore(ConcreteBackOffer.newCustomBackOff(5000), 0); Future storeFuture = service.submit(storeCallable); try { - Store r = storeFuture.get(5, TimeUnit.SECONDS); + Store r = storeFuture.get(50, TimeUnit.SECONDS); assertEquals(r.getId(), storeId); } catch (TimeoutException e) { fail(); diff --git a/tikv-client/src/test/java/com/pingcap/tikv/txn/LockResolverRCTest.java b/tikv-client/src/test/java/com/pingcap/tikv/txn/LockResolverRCTest.java index e763592b26..1238db9322 100644 --- a/tikv-client/src/test/java/com/pingcap/tikv/txn/LockResolverRCTest.java +++ b/tikv-client/src/test/java/com/pingcap/tikv/txn/LockResolverRCTest.java @@ -54,6 +54,7 @@ public void getRCTest() { skipTest(); return; } + session.getConf().setIsolationLevel(IsolationLevel.RC); putAlphabet(); prepareAlphabetLocks(); @@ -69,28 +70,36 @@ public void RCTest() { TiTimestamp startTs = pdClient.getTimestamp(backOffer); TiTimestamp endTs = pdClient.getTimestamp(backOffer); + // Put into kv putKV("a", "a", startTs.getVersion(), endTs.getVersion()); startTs = pdClient.getTimestamp(backOffer); endTs = pdClient.getTimestamp(backOffer); - lockKey("a", "aa", "a", "aa", false, startTs.getVersion(), endTs.getVersion()); + // Prewrite as primary without committing it + assertTrue(lockKey("a", "aa", "a", "aa", false, startTs.getVersion(), endTs.getVersion())); - TiRegion tiRegion = - session.getRegionManager().getRegionByKey(ByteString.copyFromUtf8(String.valueOf('a'))); + TiRegion tiRegion = session.getRegionManager().getRegionByKey(ByteString.copyFromUtf8("a")); RegionStoreClient client = builder.build(tiRegion); + // In RC mode, lock will not be read. is retrieved. ByteString v = client.get( - backOffer, - ByteString.copyFromUtf8(String.valueOf('a')), - pdClient.getTimestamp(backOffer).getVersion()); - assertEquals(v.toStringUtf8(), String.valueOf('a')); + backOffer, ByteString.copyFromUtf8("a"), pdClient.getTimestamp(backOffer).getVersion()); + assertEquals(v.toStringUtf8(), "a"); try { - commit( - startTs.getVersion(), - endTs.getVersion(), - Collections.singletonList(ByteString.copyFromUtf8("a"))); + // After committing , we can read it. 
+ assertTrue( + commit( + startTs.getVersion(), + endTs.getVersion(), + Collections.singletonList(ByteString.copyFromUtf8("a")))); + v = + client.get( + backOffer, + ByteString.copyFromUtf8("a"), + pdClient.getTimestamp(backOffer).getVersion()); + assertEquals(v.toStringUtf8(), "aa"); } catch (KeyException e) { fail(); } diff --git a/tikv-client/src/test/java/com/pingcap/tikv/txn/LockResolverSITest.java b/tikv-client/src/test/java/com/pingcap/tikv/txn/LockResolverSITest.java index 5b684c3d8c..661ab4183b 100644 --- a/tikv-client/src/test/java/com/pingcap/tikv/txn/LockResolverSITest.java +++ b/tikv-client/src/test/java/com/pingcap/tikv/txn/LockResolverSITest.java @@ -57,10 +57,11 @@ public void getSITest() { skipTest(); return; } + session.getConf().setIsolationLevel(IsolationLevel.SI); putAlphabet(); prepareAlphabetLocks(); - versionTest(); + versionTest(true); } @Test @@ -73,7 +74,7 @@ public void cleanLockTest() { String k = String.valueOf((char) ('a' + i)); TiTimestamp startTs = pdClient.getTimestamp(backOffer); TiTimestamp endTs = pdClient.getTimestamp(backOffer); - lockKey(k, k, k, k, false, startTs.getVersion(), endTs.getVersion()); + assertTrue(lockKey(k, k, k, k, false, startTs.getVersion(), endTs.getVersion())); } List mutations = new ArrayList<>(); @@ -124,36 +125,33 @@ public void txnStatusTest() { TiTimestamp endTs = pdClient.getTimestamp(backOffer); putKV("a", "a", startTs.getVersion(), endTs.getVersion()); - TiRegion tiRegion = - session.getRegionManager().getRegionByKey(ByteString.copyFromUtf8(String.valueOf('a'))); + TiRegion tiRegion = session.getRegionManager().getRegionByKey(ByteString.copyFromUtf8("a")); RegionStoreClient client = builder.build(tiRegion); long status = client.lockResolverClient.getTxnStatus( - backOffer, startTs.getVersion(), ByteString.copyFromUtf8(String.valueOf('a'))); + backOffer, startTs.getVersion(), ByteString.copyFromUtf8("a")); assertEquals(status, endTs.getVersion()); startTs = pdClient.getTimestamp(backOffer); endTs = pdClient.getTimestamp(backOffer); - lockKey("a", "a", "a", "a", true, startTs.getVersion(), endTs.getVersion()); - tiRegion = - session.getRegionManager().getRegionByKey(ByteString.copyFromUtf8(String.valueOf('a'))); + assertTrue(lockKey("a", "a", "a", "a", true, startTs.getVersion(), endTs.getVersion())); + tiRegion = session.getRegionManager().getRegionByKey(ByteString.copyFromUtf8("a")); client = builder.build(tiRegion); status = client.lockResolverClient.getTxnStatus( - backOffer, startTs.getVersion(), ByteString.copyFromUtf8(String.valueOf('a'))); + backOffer, startTs.getVersion(), ByteString.copyFromUtf8("a")); assertEquals(status, endTs.getVersion()); startTs = pdClient.getTimestamp(backOffer); endTs = pdClient.getTimestamp(backOffer); - lockKey("a", "a", "a", "a", false, startTs.getVersion(), endTs.getVersion()); - tiRegion = - session.getRegionManager().getRegionByKey(ByteString.copyFromUtf8(String.valueOf('a'))); + assertTrue(lockKey("a", "a", "a", "a", false, startTs.getVersion(), endTs.getVersion())); + tiRegion = session.getRegionManager().getRegionByKey(ByteString.copyFromUtf8("a")); client = builder.build(tiRegion); status = client.lockResolverClient.getTxnStatus( - backOffer, startTs.getVersion(), ByteString.copyFromUtf8(String.valueOf('a'))); + backOffer, startTs.getVersion(), ByteString.copyFromUtf8("a")); assertNotSame(status, endTs.getVersion()); } @@ -166,30 +164,43 @@ public void SITest() { TiTimestamp startTs = pdClient.getTimestamp(backOffer); TiTimestamp endTs = pdClient.getTimestamp(backOffer); + // Put 
into kv putKV("a", "a", startTs.getVersion(), endTs.getVersion()); startTs = pdClient.getTimestamp(backOffer); endTs = pdClient.getTimestamp(backOffer); - lockKey("a", "aa", "a", "aa", false, startTs.getVersion(), endTs.getVersion()); + // Prewrite as primary without committing it + assertTrue(lockKey("a", "aa", "a", "aa", false, startTs.getVersion(), endTs.getVersion())); - TiRegion tiRegion = - session.getRegionManager().getRegionByKey(ByteString.copyFromUtf8(String.valueOf('a'))); + TiRegion tiRegion = session.getRegionManager().getRegionByKey(ByteString.copyFromUtf8("a")); RegionStoreClient client = builder.build(tiRegion); + + try { + // In SI mode, a lock is read. Try resolve it if expires TTL. + client.get( + backOffer, ByteString.copyFromUtf8("a"), pdClient.getTimestamp(backOffer).getVersion()); + fail(); + } catch (KeyException e) { + assertEquals(ByteString.copyFromUtf8("a"), e.getKeyError().getLocked().getKey()); + } + + // With TTL set to 10, after 10 milliseconds is resolved. + // We should be able to read instead. ByteString v = client.get( - backOffer, - ByteString.copyFromUtf8(String.valueOf('a')), - pdClient.getTimestamp(backOffer).getVersion()); + backOffer, ByteString.copyFromUtf8("a"), pdClient.getTimestamp(backOffer).getVersion()); assertEquals(v.toStringUtf8(), String.valueOf('a')); try { + // Trying to continue the commit phase of will fail because TxnLockNotFound commit( startTs.getVersion(), endTs.getVersion(), Collections.singletonList(ByteString.copyFromUtf8("a"))); fail(); } catch (KeyException e) { + assertFalse(e.getKeyError().getRetryable().isEmpty()); } } } diff --git a/tikv-client/src/test/java/com/pingcap/tikv/txn/LockResolverTest.java b/tikv-client/src/test/java/com/pingcap/tikv/txn/LockResolverTest.java index 296bd2ee41..24e64209b4 100644 --- a/tikv-client/src/test/java/com/pingcap/tikv/txn/LockResolverTest.java +++ b/tikv-client/src/test/java/com/pingcap/tikv/txn/LockResolverTest.java @@ -69,6 +69,18 @@ void putKV(String key, String value, long startTS, long commitTS) { boolean prewrite(List mutations, long startTS, Mutation primary) { if (mutations.size() == 0) return true; + /*for (Mutation m : mutations) { + while (true) { + try { + TiRegion region = session.getRegionManager().getRegionByKey(m.getKey()); + RegionStoreClient client = builder.build(region); + client.prewrite(backOffer, primary.getKey(), mutations, startTS, DefaultTTL); + break; + } catch (Exception e) { + logger.warn(e.getMessage()); + } + } + }*/ for (Mutation m : mutations) { TiRegion region = session.getRegionManager().getRegionByKey(m.getKey()); RegionStoreClient client = builder.build(region); @@ -87,9 +99,10 @@ boolean prewrite(List mutations, long startTS, Mutation primary) { new KVErrorHandler<>( session.getRegionManager(), client, + client.lockResolverClient, region, - resp -> resp.hasRegionError() ? resp.getRegionError() : null); - + resp -> resp.hasRegionError() ? resp.getRegionError() : null, + resp -> null); PrewriteResponse resp = client.callWithRetry(backOffer, TikvGrpc.METHOD_KV_PREWRITE, factory, handler); @@ -193,9 +206,10 @@ boolean commit(long startTS, long commitTS, List keys) { new KVErrorHandler<>( session.getRegionManager(), client, + client.lockResolverClient, tiRegion, - resp -> resp.hasRegionError() ? resp.getRegionError() : null); - + resp -> resp.hasRegionError() ? resp.getRegionError() : null, + resp -> resp.hasError() ? 
resp.getError() : null); CommitResponse resp = client.callWithRetry(backOffer, TikvGrpc.METHOD_KV_COMMIT, factory, handler); @@ -249,18 +263,28 @@ void skipTest() { } void versionTest() { + versionTest(false); + } + + void versionTest(boolean hasLock) { for (int i = 0; i < 26; i++) { - TiRegion tiRegion = - session - .getRegionManager() - .getRegionByKey(ByteString.copyFromUtf8(String.valueOf((char) ('a' + i)))); + ByteString key = ByteString.copyFromUtf8(String.valueOf((char) ('a' + i))); + TiRegion tiRegion = session.getRegionManager().getRegionByKey(key); RegionStoreClient client = builder.build(tiRegion); - ByteString v = - client.get( - backOffer, - ByteString.copyFromUtf8(String.valueOf((char) ('a' + i))), - pdClient.getTimestamp(backOffer).getVersion()); - assertEquals(v.toStringUtf8(), String.valueOf((char) ('a' + i))); + try { + ByteString v = client.get(backOffer, key, pdClient.getTimestamp(backOffer).getVersion()); + if (hasLock && i == 3) { + // key "d" should be locked + fail(); + } else { + assertEquals(String.valueOf((char) ('a' + i)), v.toStringUtf8()); + } + } catch (KeyException e) { + assertEquals(ByteString.copyFromUtf8("d"), key); + LockInfo lock = e.getKeyError().getLocked(); + assertEquals(key, lock.getKey()); + assertEquals(ByteString.copyFromUtf8("z2"), lock.getPrimaryLock()); + } } } } From 420279acf78377f9f6cb6da6c3f7ef9bcfd59d6c Mon Sep 17 00:00:00 2001 From: Zhexuan Yang Date: Fri, 23 Aug 2019 14:30:33 +0800 Subject: [PATCH 56/62] using stream rather removeIf (#1057) --- core/src/test/scala/org/apache/spark/sql/ViewTestSuite.scala | 4 +++- .../src/main/java/com/pingcap/tikv/catalog/Catalog.java | 3 +-- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/core/src/test/scala/org/apache/spark/sql/ViewTestSuite.scala b/core/src/test/scala/org/apache/spark/sql/ViewTestSuite.scala index b116534711..8319f12af3 100644 --- a/core/src/test/scala/org/apache/spark/sql/ViewTestSuite.scala +++ b/core/src/test/scala/org/apache/spark/sql/ViewTestSuite.scala @@ -20,5 +20,7 @@ class ViewTestSuite extends BaseTiSparkTest { judge(s"select * from $table") intercept[AnalysisException](spark.sql("select * from v")) + + spark.sql("show tables").show(false) } -} \ No newline at end of file +} diff --git a/tikv-client/src/main/java/com/pingcap/tikv/catalog/Catalog.java b/tikv-client/src/main/java/com/pingcap/tikv/catalog/Catalog.java index a0d093110e..1662d2b3e1 100644 --- a/tikv-client/src/main/java/com/pingcap/tikv/catalog/Catalog.java +++ b/tikv-client/src/main/java/com/pingcap/tikv/catalog/Catalog.java @@ -86,8 +86,7 @@ public List listTables(TiDBInfo db) { tableMap = loadTables(db); } Collection tables = tableMap.values(); - tables.removeIf(TiTableInfo::isView); - return ImmutableList.copyOf(tables); + return tables.stream().filter(tbl -> !tbl.isView()).collect(Collectors.toList()); } public TiTableInfo getTable(TiDBInfo db, String tableName) { From b72a7b2038c42186c72f00e7a04b117bd5e874c8 Mon Sep 17 00:00:00 2001 From: birdstorm Date: Fri, 23 Aug 2019 20:58:54 +0800 Subject: [PATCH 57/62] Remove redundant pre-write/commit logic in LockResolverTest (#1062) --- .../main/java/com/pingcap/tikv/PDClient.java | 1 - .../com/pingcap/tikv/ReadOnlyPDClient.java | 3 - .../tikv/region/RegionStoreClient.java | 103 +++++------ .../pingcap/tikv/txn/LockResolverRCTest.java | 36 ++-- .../pingcap/tikv/txn/LockResolverSITest.java | 59 ++++--- .../pingcap/tikv/txn/LockResolverTest.java | 165 +++++------------- 6 files changed, 135 insertions(+), 232 deletions(-) diff --git 
a/tikv-client/src/main/java/com/pingcap/tikv/PDClient.java b/tikv-client/src/main/java/com/pingcap/tikv/PDClient.java index c17de7ad4e..330e34923c 100644 --- a/tikv-client/src/main/java/com/pingcap/tikv/PDClient.java +++ b/tikv-client/src/main/java/com/pingcap/tikv/PDClient.java @@ -178,7 +178,6 @@ public Future getStoreAsync(BackOffer backOffer, long storeId) { public void close() throws InterruptedException { if (service != null) { service.shutdownNow(); - service.awaitTermination(1, TimeUnit.SECONDS); } if (channelFactory != null) { channelFactory.close(); diff --git a/tikv-client/src/main/java/com/pingcap/tikv/ReadOnlyPDClient.java b/tikv-client/src/main/java/com/pingcap/tikv/ReadOnlyPDClient.java index 6996e2212d..1be1bef691 100644 --- a/tikv-client/src/main/java/com/pingcap/tikv/ReadOnlyPDClient.java +++ b/tikv-client/src/main/java/com/pingcap/tikv/ReadOnlyPDClient.java @@ -60,7 +60,4 @@ public interface ReadOnlyPDClient { Store getStore(BackOffer backOffer, long storeId); Future getStoreAsync(BackOffer backOffer, long storeId); - - /** Close underlining resources */ - void close() throws InterruptedException; } diff --git a/tikv-client/src/main/java/com/pingcap/tikv/region/RegionStoreClient.java b/tikv-client/src/main/java/com/pingcap/tikv/region/RegionStoreClient.java index a432bba1eb..5d520db708 100644 --- a/tikv-client/src/main/java/com/pingcap/tikv/region/RegionStoreClient.java +++ b/tikv-client/src/main/java/com/pingcap/tikv/region/RegionStoreClient.java @@ -48,13 +48,13 @@ import org.tikv.kvproto.TikvGrpc.TikvBlockingStub; import org.tikv.kvproto.TikvGrpc.TikvStub; -// Note that RegionStoreClient itself is not thread-safe // TODO: -// 1. RegionStoreClient will be inaccessible directly. -// 2. All apis of RegionStoreClient would not provide retry aside from callWithRetry, -// if a request needs to be retried because of an un-retryable cause, e.g., keys -// need to be re-split across regions/stores, region info outdated, e.t.c., you should -// retry it in an upper client logic (KVClient, TxnClient, e.t.c.) +// 1. RegionStoreClient will be inaccessible directly. +// 2. All apis of RegionStoreClient would not provide retry aside from callWithRetry, +// if a request needs to be retried because of an un-retryable cause, e.g., keys +// need to be re-split across regions/stores, region info outdated, e.t.c., you +// should retry it in an upper client logic (KVClient, TxnClient, e.t.c.) +/** Note that RegionStoreClient itself is not thread-safe */ public class RegionStoreClient extends AbstractRegionStoreClient { public enum RequestTypes { REQ_TYPE_SELECT(101), @@ -85,8 +85,8 @@ public int getValue() { * @param key key to fetch * @param version key version * @return value - * @throws TiClientInternalException - * @throws KeyException + * @throws TiClientInternalException TiSpark Client exception, unexpected + * @throws KeyException Key may be locked */ public ByteString get(BackOffer backOffer, ByteString key, long version) throws TiClientInternalException, KeyException { @@ -109,21 +109,16 @@ public ByteString get(BackOffer backOffer, ByteString key, long version) GetResponse resp = callWithRetry(backOffer, TikvGrpc.METHOD_KV_GET, factory, handler); - handleGetResponse(backOffer, resp); + handleGetResponse(resp); return resp.getValue(); } /** - * @param backOffer - * @param resp - * @return Return true means the rpc call success. Return false means the rpc call fail, - * RegionStoreClient should retry. 
Throw an Exception means the rpc call fail, - * RegionStoreClient cannot handle this kind of error. - * @throws TiClientInternalException - * @throws KeyException + * @param resp GetResponse + * @throws TiClientInternalException TiSpark Client exception, unexpected + * @throws KeyException Key may be locked */ - private void handleGetResponse(BackOffer backOffer, GetResponse resp) - throws TiClientInternalException, KeyException { + private void handleGetResponse(GetResponse resp) throws TiClientInternalException, KeyException { if (resp == null) { this.regionManager.onRequestFail(region); throw new TiClientInternalException("GetResponse failed without a cause"); @@ -157,7 +152,7 @@ public List batchGet(BackOffer backOffer, Iterable keys, lon return handleBatchGetResponse(backOffer, resp); } - private List handleBatchGetResponse(BackOffer bo, BatchGetResponse resp) { + private List handleBatchGetResponse(BackOffer backOffer, BatchGetResponse resp) { if (resp == null) { this.regionManager.onRequestFail(region); throw new TiClientInternalException("BatchGetResponse failed without a cause"); @@ -179,7 +174,7 @@ private List handleBatchGetResponse(BackOffer bo, BatchGetResponse resp) } if (!locks.isEmpty()) { - boolean ok = lockResolverClient.resolveLocks(bo, locks); + boolean ok = lockResolverClient.resolveLocks(backOffer, locks); if (!ok) { // resolveLocks already retried, just throw error to upper logic. throw new TiKVException("locks not resolved, retry"); @@ -261,14 +256,14 @@ public List scan(BackOffer backOffer, ByteString startKey, long version) /** * Prewrite batch keys * - * @param backOffer - * @param primary - * @param mutations - * @param startTs - * @param lockTTL - * @throws TiClientInternalException - * @throws KeyException - * @throws RegionException + * @param backOffer backOffer + * @param primary primary lock of keys + * @param mutations batch key-values as mutations + * @param startTs startTs of prewrite + * @param lockTTL lock ttl + * @throws TiClientInternalException TiSpark Client exception, unexpected + * @throws KeyException Key may be locked + * @throws RegionException region error occurs */ public void prewrite( BackOffer backOffer, @@ -283,21 +278,13 @@ public void prewrite( /** * Prewrite batch keys * - * @param bo - * @param primaryLock - * @param mutations - * @param startVersion - * @param ttl - * @param skipConstraintCheck - * @throws TiClientInternalException - * @throws KeyException - * @throws RegionException + * @param skipConstraintCheck whether to skip constraint check */ public void prewrite( BackOffer bo, ByteString primaryLock, Iterable mutations, - long startVersion, + long startTs, long ttl, boolean skipConstraintCheck) throws TiClientInternalException, KeyException, RegionException { @@ -306,7 +293,7 @@ public void prewrite( () -> PrewriteRequest.newBuilder() .setContext(region.getContext()) - .setStartVersion(startVersion) + .setStartVersion(startTs) .setPrimaryLock(primaryLock) .addAllMutations(mutations) .setLockTtl(ttl) @@ -328,8 +315,8 @@ public void prewrite( } /** - * @param backOffer - * @param resp + * @param backOffer backOffer + * @param resp response * @return Return true means the rpc call success. Return false means the rpc call fail, * RegionStoreClient should retry. 
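Earlier in this series the per-region regrouping loop was dropped from batchGet, and the hunk above keeps that shape: the client only serves keys that belong to its own region, and keys spanning several regions must be re-split by upper-layer logic that does not exist yet. A hypothetical sketch of that upper layer, reusing the grouping idiom from the removed code; everything here is assumed scaffolding rather than part of the patch:

    // Hypothetical upper-layer re-split: group keys by region, then issue one
    // batchGet per region through a client built for that region.
    List<Kvrpcpb.KvPair> batchGetAcrossRegions(
        RegionStoreClientBuilder builder, RegionManager regionManager,
        BackOffer backOffer, List<ByteString> keys, long version) throws GrpcException {
      Map<TiRegion, List<ByteString>> grouped =
          keys.stream().collect(Collectors.groupingBy(regionManager::getRegionByKey));
      List<Kvrpcpb.KvPair> result = new ArrayList<>();
      for (Map.Entry<TiRegion, List<ByteString>> entry : grouped.entrySet()) {
        try (RegionStoreClient client = builder.build(entry.getKey())) {
          result.addAll(client.batchGet(backOffer, entry.getValue(), version));
        }
      }
      return result;
    }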
Throw an Exception means the rpc call fail, * RegionStoreClient cannot handle this kind of error @@ -347,40 +334,42 @@ private boolean isPrewriteSuccess(BackOffer backOffer, PrewriteResponse resp) throw new RegionException(resp.getRegionError()); } - boolean result = true; + boolean isSuccess = true; List locks = new ArrayList<>(); for (KeyError err : resp.getErrorsList()) { if (err.hasLocked()) { - result = false; + isSuccess = false; Lock lock = new Lock(err.getLocked()); locks.add(lock); } else { throw new KeyException(err.toString()); } } + if (isSuccess) { + return true; + } if (!lockResolverClient.resolveLocks(backOffer, locks)) { backOffer.doBackOff(BoTxnLock, new KeyException(resp.getErrorsList().get(0))); } - return result; + return false; } /** * Commit batch keys * - * @param backOffer - * @param keys - * @param startVersion - * @param commitVersion + * @param backOffer backOffer + * @param keys keys to commit + * @param startTs start version + * @param commitTs commit version */ - public void commit( - BackOffer backOffer, Iterable keys, long startVersion, long commitVersion) + public void commit(BackOffer backOffer, Iterable keys, long startTs, long commitTs) throws KeyException { Supplier factory = () -> CommitRequest.newBuilder() - .setStartVersion(startVersion) - .setCommitVersion(commitVersion) + .setStartVersion(startTs) + .setCommitVersion(commitTs) .addAllKeys(keys) .setContext(region.getContext()) .build(); @@ -393,20 +382,16 @@ public void commit( resp -> resp.hasRegionError() ? resp.getRegionError() : null, resp -> resp.hasError() ? resp.getError() : null); CommitResponse resp = callWithRetry(backOffer, TikvGrpc.METHOD_KV_COMMIT, factory, handler); - handleCommitResponse(backOffer, resp); + handleCommitResponse(resp); } /** - * @param backOffer - * @param resp - * @return Return true means the rpc call success. Return false means the rpc call fail, - * RegionStoreClient should retry. 
Throw an Exception means the rpc call fail, - * RegionStoreClient cannot handle this kind of error + * @param resp CommitResponse * @throws TiClientInternalException * @throws RegionException * @throws KeyException */ - private void handleCommitResponse(BackOffer backOffer, CommitResponse resp) + private void handleCommitResponse(CommitResponse resp) throws TiClientInternalException, RegionException, KeyException { if (resp == null) { this.regionManager.onRequestFail(region); diff --git a/tikv-client/src/test/java/com/pingcap/tikv/txn/LockResolverRCTest.java b/tikv-client/src/test/java/com/pingcap/tikv/txn/LockResolverRCTest.java index 1238db9322..408e699194 100644 --- a/tikv-client/src/test/java/com/pingcap/tikv/txn/LockResolverRCTest.java +++ b/tikv-client/src/test/java/com/pingcap/tikv/txn/LockResolverRCTest.java @@ -24,6 +24,8 @@ import com.pingcap.tikv.meta.TiTimestamp; import com.pingcap.tikv.region.RegionStoreClient; import com.pingcap.tikv.region.TiRegion; +import com.pingcap.tikv.util.BackOffer; +import com.pingcap.tikv.util.ConcreteBackOffer; import java.util.Collections; import org.apache.log4j.Logger; import org.junit.Before; @@ -39,7 +41,6 @@ public void setUp() { conf.setIsolationLevel(IsolationLevel.RC); try { session = TiSession.getInstance(conf); - pdClient = session.getPDClient(); this.builder = session.getRegionStoreClientBuilder(); init = true; } catch (Exception e) { @@ -67,38 +68,39 @@ public void RCTest() { skipTest(); return; } - TiTimestamp startTs = pdClient.getTimestamp(backOffer); - TiTimestamp endTs = pdClient.getTimestamp(backOffer); + TiTimestamp startTs = session.getTimestamp(); + TiTimestamp endTs = session.getTimestamp(); // Put into kv putKV("a", "a", startTs.getVersion(), endTs.getVersion()); - startTs = pdClient.getTimestamp(backOffer); - endTs = pdClient.getTimestamp(backOffer); + startTs = session.getTimestamp(); + endTs = session.getTimestamp(); // Prewrite as primary without committing it assertTrue(lockKey("a", "aa", "a", "aa", false, startTs.getVersion(), endTs.getVersion())); TiRegion tiRegion = session.getRegionManager().getRegionByKey(ByteString.copyFromUtf8("a")); RegionStoreClient client = builder.build(tiRegion); - // In RC mode, lock will not be read. is retrieved. - ByteString v = - client.get( - backOffer, ByteString.copyFromUtf8("a"), pdClient.getTimestamp(backOffer).getVersion()); - assertEquals(v.toStringUtf8(), "a"); + + { + BackOffer backOffer = ConcreteBackOffer.newGetBackOff(); + // In RC mode, lock will not be read. is retrieved. + ByteString v = + client.get(backOffer, ByteString.copyFromUtf8("a"), session.getTimestamp().getVersion()); + assertEquals(v.toStringUtf8(), "a"); + } try { // After committing , we can read it. 
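Pulling together the prewrite and commit signatures touched above, here is a hedged sketch of the minimal two-phase write these tests perform through RegionStoreClient: prewrite one key as its own primary, then commit it. The Kvrpcpb.Mutation builder and Op value are assumptions based on kvproto, and session, builder and backOffer are assumed to come from the test fixtures:

    // Sketch only: minimal prewrite + commit of a single key, roughly what the
    // tests' putKV helper does.
    ByteString key = ByteString.copyFromUtf8("k");
    Kvrpcpb.Mutation mutation =
        Kvrpcpb.Mutation.newBuilder()
            .setKey(key)
            .setValue(ByteString.copyFromUtf8("v"))
            .setOp(Kvrpcpb.Op.Put)
            .build();
    long startTs = session.getTimestamp().getVersion();
    RegionStoreClient client = builder.build(key);
    client.prewrite(backOffer, key, Collections.singletonList(mutation), startTs, 3000);
    long commitTs = session.getTimestamp().getVersion();
    client.commit(backOffer, Collections.singletonList(key), startTs, commitTs);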
assertTrue( commit( + Collections.singletonList(ByteString.copyFromUtf8("a")), startTs.getVersion(), - endTs.getVersion(), - Collections.singletonList(ByteString.copyFromUtf8("a")))); - v = - client.get( - backOffer, - ByteString.copyFromUtf8("a"), - pdClient.getTimestamp(backOffer).getVersion()); + endTs.getVersion())); + BackOffer backOffer = ConcreteBackOffer.newGetBackOff(); + ByteString v = + client.get(backOffer, ByteString.copyFromUtf8("a"), session.getTimestamp().getVersion()); assertEquals(v.toStringUtf8(), "aa"); } catch (KeyException e) { fail(); diff --git a/tikv-client/src/test/java/com/pingcap/tikv/txn/LockResolverSITest.java b/tikv-client/src/test/java/com/pingcap/tikv/txn/LockResolverSITest.java index 661ab4183b..43b545ac0e 100644 --- a/tikv-client/src/test/java/com/pingcap/tikv/txn/LockResolverSITest.java +++ b/tikv-client/src/test/java/com/pingcap/tikv/txn/LockResolverSITest.java @@ -24,6 +24,8 @@ import com.pingcap.tikv.meta.TiTimestamp; import com.pingcap.tikv.region.RegionStoreClient; import com.pingcap.tikv.region.TiRegion; +import com.pingcap.tikv.util.BackOffer; +import com.pingcap.tikv.util.ConcreteBackOffer; import java.util.ArrayList; import java.util.Collections; import java.util.List; @@ -42,7 +44,6 @@ public void setUp() { TiSession.clearCache(); try { session = TiSession.getInstance(conf); - pdClient = session.getPDClient(); this.builder = session.getRegionStoreClientBuilder(); init = true; } catch (Exception e) { @@ -72,8 +73,8 @@ public void cleanLockTest() { } for (int i = 0; i < 26; i++) { String k = String.valueOf((char) ('a' + i)); - TiTimestamp startTs = pdClient.getTimestamp(backOffer); - TiTimestamp endTs = pdClient.getTimestamp(backOffer); + TiTimestamp startTs = session.getTimestamp(); + TiTimestamp endTs = session.getTimestamp(); assertTrue(lockKey(k, k, k, k, false, startTs.getVersion(), endTs.getVersion())); } @@ -92,12 +93,12 @@ public void cleanLockTest() { keys.add(ByteString.copyFromUtf8(k)); } - TiTimestamp startTs = pdClient.getTimestamp(backOffer); - TiTimestamp endTs = pdClient.getTimestamp(backOffer); + TiTimestamp startTs = session.getTimestamp(); + TiTimestamp endTs = session.getTimestamp(); boolean res = prewrite(mutations, startTs.getVersion(), mutations.get(0)); assertTrue(res); - res = commit(startTs.getVersion(), endTs.getVersion(), keys); + res = commit(keys, startTs.getVersion(), endTs.getVersion()); assertTrue(res); for (int i = 0; i < 26; i++) { @@ -106,11 +107,12 @@ public void cleanLockTest() { .getRegionManager() .getRegionByKey(ByteString.copyFromUtf8(String.valueOf((char) ('a' + i)))); RegionStoreClient client = builder.build(tiRegion); + BackOffer backOffer = ConcreteBackOffer.newGetBackOff(); ByteString v = client.get( backOffer, ByteString.copyFromUtf8(String.valueOf((char) ('a' + i))), - pdClient.getTimestamp(backOffer).getVersion()); + session.getTimestamp().getVersion()); assertEquals(v.toStringUtf8(), String.valueOf((char) ('a' + i + 1))); } } @@ -121,19 +123,20 @@ public void txnStatusTest() { skipTest(); return; } - TiTimestamp startTs = pdClient.getTimestamp(backOffer); - TiTimestamp endTs = pdClient.getTimestamp(backOffer); + TiTimestamp startTs = session.getTimestamp(); + TiTimestamp endTs = session.getTimestamp(); putKV("a", "a", startTs.getVersion(), endTs.getVersion()); TiRegion tiRegion = session.getRegionManager().getRegionByKey(ByteString.copyFromUtf8("a")); RegionStoreClient client = builder.build(tiRegion); + BackOffer backOffer = ConcreteBackOffer.newCustomBackOff(BackOffer.CLEANUP_MAX_BACKOFF); long 
status = client.lockResolverClient.getTxnStatus( backOffer, startTs.getVersion(), ByteString.copyFromUtf8("a")); assertEquals(status, endTs.getVersion()); - startTs = pdClient.getTimestamp(backOffer); - endTs = pdClient.getTimestamp(backOffer); + startTs = session.getTimestamp(); + endTs = session.getTimestamp(); assertTrue(lockKey("a", "a", "a", "a", true, startTs.getVersion(), endTs.getVersion())); tiRegion = session.getRegionManager().getRegionByKey(ByteString.copyFromUtf8("a")); @@ -143,8 +146,8 @@ public void txnStatusTest() { backOffer, startTs.getVersion(), ByteString.copyFromUtf8("a")); assertEquals(status, endTs.getVersion()); - startTs = pdClient.getTimestamp(backOffer); - endTs = pdClient.getTimestamp(backOffer); + startTs = session.getTimestamp(); + endTs = session.getTimestamp(); assertTrue(lockKey("a", "a", "a", "a", false, startTs.getVersion(), endTs.getVersion())); tiRegion = session.getRegionManager().getRegionByKey(ByteString.copyFromUtf8("a")); @@ -161,14 +164,14 @@ public void SITest() { skipTest(); return; } - TiTimestamp startTs = pdClient.getTimestamp(backOffer); - TiTimestamp endTs = pdClient.getTimestamp(backOffer); + TiTimestamp startTs = session.getTimestamp(); + TiTimestamp endTs = session.getTimestamp(); // Put into kv putKV("a", "a", startTs.getVersion(), endTs.getVersion()); - startTs = pdClient.getTimestamp(backOffer); - endTs = pdClient.getTimestamp(backOffer); + startTs = session.getTimestamp(); + endTs = session.getTimestamp(); // Prewrite as primary without committing it assertTrue(lockKey("a", "aa", "a", "aa", false, startTs.getVersion(), endTs.getVersion())); @@ -177,27 +180,29 @@ public void SITest() { RegionStoreClient client = builder.build(tiRegion); try { + BackOffer backOffer = ConcreteBackOffer.newGetBackOff(); // In SI mode, a lock is read. Try resolve it if expires TTL. - client.get( - backOffer, ByteString.copyFromUtf8("a"), pdClient.getTimestamp(backOffer).getVersion()); + client.get(backOffer, ByteString.copyFromUtf8("a"), session.getTimestamp().getVersion()); fail(); } catch (KeyException e) { assertEquals(ByteString.copyFromUtf8("a"), e.getKeyError().getLocked().getKey()); } - // With TTL set to 10, after 10 milliseconds is resolved. - // We should be able to read instead. - ByteString v = - client.get( - backOffer, ByteString.copyFromUtf8("a"), pdClient.getTimestamp(backOffer).getVersion()); - assertEquals(v.toStringUtf8(), String.valueOf('a')); + { + BackOffer backOffer = ConcreteBackOffer.newGetBackOff(); + // With TTL set to 10, after 10 milliseconds is resolved. + // We should be able to read instead. 
+ ByteString v = + client.get(backOffer, ByteString.copyFromUtf8("a"), session.getTimestamp().getVersion()); + assertEquals(v.toStringUtf8(), String.valueOf('a')); + } try { // Trying to continue the commit phase of will fail because TxnLockNotFound commit( + Collections.singletonList(ByteString.copyFromUtf8("a")), startTs.getVersion(), - endTs.getVersion(), - Collections.singletonList(ByteString.copyFromUtf8("a"))); + endTs.getVersion()); fail(); } catch (KeyException e) { assertFalse(e.getKeyError().getRetryable().isEmpty()); diff --git a/tikv-client/src/test/java/com/pingcap/tikv/txn/LockResolverTest.java b/tikv-client/src/test/java/com/pingcap/tikv/txn/LockResolverTest.java index 24e64209b4..5d69e093e9 100644 --- a/tikv-client/src/test/java/com/pingcap/tikv/txn/LockResolverTest.java +++ b/tikv-client/src/test/java/com/pingcap/tikv/txn/LockResolverTest.java @@ -15,37 +15,30 @@ package com.pingcap.tikv.txn; -import static com.pingcap.tikv.util.BackOffFunction.BackOffFuncType.BoTxnLock; import static junit.framework.TestCase.*; import com.google.protobuf.ByteString; -import com.pingcap.tikv.ReadOnlyPDClient; import com.pingcap.tikv.TiSession; import com.pingcap.tikv.exception.KeyException; import com.pingcap.tikv.exception.RegionException; import com.pingcap.tikv.meta.TiTimestamp; -import com.pingcap.tikv.operation.KVErrorHandler; import com.pingcap.tikv.region.RegionStoreClient; import com.pingcap.tikv.region.TiRegion; +import com.pingcap.tikv.util.BackOffFunction; import com.pingcap.tikv.util.BackOffer; import com.pingcap.tikv.util.ConcreteBackOffer; -import java.lang.reflect.Field; import java.util.ArrayList; import java.util.Arrays; import java.util.Collections; import java.util.List; -import java.util.function.Supplier; import org.apache.log4j.Logger; import org.junit.Before; import org.tikv.kvproto.Kvrpcpb.*; -import org.tikv.kvproto.TikvGrpc; public abstract class LockResolverTest { private final Logger logger = Logger.getLogger(this.getClass()); TiSession session; private static final int DefaultTTL = 10; - BackOffer backOffer = ConcreteBackOffer.newCustomBackOff(1000); - ReadOnlyPDClient pdClient; RegionStoreClient.RegionStoreClientBuilder builder; boolean init; @@ -62,87 +55,46 @@ void putKV(String key, String value, long startTS, long commitTS) { boolean res = prewrite(Collections.singletonList(m), startTS, m); assertTrue(res); - res = commit(startTS, commitTS, Collections.singletonList(ByteString.copyFromUtf8(key))); + res = commit(Collections.singletonList(ByteString.copyFromUtf8(key)), startTS, commitTS); assertTrue(res); } boolean prewrite(List mutations, long startTS, Mutation primary) { if (mutations.size() == 0) return true; + BackOffer backOffer = ConcreteBackOffer.newCustomBackOff(1000); - /*for (Mutation m : mutations) { + for (Mutation m : mutations) { while (true) { try { TiRegion region = session.getRegionManager().getRegionByKey(m.getKey()); RegionStoreClient client = builder.build(region); - client.prewrite(backOffer, primary.getKey(), mutations, startTS, DefaultTTL); + client.prewrite( + backOffer, primary.getKey(), Collections.singletonList(m), startTS, DefaultTTL); break; - } catch (Exception e) { - logger.warn(e.getMessage()); + } catch (RegionException e) { + backOffer.doBackOff(BackOffFunction.BackOffFuncType.BoRegionMiss, e); } } - }*/ - for (Mutation m : mutations) { - TiRegion region = session.getRegionManager().getRegionByKey(m.getKey()); - RegionStoreClient client = builder.build(region); - - Supplier factory = - () -> - PrewriteRequest.newBuilder() - 
.addAllMutations(Collections.singletonList(m)) - .setPrimaryLock(primary.getKey()) - .setStartVersion(startTS) - .setLockTtl(DefaultTTL) - .setContext(region.getContext()) - .build(); - - KVErrorHandler handler = - new KVErrorHandler<>( - session.getRegionManager(), - client, - client.lockResolverClient, - region, - resp -> resp.hasRegionError() ? resp.getRegionError() : null, - resp -> null); - PrewriteResponse resp = - client.callWithRetry(backOffer, TikvGrpc.METHOD_KV_PREWRITE, factory, handler); - - if (resp.hasRegionError()) { - throw new RegionException(resp.getRegionError()); - } + } + return true; + } - if (resp.getErrorsCount() == 0) { - continue; - } + boolean commit(List keys, long startTS, long commitTS) { + if (keys.size() == 0) return true; + BackOffer backOffer = ConcreteBackOffer.newCustomBackOff(1000); - List locks = new ArrayList<>(); - for (KeyError err : resp.getErrorsList()) { - if (err.hasLocked()) { - Lock lock = new Lock(err.getLocked()); - locks.add(lock); - } else { - throw new KeyException(err); + for (ByteString k : keys) { + while (true) { + try { + TiRegion tiRegion = session.getRegionManager().getRegionByKey(k); + RegionStoreClient client = builder.build(tiRegion); + client.commit(backOffer, Collections.singletonList(k), startTS, commitTS); + break; + } catch (RegionException e) { + backOffer.doBackOff(BackOffFunction.BackOffFuncType.BoRegionMiss, e); } } - - LockResolverClient resolver = null; - try { - Field field = RegionStoreClient.class.getDeclaredField("lockResolverClient"); - assert (field != null); - field.setAccessible(true); - resolver = (LockResolverClient) (field.get(client)); - } catch (Exception e) { - fail(); - } - - assertNotNull(resolver); - - if (!resolver.resolveLocks(backOffer, locks)) { - backOffer.doBackOff(BoTxnLock, new KeyException(resp.getErrorsList().get(0))); - } - - prewrite(Collections.singletonList(m), startTS, primary); } - return true; } @@ -174,62 +126,24 @@ boolean lockKey( if (commitPrimary) { if (!key.equals(primaryKey)) { return commit( + Arrays.asList(ByteString.copyFromUtf8(primaryKey), ByteString.copyFromUtf8(key)), startTs, - commitTS, - Arrays.asList(ByteString.copyFromUtf8(primaryKey), ByteString.copyFromUtf8(key))); + commitTS); } else { return commit( - startTs, commitTS, Collections.singletonList(ByteString.copyFromUtf8(primaryKey))); + Collections.singletonList(ByteString.copyFromUtf8(primaryKey)), startTs, commitTS); } } return true; } - boolean commit(long startTS, long commitTS, List keys) { - if (keys.size() == 0) return true; - - for (ByteString k : keys) { - TiRegion tiRegion = session.getRegionManager().getRegionByKey(k); - - RegionStoreClient client = builder.build(tiRegion); - Supplier factory = - () -> - CommitRequest.newBuilder() - .setStartVersion(startTS) - .setCommitVersion(commitTS) - .addAllKeys(Collections.singletonList(k)) - .setContext(tiRegion.getContext()) - .build(); - - KVErrorHandler handler = - new KVErrorHandler<>( - session.getRegionManager(), - client, - client.lockResolverClient, - tiRegion, - resp -> resp.hasRegionError() ? resp.getRegionError() : null, - resp -> resp.hasError() ? 
resp.getError() : null); - CommitResponse resp = - client.callWithRetry(backOffer, TikvGrpc.METHOD_KV_COMMIT, factory, handler); - - if (resp.hasRegionError()) { - throw new RegionException(resp.getRegionError()); - } - - if (resp.hasError()) { - throw new KeyException(resp.getError()); - } - } - return true; - } - void putAlphabet() { for (int i = 0; i < 26; i++) { - long startTs = pdClient.getTimestamp(backOffer).getVersion(); - long endTs = pdClient.getTimestamp(backOffer).getVersion(); + long startTs = session.getTimestamp().getVersion(); + long endTs = session.getTimestamp().getVersion(); while (startTs == endTs) { - endTs = pdClient.getTimestamp(backOffer).getVersion(); + endTs = session.getTimestamp().getVersion(); } putKV(String.valueOf((char) ('a' + i)), String.valueOf((char) ('a' + i)), startTs, endTs); } @@ -237,23 +151,23 @@ void putAlphabet() { } void prepareAlphabetLocks() { - TiTimestamp startTs = pdClient.getTimestamp(backOffer); - TiTimestamp endTs = pdClient.getTimestamp(backOffer); + TiTimestamp startTs = session.getTimestamp(); + TiTimestamp endTs = session.getTimestamp(); while (startTs == endTs) { - endTs = pdClient.getTimestamp(backOffer); + endTs = session.getTimestamp(); } putKV("c", "cc", startTs.getVersion(), endTs.getVersion()); - startTs = pdClient.getTimestamp(backOffer); - endTs = pdClient.getTimestamp(backOffer); + startTs = session.getTimestamp(); + endTs = session.getTimestamp(); while (startTs == endTs) { - endTs = pdClient.getTimestamp(backOffer); + endTs = session.getTimestamp(); } assertTrue(lockKey("c", "c", "z1", "z1", true, startTs.getVersion(), endTs.getVersion())); - startTs = pdClient.getTimestamp(backOffer); - endTs = pdClient.getTimestamp(backOffer); + startTs = session.getTimestamp(); + endTs = session.getTimestamp(); while (startTs == endTs) { - endTs = pdClient.getTimestamp(backOffer); + endTs = session.getTimestamp(); } assertTrue(lockKey("d", "dd", "z2", "z2", false, startTs.getVersion(), endTs.getVersion())); } @@ -271,8 +185,9 @@ void versionTest(boolean hasLock) { ByteString key = ByteString.copyFromUtf8(String.valueOf((char) ('a' + i))); TiRegion tiRegion = session.getRegionManager().getRegionByKey(key); RegionStoreClient client = builder.build(tiRegion); + BackOffer backOffer = ConcreteBackOffer.newGetBackOff(); try { - ByteString v = client.get(backOffer, key, pdClient.getTimestamp(backOffer).getVersion()); + ByteString v = client.get(backOffer, key, session.getTimestamp().getVersion()); if (hasLock && i == 3) { // key "d" should be locked fail(); From f578c3beb9162a89f283fd7b441a8e7832c8aa98 Mon Sep 17 00:00:00 2001 From: Zhexuan Yang Date: Mon, 26 Aug 2019 16:19:01 +0800 Subject: [PATCH 58/62] adding recreate flag when create tisession (#1064) --- .../scala/org/apache/spark/sql/TiContext.scala | 2 +- .../datasource/BaseDataSourceTest.scala | 2 -- .../main/java/com/pingcap/tikv/TiSession.java | 18 ++++++++---------- .../pingcap/tikv/txn/LockResolverSITest.java | 1 - 4 files changed, 9 insertions(+), 14 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/sql/TiContext.scala b/core/src/main/scala/org/apache/spark/sql/TiContext.scala index 4be81bb05e..b45bf8d540 100644 --- a/core/src/main/scala/org/apache/spark/sql/TiContext.scala +++ b/core/src/main/scala/org/apache/spark/sql/TiContext.scala @@ -43,7 +43,7 @@ class TiContext(val sparkSession: SparkSession, options: Option[TiDBOptions] = N lazy val sqlContext: SQLContext = sparkSession.sqlContext val conf: SparkConf = mergeWithDataSourceConfig(sparkSession.sparkContext.conf, 
options) val tiConf: TiConfiguration = TiUtil.sparkConfToTiConf(conf) - val tiSession: TiSession = TiSession.getInstance(tiConf) + val tiSession: TiSession = TiSession.getInstance(tiConf, true) val meta: MetaManager = new MetaManager(tiSession.getCatalog) StatisticsManager.initStatisticsManager(tiSession) diff --git a/core/src/test/scala/com/pingcap/tispark/datasource/BaseDataSourceTest.scala b/core/src/test/scala/com/pingcap/tispark/datasource/BaseDataSourceTest.scala index 4a154ac475..1c53e776a0 100644 --- a/core/src/test/scala/com/pingcap/tispark/datasource/BaseDataSourceTest.scala +++ b/core/src/test/scala/com/pingcap/tispark/datasource/BaseDataSourceTest.scala @@ -36,8 +36,6 @@ class BaseDataSourceTest(val table: String, protected def dropTable(): Unit = { jdbcUpdate(s"drop table if exists $dbtable") - // If we reuse tiSession, cache in catalog will be outdated after dropping and creating table. - TiSession.clearCache() } protected def dropTable(tblName: String): Unit = { diff --git a/tikv-client/src/main/java/com/pingcap/tikv/TiSession.java b/tikv-client/src/main/java/com/pingcap/tikv/TiSession.java index dfbc2f28a5..ef2546c9d5 100644 --- a/tikv-client/src/main/java/com/pingcap/tikv/TiSession.java +++ b/tikv-client/src/main/java/com/pingcap/tikv/TiSession.java @@ -45,12 +45,10 @@ public class TiSession implements AutoCloseable { private static final Map sessionCachedMap = new HashMap<>(); - // Since we create session as singleton now, configuration change will not - // reflect change - public static TiSession getInstance(TiConfiguration conf) { + public static TiSession getInstance(TiConfiguration conf, boolean recreate) { synchronized (sessionCachedMap) { String key = conf.getPdAddrsString(); - if (sessionCachedMap.containsKey(key)) { + if (sessionCachedMap.containsKey(key) && !recreate) { return sessionCachedMap.get(key); } @@ -60,6 +58,12 @@ public static TiSession getInstance(TiConfiguration conf) { } } + // Since we create session as singleton now, configuration change will not + // reflect change + public static TiSession getInstance(TiConfiguration conf) { + return getInstance(conf, false); + } + private TiSession(TiConfiguration conf) { this.conf = conf; this.channelFactory = new ChannelFactory(conf.getMaxFrameSize()); @@ -188,12 +192,6 @@ public void injectCallBackFunc(Function callBackFunc this.cacheInvalidateCallback = callBackFunc; } - public static void clearCache() { - synchronized (sessionCachedMap) { - sessionCachedMap.clear(); - } - } - @Override public synchronized void close() throws Exception { synchronized (sessionCachedMap) { diff --git a/tikv-client/src/test/java/com/pingcap/tikv/txn/LockResolverSITest.java b/tikv-client/src/test/java/com/pingcap/tikv/txn/LockResolverSITest.java index 43b545ac0e..1567ce53df 100644 --- a/tikv-client/src/test/java/com/pingcap/tikv/txn/LockResolverSITest.java +++ b/tikv-client/src/test/java/com/pingcap/tikv/txn/LockResolverSITest.java @@ -41,7 +41,6 @@ public class LockResolverSITest extends LockResolverTest { public void setUp() { TiConfiguration conf = TiConfiguration.createDefault("127.0.0.1:2379"); conf.setIsolationLevel(IsolationLevel.SI); - TiSession.clearCache(); try { session = TiSession.getInstance(conf); this.builder = session.getRegionStoreClientBuilder(); From 8bebb76cfdcb1e9c87df57aa4ec11c34316793ac Mon Sep 17 00:00:00 2001 From: Zhexuan Yang Date: Mon, 26 Aug 2019 20:53:05 +0800 Subject: [PATCH 59/62] fix issue 1047 (#1066) --- .../sql/test/generator/ColumnValueGenerator.scala | 13 ++++++++++--- 1 file changed, 10 
insertions(+), 3 deletions(-) diff --git a/core/src/test/scala/org/apache/spark/sql/test/generator/ColumnValueGenerator.scala b/core/src/test/scala/org/apache/spark/sql/test/generator/ColumnValueGenerator.scala index 9cf36ecbb7..03cbf0d69f 100644 --- a/core/src/test/scala/org/apache/spark/sql/test/generator/ColumnValueGenerator.scala +++ b/core/src/test/scala/org/apache/spark/sql/test/generator/ColumnValueGenerator.scala @@ -236,7 +236,7 @@ case class ColumnValueGenerator(dataType: ReflectedDataType, generatedRandomValues = if (generateUnique) { assert(n <= rangeSize, "random generator cannot generate unique value less than available") val set: mutable.Set[Any] = mutable.HashSet.empty[Any] - set += specialBound.map(TestDataGenerator.hash) + set ++= specialBound.map(TestDataGenerator.hash) (0L until n - specialBound.size).map { _ => randomUniqueValue(r, set) }.toList ++ specialBound @@ -245,9 +245,16 @@ case class ColumnValueGenerator(dataType: ReflectedDataType, randomValue(r) }.toList ++ specialBound } + + val expectedGeneratedRandomValuesLen = if (generateUnique) { + generatedRandomValues.toSet.size + } else { + generatedRandomValues.size + } + assert( - generatedRandomValues.size >= n, - s"Generate values size=$generatedRandomValues less than n=$n" + expectedGeneratedRandomValuesLen >= n, + s"Generate values size=$generatedRandomValues less than n=$n on datatype $dataType" ) curPos = 0 } From 927cacf07c0c80fe3dedb5c6415cc790c4f9750d Mon Sep 17 00:00:00 2001 From: Zhexuan Yang Date: Mon, 26 Aug 2019 21:59:33 +0800 Subject: [PATCH 60/62] cleanup code in TiBatchWrite (#1067) --- core/src/main/scala/com/pingcap/tispark/TiBatchWrite.scala | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/core/src/main/scala/com/pingcap/tispark/TiBatchWrite.scala b/core/src/main/scala/com/pingcap/tispark/TiBatchWrite.scala index c364571eeb..ed24c0b749 100644 --- a/core/src/main/scala/com/pingcap/tispark/TiBatchWrite.scala +++ b/core/src/main/scala/com/pingcap/tispark/TiBatchWrite.scala @@ -18,7 +18,6 @@ package com.pingcap.tispark import java.util import com.pingcap.tikv.allocator.RowIDAllocator -import com.pingcap.tikv.catalog.Catalog import com.pingcap.tikv.codec.{CodecDataOutput, KeyUtils, TableCodec} import com.pingcap.tikv.exception.TiBatchWriteException import com.pingcap.tikv.key.{IndexKey, Key, RowKey} @@ -66,7 +65,6 @@ class TiBatchWrite(@transient val df: DataFrame, private var tiConf: TiConfiguration = _ @transient private var tiSession: TiSession = _ - @transient private var catalog: Catalog = _ private var tiTableRef: TiTableReference = _ private var tiDBInfo: TiDBInfo = _ @@ -128,7 +126,6 @@ class TiBatchWrite(@transient val df: DataFrame, tiTableRef = options.tiTableRef tiDBInfo = tiSession.getCatalog.getDatabase(tiTableRef.databaseName) tiTableInfo = tiSession.getCatalog.getTable(tiTableRef.databaseName, tiTableRef.tableName) - catalog = TiSession.getInstance(tiConf).getCatalog if (tiTableInfo == null) { throw new NoSuchTableException(tiTableRef.databaseName, tiTableRef.tableName) @@ -410,7 +407,7 @@ class TiBatchWrite(@transient val df: DataFrame, .create( tiDBInfo.getId, tiTableInfo.getId, - tiSession.getConf, + tiConf, tiTableInfo.isAutoIncColUnsigned, step ) From b402ade19d6bffb7dae49db2f7bf88c71bd78c09 Mon Sep 17 00:00:00 2001 From: Liangliang Gu Date: Tue, 27 Aug 2019 17:40:36 +0800 Subject: [PATCH 61/62] release tispark-2.1.4 (#1068) (#1069) (cherry picked from commit fd8068a31f4a3bf6c97a45070ff1f588859fb66e) --- CHANGELOG.md | 5 +++++ 1 file changed, 5 insertions(+) 
diff --git a/CHANGELOG.md b/CHANGELOG.md index c71dbb11c7..afd976947a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,6 +1,11 @@ # TiSpark Changelog All notable changes to this project will be documented in this file. +## [TiSpark 2.1.4] 2019-08-27 +### Fixes +- Fix distinct without alias bug: disable pushdown aggregate with alias [#1055](https://github.com/pingcap/tispark/pull/1055) +- Fix reflection bug: pass in different arguments for different version of same function [#1037](https://github.com/pingcap/tispark/pull/1037) + ## [TiSpark 2.1.3] 2019-08-15 ### Fixes - Fix cost model in table scan [#1023](https://github.com/pingcap/tispark/pull/1023) From 8888d30e43d87bd87b75fa58894fd3a26ba20f71 Mon Sep 17 00:00:00 2001 From: Liangliang Gu Date: Tue, 27 Aug 2019 17:48:07 +0800 Subject: [PATCH 62/62] update document for tispark-2.1.4 release (#1070) --- README.md | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index 8cc6fd8175..7b77e65bbe 100755 --- a/README.md +++ b/README.md @@ -17,9 +17,9 @@ The latest stable version compatible with **Spark 2.1.0+** is **TiSpark 1.2.1** **When using TiSpark 1.2.1, please follow the [document for Spark 2.1](./docs/userguide_spark2.1.md)** -**When using TiSpark 2.1.2 with Spark 2.3.0+, please use version `2.1.2-spark_2.3` and follow the [document for Spark 2.3+](./docs/userguide.md)** +**When using TiSpark 2.1.4 with Spark 2.3.0+, please use version `2.1.4-spark_2.3` and follow the [document for Spark 2.3+](./docs/userguide.md)** -**When using TiSpark 2.1.2 with Spark 2.4.0+, please use version `2.1.2-spark_2.4` and follow the [document for Spark 2.3+](./docs/userguide.md)** +**When using TiSpark 2.1.4 with Spark 2.4.0+, please use version `2.1.4-spark_2.4` and follow the [document for Spark 2.3+](./docs/userguide.md)** You may also [build from sources](#how-to-build-from-sources) to try the new features on TiSpark master branch. @@ -29,7 +29,7 @@ If you are using maven(recommended), add the following to your pom.xml: com.pingcap.tispark tispark-core - 2.1.2-spark_${spark.version} + 2.1.4-spark_${spark.version} ``` @@ -55,8 +55,8 @@ Remember to add `-Dmaven.test.skip=true` to skip all the tests if you don't need | Spark Version | Stable TiSpark Version | | ------------- | ---------------------- | -| Spark-2.4.x | TiSpark-2.1.2 | -| Spark-2.3.x | TiSpark-2.1.2 | +| Spark-2.4.x | TiSpark-2.1.4 | +| Spark-2.3.x | TiSpark-2.1.4 | | Spark-2.2.x | TiSpark-1.2.1 | | Spark-2.1.x | TiSpark-1.2.1 |
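
For reference, a minimal sketch of how the refactored client APIs in the patches above fit together: TiSession.getInstance(conf, recreate) from PATCH 58, session.getTimestamp() in place of a standalone PD client, and the reordered RegionStoreClient.commit(backOffer, keys, startTs, commitTs) with a per-call BackOffer. This is illustrative only and not code from any patch; the class name, PD address, and key literal are placeholders, and it assumes the key has already been prewritten with startTs by the same transaction before commit is called.

    import com.google.protobuf.ByteString;
    import com.pingcap.tikv.TiConfiguration;
    import com.pingcap.tikv.TiSession;
    import com.pingcap.tikv.meta.TiTimestamp;
    import com.pingcap.tikv.region.RegionStoreClient;
    import com.pingcap.tikv.region.TiRegion;
    import com.pingcap.tikv.util.BackOffer;
    import com.pingcap.tikv.util.ConcreteBackOffer;
    import java.util.Collections;

    public class ClientApiSketch {
      public static void main(String[] args) throws Exception {
        // Placeholder PD address; recreate=true forces a fresh TiSession instead of the cached one.
        TiConfiguration conf = TiConfiguration.createDefault("127.0.0.1:2379");
        TiSession session = TiSession.getInstance(conf, true);

        // Timestamps now come from the session rather than a separate ReadOnlyPDClient.
        TiTimestamp startTs = session.getTimestamp();
        TiTimestamp commitTs = session.getTimestamp();

        ByteString key = ByteString.copyFromUtf8("a");
        TiRegion region = session.getRegionManager().getRegionByKey(key);
        RegionStoreClient client = session.getRegionStoreClientBuilder().build(region);

        // New argument order: keys first, then start/commit versions; fresh back-offer per call.
        // Assumes `key` was prewritten with startTs by this transaction beforehand.
        BackOffer commitBackOff = ConcreteBackOffer.newCustomBackOff(1000);
        client.commit(
            commitBackOff, Collections.singletonList(key), startTs.getVersion(), commitTs.getVersion());

        // Reads also take their own back-offer and a snapshot version from the session.
        BackOffer getBackOff = ConcreteBackOffer.newGetBackOff();
        ByteString value = client.get(getBackOff, key, session.getTimestamp().getVersion());
        System.out.println(value.toStringUtf8());

        session.close();
      }
    }

Scoping a new ConcreteBackOffer to each RPC, as the reworked tests do, keeps retry budgets independent across calls instead of sharing one long-lived back-offer.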