Skip to content

Commit

Permalink
sql: implement pgvector datatype and evaluation
Browse files Browse the repository at this point in the history
Release note (sql change): implement pgvector encoding, decoding, and
operators, without index acceleration.
  • Loading branch information
jordanlewis committed May 17, 2024
1 parent 6a45828 commit 7661327
Show file tree
Hide file tree
Showing 74 changed files with 1,705 additions and 67 deletions.
12 changes: 11 additions & 1 deletion docs/generated/sql/bnf/stmt_block.bnf
Original file line number Diff line number Diff line change
Expand Up @@ -1564,6 +1564,7 @@ col_name_keyword ::=
| 'VALUES'
| 'VARBIT'
| 'VARCHAR'
| 'VECTOR'
| 'VIRTUAL'
| 'WORK'

Expand Down Expand Up @@ -1751,7 +1752,7 @@ backup_options_list ::=
( backup_options ) ( ( ',' backup_options ) )*

a_expr ::=
( c_expr | '+' a_expr | '-' a_expr | '~' a_expr | 'SQRT' a_expr | 'CBRT' a_expr | qual_op a_expr | 'NOT' a_expr | 'NOT' a_expr | row 'OVERLAPS' row | 'DEFAULT' ) ( ( 'TYPECAST' cast_target | 'TYPEANNOTATE' typename | 'COLLATE' collation_name | 'AT' 'TIME' 'ZONE' a_expr | '+' a_expr | '-' a_expr | '*' a_expr | '/' a_expr | 'FLOORDIV' a_expr | '%' a_expr | '^' a_expr | '#' a_expr | '&' a_expr | '|' a_expr | '<' a_expr | '>' a_expr | '?' a_expr | 'JSON_SOME_EXISTS' a_expr | 'JSON_ALL_EXISTS' a_expr | 'CONTAINS' a_expr | 'CONTAINED_BY' a_expr | '=' a_expr | 'CONCAT' a_expr | 'LSHIFT' a_expr | 'RSHIFT' a_expr | 'FETCHVAL' a_expr | 'FETCHTEXT' a_expr | 'FETCHVAL_PATH' a_expr | 'FETCHTEXT_PATH' a_expr | 'REMOVE_PATH' a_expr | 'INET_CONTAINED_BY_OR_EQUALS' a_expr | 'AND_AND' a_expr | 'AT_AT' a_expr | 'INET_CONTAINS_OR_EQUALS' a_expr | 'LESS_EQUALS' a_expr | 'GREATER_EQUALS' a_expr | 'NOT_EQUALS' a_expr | qual_op a_expr | 'AND' a_expr | 'OR' a_expr | 'LIKE' a_expr | 'LIKE' a_expr 'ESCAPE' a_expr | 'NOT' 'LIKE' a_expr | 'NOT' 'LIKE' a_expr 'ESCAPE' a_expr | 'ILIKE' a_expr | 'ILIKE' a_expr 'ESCAPE' a_expr | 'NOT' 'ILIKE' a_expr | 'NOT' 'ILIKE' a_expr 'ESCAPE' a_expr | 'SIMILAR' 'TO' a_expr | 'SIMILAR' 'TO' a_expr 'ESCAPE' a_expr | 'NOT' 'SIMILAR' 'TO' a_expr | 'NOT' 'SIMILAR' 'TO' a_expr 'ESCAPE' a_expr | '~' a_expr | 'NOT_REGMATCH' a_expr | 'REGIMATCH' a_expr | 'NOT_REGIMATCH' a_expr | 'IS' 'NAN' | 'IS' 'NOT' 'NAN' | 'IS' 'NULL' | 'ISNULL' | 'IS' 'NOT' 'NULL' | 'NOTNULL' | 'IS' 'TRUE' | 'IS' 'NOT' 'TRUE' | 'IS' 'FALSE' | 'IS' 'NOT' 'FALSE' | 'IS' 'UNKNOWN' | 'IS' 'NOT' 'UNKNOWN' | 'IS' 'DISTINCT' 'FROM' a_expr | 'IS' 'NOT' 'DISTINCT' 'FROM' a_expr | 'IS' 'OF' '(' type_list ')' | 'IS' 'NOT' 'OF' '(' type_list ')' | 'BETWEEN' opt_asymmetric b_expr 'AND' a_expr | 'NOT' 'BETWEEN' opt_asymmetric b_expr 'AND' a_expr | 'BETWEEN' 'SYMMETRIC' b_expr 'AND' a_expr | 'NOT' 'BETWEEN' 'SYMMETRIC' b_expr 'AND' a_expr | 'IN' in_expr | 'NOT' 'IN' in_expr | subquery_op sub_type a_expr ) )*
( c_expr | '+' a_expr | '-' a_expr | '~' a_expr | 'SQRT' a_expr | 'CBRT' a_expr | qual_op a_expr | 'NOT' a_expr | 'NOT' a_expr | row 'OVERLAPS' row | 'DEFAULT' ) ( ( 'TYPECAST' cast_target | 'TYPEANNOTATE' typename | 'COLLATE' collation_name | 'AT' 'TIME' 'ZONE' a_expr | '+' a_expr | '-' a_expr | '*' a_expr | '/' a_expr | 'FLOORDIV' a_expr | '%' a_expr | '^' a_expr | '#' a_expr | '&' a_expr | '|' a_expr | '<' a_expr | '>' a_expr | '?' a_expr | 'JSON_SOME_EXISTS' a_expr | 'JSON_ALL_EXISTS' a_expr | 'CONTAINS' a_expr | 'CONTAINED_BY' a_expr | '=' a_expr | 'CONCAT' a_expr | 'LSHIFT' a_expr | 'RSHIFT' a_expr | 'FETCHVAL' a_expr | 'FETCHTEXT' a_expr | 'FETCHVAL_PATH' a_expr | 'FETCHTEXT_PATH' a_expr | 'REMOVE_PATH' a_expr | 'INET_CONTAINED_BY_OR_EQUALS' a_expr | 'AND_AND' a_expr | 'AT_AT' a_expr | 'DISTANCE' a_expr | 'COS_DISTANCE' a_expr | 'NEG_INNER_PRODUCT' a_expr | 'INET_CONTAINS_OR_EQUALS' a_expr | 'LESS_EQUALS' a_expr | 'GREATER_EQUALS' a_expr | 'NOT_EQUALS' a_expr | qual_op a_expr | 'AND' a_expr | 'OR' a_expr | 'LIKE' a_expr | 'LIKE' a_expr 'ESCAPE' a_expr | 'NOT' 'LIKE' a_expr | 'NOT' 'LIKE' a_expr 'ESCAPE' a_expr | 'ILIKE' a_expr | 'ILIKE' a_expr 'ESCAPE' a_expr | 'NOT' 'ILIKE' a_expr | 'NOT' 'ILIKE' a_expr 'ESCAPE' a_expr | 'SIMILAR' 'TO' a_expr | 'SIMILAR' 'TO' a_expr 'ESCAPE' a_expr | 'NOT' 'SIMILAR' 'TO' a_expr | 'NOT' 'SIMILAR' 'TO' a_expr 'ESCAPE' a_expr | '~' a_expr | 'NOT_REGMATCH' a_expr | 'REGIMATCH' a_expr | 'NOT_REGIMATCH' a_expr | 'IS' 'NAN' | 'IS' 'NOT' 'NAN' | 'IS' 'NULL' | 'ISNULL' | 'IS' 'NOT' 'NULL' | 'NOTNULL' | 'IS' 'TRUE' | 'IS' 'NOT' 'TRUE' | 'IS' 'FALSE' | 'IS' 'NOT' 'FALSE' | 'IS' 'UNKNOWN' | 'IS' 'NOT' 'UNKNOWN' | 'IS' 'DISTINCT' 'FROM' a_expr | 'IS' 'NOT' 'DISTINCT' 'FROM' a_expr | 'IS' 'OF' '(' type_list ')' | 'IS' 'NOT' 'OF' '(' type_list ')' | 'BETWEEN' opt_asymmetric b_expr 'AND' a_expr | 'NOT' 'BETWEEN' opt_asymmetric b_expr 'AND' a_expr | 'BETWEEN' 'SYMMETRIC' b_expr 'AND' a_expr | 'NOT' 'BETWEEN' 'SYMMETRIC' b_expr 'AND' a_expr 
| 'IN' in_expr | 'NOT' 'IN' in_expr | subquery_op sub_type a_expr ) )*

for_schedules_clause ::=
'FOR' 'SCHEDULES' select_stmt
Expand Down Expand Up @@ -3139,6 +3140,9 @@ all_op ::=
| 'NOT_REGIMATCH'
| 'AND_AND'
| 'AT_AT'
| 'DISTANCE'
| 'COS_DISTANCE'
| 'NEG_INNER_PRODUCT'
| '~'
| 'SQRT'
| 'CBRT'
Expand Down Expand Up @@ -3397,6 +3401,7 @@ const_typename ::=
| character_with_length
| const_datetime
| const_geo
| const_vector

interval_type ::=
'INTERVAL'
Expand Down Expand Up @@ -4159,6 +4164,7 @@ bare_label_keywords ::=
| 'VARCHAR'
| 'VARIABLES'
| 'VARIADIC'
| 'VECTOR'
| 'VERIFY_BACKUP_TABLE_DATA'
| 'VIEW'
| 'VIEWACTIVITY'
Expand Down Expand Up @@ -4312,6 +4318,10 @@ const_geo ::=
| 'GEOMETRY' '(' geo_shape_type ',' signed_iconst ')'
| 'GEOGRAPHY' '(' geo_shape_type ',' signed_iconst ')'

const_vector ::=
'VECTOR'
| 'VECTOR' '(' iconst32 ')'

interval_qualifier ::=
'YEAR'
| 'MONTH'
Expand Down
19 changes: 19 additions & 0 deletions docs/generated/sql/functions.md
Original file line number Diff line number Diff line change
Expand Up @@ -1279,6 +1279,25 @@ the locality flag on node startup. Returns an error if no region is set.</p>
</span></td><td>Stable</td></tr></tbody>
</table>

### PGVector functions

<table>
<thead><tr><th>Function &rarr; Returns</th><th>Description</th><th>Volatility</th></tr></thead>
<tbody>
<tr><td><a name="cosine_distance"></a><code>cosine_distance(v1: vector, v2: vector) &rarr; <a href="float.html">float</a></code></td><td><span class="funcdesc"><p>Returns the cosine distance between the two vectors.</p>
</span></td><td>Immutable</td></tr>
<tr><td><a name="inner_product"></a><code>inner_product(v1: vector, v2: vector) &rarr; <a href="float.html">float</a></code></td><td><span class="funcdesc"><p>Returns the inner product between the two vectors.</p>
</span></td><td>Immutable</td></tr>
<tr><td><a name="l1_distance"></a><code>l1_distance(v1: vector, v2: vector) &rarr; <a href="float.html">float</a></code></td><td><span class="funcdesc"><p>Returns the Manhattan distance between the two vectors.</p>
</span></td><td>Immutable</td></tr>
<tr><td><a name="l2_distance"></a><code>l2_distance(v1: vector, v2: vector) &rarr; <a href="float.html">float</a></code></td><td><span class="funcdesc"><p>Returns the Euclidean distance between the two vectors.</p>
</span></td><td>Immutable</td></tr>
<tr><td><a name="vector_dims"></a><code>vector_dims(vector: vector) &rarr; <a href="int.html">int</a></code></td><td><span class="funcdesc"><p>Returns the number of dimensions in the vector.</p>
</span></td><td>Immutable</td></tr>
<tr><td><a name="vector_norm"></a><code>vector_norm(vector: vector) &rarr; <a href="float.html">float</a></code></td><td><span class="funcdesc"><p>Returns the Euclidean norm of the vector.</p>
</span></td><td>Immutable</td></tr></tbody>
</table>

### STRING[] functions

<table>
Expand Down
23 changes: 23 additions & 0 deletions docs/generated/sql/operators.md
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,7 @@
<tr><td><a href="interval.html">interval</a> <code>*</code> <a href="decimal.html">decimal</a></td><td><a href="interval.html">interval</a></td></tr>
<tr><td><a href="interval.html">interval</a> <code>*</code> <a href="float.html">float</a></td><td><a href="interval.html">interval</a></td></tr>
<tr><td><a href="interval.html">interval</a> <code>*</code> <a href="int.html">int</a></td><td><a href="interval.html">interval</a></td></tr>
<tr><td>vector <code>*</code> vector</td><td>vector</td></tr>
</tbody></table>
<table><thead>
<tr><td><code>+</code></td><td>Return</td></tr>
Expand Down Expand Up @@ -89,6 +90,7 @@
<tr><td><a href="timestamp.html">timestamptz</a> <code>+</code> <a href="interval.html">interval</a></td><td><a href="timestamp.html">timestamptz</a></td></tr>
<tr><td>timetz <code>+</code> <a href="date.html">date</a></td><td><a href="timestamp.html">timestamptz</a></td></tr>
<tr><td>timetz <code>+</code> <a href="interval.html">interval</a></td><td>timetz</td></tr>
<tr><td>vector <code>+</code> vector</td><td>vector</td></tr>
</tbody></table>
<table><thead>
<tr><td><code>-</code></td><td>Return</td></tr>
Expand Down Expand Up @@ -123,6 +125,7 @@
<tr><td><a href="timestamp.html">timestamptz</a> <code>-</code> <a href="timestamp.html">timestamp</a></td><td><a href="interval.html">interval</a></td></tr>
<tr><td><a href="timestamp.html">timestamptz</a> <code>-</code> <a href="timestamp.html">timestamptz</a></td><td><a href="interval.html">interval</a></td></tr>
<tr><td>timetz <code>-</code> <a href="interval.html">interval</a></td><td>timetz</td></tr>
<tr><td>vector <code>-</code> vector</td><td>vector</td></tr>
</tbody></table>
<table><thead>
<tr><td><code>-></code></td><td>Return</td></tr>
Expand Down Expand Up @@ -213,6 +216,17 @@
<tr><td><a href="uuid.html">uuid</a> <code><</code> <a href="uuid.html">uuid</a></td><td><a href="bool.html">bool</a></td></tr>
<tr><td><a href="uuid.html">uuid[]</a> <code><</code> <a href="uuid.html">uuid[]</a></td><td><a href="bool.html">bool</a></td></tr>
<tr><td>varbit <code><</code> varbit</td><td><a href="bool.html">bool</a></td></tr>
<tr><td>vector <code><</code> vector</td><td><a href="bool.html">bool</a></td></tr>
</tbody></table>
<table><thead>
<tr><td><code><#></code></td><td>Return</td></tr>
</thead><tbody>
<tr><td>vector <code><#></code> vector</td><td><a href="float.html">float</a></td></tr>
</tbody></table>
<table><thead>
<tr><td><code><-></code></td><td>Return</td></tr>
</thead><tbody>
<tr><td>vector <code><-></code> vector</td><td><a href="float.html">float</a></td></tr>
</tbody></table>
<table><thead>
<tr><td><code><<</code></td><td>Return</td></tr>
Expand Down Expand Up @@ -278,6 +292,12 @@
<tr><td><a href="uuid.html">uuid</a> <code><=</code> <a href="uuid.html">uuid</a></td><td><a href="bool.html">bool</a></td></tr>
<tr><td><a href="uuid.html">uuid[]</a> <code><=</code> <a href="uuid.html">uuid[]</a></td><td><a href="bool.html">bool</a></td></tr>
<tr><td>varbit <code><=</code> varbit</td><td><a href="bool.html">bool</a></td></tr>
<tr><td>vector <code><=</code> vector</td><td><a href="bool.html">bool</a></td></tr>
</tbody></table>
<table><thead>
<tr><td><code><=></code></td><td>Return</td></tr>
</thead><tbody>
<tr><td>vector <code><=></code> vector</td><td><a href="float.html">float</a></td></tr>
</tbody></table>
<table><thead>
<tr><td><code><@</code></td><td>Return</td></tr>
Expand Down Expand Up @@ -344,6 +364,7 @@
<tr><td><a href="uuid.html">uuid</a> <code>=</code> <a href="uuid.html">uuid</a></td><td><a href="bool.html">bool</a></td></tr>
<tr><td><a href="uuid.html">uuid[]</a> <code>=</code> <a href="uuid.html">uuid[]</a></td><td><a href="bool.html">bool</a></td></tr>
<tr><td>varbit <code>=</code> varbit</td><td><a href="bool.html">bool</a></td></tr>
<tr><td>vector <code>=</code> vector</td><td><a href="bool.html">bool</a></td></tr>
</tbody></table>
<table><thead>
<tr><td><code>>></code></td><td>Return</td></tr>
Expand Down Expand Up @@ -412,6 +433,7 @@
<tr><td>tuple <code>IN</code> tuple</td><td><a href="bool.html">bool</a></td></tr>
<tr><td><a href="uuid.html">uuid</a> <code>IN</code> tuple</td><td><a href="bool.html">bool</a></td></tr>
<tr><td>varbit <code>IN</code> tuple</td><td><a href="bool.html">bool</a></td></tr>
<tr><td>vector <code>IN</code> tuple</td><td><a href="bool.html">bool</a></td></tr>
</tbody></table>
<table><thead>
<tr><td><code>IS NOT DISTINCT FROM</code></td><td>Return</td></tr>
Expand Down Expand Up @@ -475,6 +497,7 @@
<tr><td><a href="uuid.html">uuid</a> <code>IS NOT DISTINCT FROM</code> <a href="uuid.html">uuid</a></td><td><a href="bool.html">bool</a></td></tr>
<tr><td><a href="uuid.html">uuid[]</a> <code>IS NOT DISTINCT FROM</code> <a href="uuid.html">uuid[]</a></td><td><a href="bool.html">bool</a></td></tr>
<tr><td>varbit <code>IS NOT DISTINCT FROM</code> varbit</td><td><a href="bool.html">bool</a></td></tr>
<tr><td>vector <code>IS NOT DISTINCT FROM</code> vector</td><td><a href="bool.html">bool</a></td></tr>
<tr><td>void <code>IS NOT DISTINCT FROM</code> unknown</td><td><a href="bool.html">bool</a></td></tr>
</tbody></table>
<table><thead>
Expand Down
3 changes: 3 additions & 0 deletions pkg/BUILD.bazel
Original file line number Diff line number Diff line change
Expand Up @@ -749,6 +749,7 @@ ALL_TESTS = [
"//pkg/util/ulid:ulid_test",
"//pkg/util/unique:unique_test",
"//pkg/util/uuid:uuid_test",
"//pkg/util/vector:vector_test",
"//pkg/util/version:version_test",
"//pkg/util:util_test",
"//pkg/workload/bank:bank_test",
Expand Down Expand Up @@ -2583,6 +2584,8 @@ GO_TARGETS = [
"//pkg/util/unique:unique_test",
"//pkg/util/uuid:uuid",
"//pkg/util/uuid:uuid_test",
"//pkg/util/vector:vector",
"//pkg/util/vector:vector_test",
"//pkg/util/version:version",
"//pkg/util/version:version_test",
"//pkg/util:util",
Expand Down
8 changes: 8 additions & 0 deletions pkg/ccl/changefeedccl/avro.go
Original file line number Diff line number Diff line change
Expand Up @@ -328,6 +328,14 @@ func typeToAvroSchema(typ *types.T) (*avroSchemaField, error) {
return tree.NewDFloat(tree.DFloat(x.(float64))), nil
},
)
case types.PGVectorFamily:
setNullable(
avroSchemaString,
func(d tree.Datum, _ interface{}) (interface{}, error) {
return d.(*tree.DPGVector).String(), nil
},
func(x interface{}) (tree.Datum, error) { return tree.ParseDPGVector(x.(string)) },
)
case types.PGLSNFamily:
setNullable(
avroSchemaString,
Expand Down
2 changes: 1 addition & 1 deletion pkg/ccl/changefeedccl/encoder_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -1167,7 +1167,7 @@ func TestJsonRountrip(t *testing.T) {
switch typ {
case types.Jsonb:
// Unsupported by sql/catalog/colinfo
case types.TSQuery, types.TSVector:
case types.TSQuery, types.TSVector, types.PGVector:
// Unsupported by pkg/sql/parser
default:
if arrayTyp.InternalType.ArrayContents == typ {
Expand Down
7 changes: 7 additions & 0 deletions pkg/ccl/logictestccl/tests/3node-tenant/generated_test.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

10 changes: 10 additions & 0 deletions pkg/sql/catalog/colinfo/col_type_info.go
Original file line number Diff line number Diff line change
Expand Up @@ -132,6 +132,14 @@ func ValidateColumnDefType(ctx context.Context, version clusterversion.Handle, t
)
}

case types.PGVectorFamily:
if !version.IsActive(ctx, clusterversion.V24_2) {
return pgerror.Newf(
pgcode.FeatureNotSupported,
"pg_vector not supported until version 24.2",
)
}

case types.RefCursorFamily:
if !version.IsActive(ctx, clusterversion.V23_2) {
return pgerror.Newf(
Expand Down Expand Up @@ -213,6 +221,8 @@ func MustBeValueEncoded(semanticType *types.T) bool {
return true
case types.TSVectorFamily, types.TSQueryFamily:
return true
case types.PGVectorFamily:
return true
}
return false
}
1 change: 1 addition & 0 deletions pkg/sql/catalog/colinfo/column_type_properties.go
Original file line number Diff line number Diff line change
Expand Up @@ -83,6 +83,7 @@ func CanHaveCompositeKeyEncoding(typ *types.T) bool {
types.EnumFamily,
types.Box2DFamily,
types.PGLSNFamily,
types.PGVectorFamily,
types.RefCursorFamily,
types.VoidFamily,
types.EncodedKeyFamily,
Expand Down
1 change: 1 addition & 0 deletions pkg/sql/exec_util.go
Original file line number Diff line number Diff line change
Expand Up @@ -2050,6 +2050,7 @@ func checkResultType(typ *types.T, fmtCode pgwirebase.FormatCode) error {
case types.INetFamily:
case types.OidFamily:
case types.PGLSNFamily:
case types.PGVectorFamily:
case types.RefCursorFamily:
case types.TupleFamily:
case types.EnumFamily:
Expand Down
Loading

0 comments on commit 7661327

Please sign in to comment.