Skip to content

Commit

Permalink
Merge branch '8.x' into fix/analysis-tokenizer-edgeNGram
Browse files Browse the repository at this point in the history
  • Loading branch information
elasticmachine authored Sep 26, 2024
2 parents cedec13 + 774a97f commit 8d35a71
Show file tree
Hide file tree
Showing 384 changed files with 10,586 additions and 2,810 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -25,13 +25,15 @@
import org.elasticsearch.compute.operator.EvalOperator;
import org.elasticsearch.compute.operator.Operator;
import org.elasticsearch.core.TimeValue;
import org.elasticsearch.xpack.esql.core.expression.Expression;
import org.elasticsearch.xpack.esql.core.expression.FieldAttribute;
import org.elasticsearch.xpack.esql.core.expression.Literal;
import org.elasticsearch.xpack.esql.core.expression.predicate.regex.RLikePattern;
import org.elasticsearch.xpack.esql.core.tree.Source;
import org.elasticsearch.xpack.esql.core.type.DataType;
import org.elasticsearch.xpack.esql.core.type.EsField;
import org.elasticsearch.xpack.esql.evaluator.EvalMapper;
import org.elasticsearch.xpack.esql.expression.function.scalar.conditional.Case;
import org.elasticsearch.xpack.esql.expression.function.scalar.date.DateTrunc;
import org.elasticsearch.xpack.esql.expression.function.scalar.math.Abs;
import org.elasticsearch.xpack.esql.expression.function.scalar.multivalue.MvMin;
Expand All @@ -53,6 +55,7 @@

import java.time.Duration;
import java.util.Arrays;
import java.util.List;
import java.util.Map;
import java.util.concurrent.TimeUnit;

Expand Down Expand Up @@ -91,6 +94,8 @@ public class EvalBenchmark {
"abs",
"add",
"add_double",
"case_1_eager",
"case_1_lazy",
"date_trunc",
"equal_to_const",
"long_equal_to_long",
Expand Down Expand Up @@ -125,6 +130,18 @@ private static EvalOperator.ExpressionEvaluator evaluator(String operation) {
layout(doubleField)
).get(driverContext);
}
case "case_1_eager", "case_1_lazy" -> {
FieldAttribute f1 = longField();
FieldAttribute f2 = longField();
Expression condition = new Equals(Source.EMPTY, f1, new Literal(Source.EMPTY, 1L, DataType.LONG));
Expression lhs = f1;
Expression rhs = f2;
if (operation.endsWith("lazy")) {
lhs = new Add(Source.EMPTY, lhs, new Literal(Source.EMPTY, 1L, DataType.LONG));
rhs = new Add(Source.EMPTY, rhs, new Literal(Source.EMPTY, 1L, DataType.LONG));
}
yield EvalMapper.toEvaluator(new Case(Source.EMPTY, condition, List.of(lhs, rhs)), layout(f1, f2)).get(driverContext);
}
case "date_trunc" -> {
FieldAttribute timestamp = new FieldAttribute(
Source.EMPTY,
Expand Down Expand Up @@ -216,6 +233,28 @@ private static void checkExpected(String operation, Page actual) {
}
}
}
case "case_1_eager" -> {
LongVector f1 = actual.<LongBlock>getBlock(0).asVector();
LongVector f2 = actual.<LongBlock>getBlock(1).asVector();
LongVector result = actual.<LongBlock>getBlock(2).asVector();
for (int i = 0; i < BLOCK_LENGTH; i++) {
long expected = f1.getLong(i) == 1 ? f1.getLong(i) : f2.getLong(i);
if (result.getLong(i) != expected) {
throw new AssertionError("[" + operation + "] expected [" + expected + "] but was [" + result.getLong(i) + "]");
}
}
}
case "case_1_lazy" -> {
LongVector f1 = actual.<LongBlock>getBlock(0).asVector();
LongVector f2 = actual.<LongBlock>getBlock(1).asVector();
LongVector result = actual.<LongBlock>getBlock(2).asVector();
for (int i = 0; i < BLOCK_LENGTH; i++) {
long expected = 1 + (f1.getLong(i) == 1 ? f1.getLong(i) : f2.getLong(i));
if (result.getLong(i) != expected) {
throw new AssertionError("[" + operation + "] expected [" + expected + "] but was [" + result.getLong(i) + "]");
}
}
}
case "date_trunc" -> {
LongVector v = actual.<LongBlock>getBlock(1).asVector();
long oneDay = TimeValue.timeValueHours(24).millis();
Expand Down Expand Up @@ -280,6 +319,15 @@ private static Page page(String operation) {
}
yield new Page(builder.build());
}
case "case_1_eager", "case_1_lazy" -> {
var f1 = blockFactory.newLongBlockBuilder(BLOCK_LENGTH);
var f2 = blockFactory.newLongBlockBuilder(BLOCK_LENGTH);
for (int i = 0; i < BLOCK_LENGTH; i++) {
f1.appendLong(i);
f2.appendLong(-i);
}
yield new Page(f1.build(), f2.build());
}
case "long_equal_to_long" -> {
var lhs = blockFactory.newLongBlockBuilder(BLOCK_LENGTH);
var rhs = blockFactory.newLongBlockBuilder(BLOCK_LENGTH);
Expand Down
5 changes: 5 additions & 0 deletions docs/changelog/111684.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
pr: 111684
summary: Write downloaded model parts async
area: Machine Learning
type: enhancement
issues: []
5 changes: 5 additions & 0 deletions docs/changelog/112295.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
pr: 112295
summary: "ESQL: Speed up CASE for some parameters"
area: ES|QL
type: enhancement
issues: []
6 changes: 6 additions & 0 deletions docs/changelog/112405.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
pr: 112405
summary: Improve date expression/remote handling in index names
area: Search
type: bug
issues:
- 112243
6 changes: 6 additions & 0 deletions docs/changelog/112723.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
pr: 112723
summary: Improve DateTime error handling and add some bad date tests
area: Search
type: bug
issues:
- 112190
5 changes: 5 additions & 0 deletions docs/changelog/112768.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
pr: 112768
summary: Deduplicate Kuromoji User Dictionary
area: Search
type: enhancement
issues: []
5 changes: 5 additions & 0 deletions docs/changelog/112895.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
pr: 112895
summary: (logger) change from error to warn for short circuiting user
area: Security
type: enhancement
issues: []
6 changes: 6 additions & 0 deletions docs/changelog/112972.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
pr: 112972
summary: "ILM: Add `total_shards_per_node` setting to searchable snapshot"
area: ILM+SLM
type: enhancement
issues:
- 112261
5 changes: 5 additions & 0 deletions docs/changelog/113013.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
pr: 113013
summary: Account for `DelayedBucket` before reduction
area: Aggregations
type: enhancement
issues: []
5 changes: 5 additions & 0 deletions docs/changelog/113051.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
pr: 113051
summary: Add Search Inference ID To Semantic Text Mapping
area: Mapping
type: enhancement
issues: []
5 changes: 5 additions & 0 deletions docs/changelog/113158.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
pr: 113158
summary: Adds a new Inference API for streaming responses back to the user.
area: Machine Learning
type: enhancement
issues: []
6 changes: 6 additions & 0 deletions docs/changelog/113183.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
pr: 113183
summary: "ESQL: TOP support for strings"
area: ES|QL
type: feature
issues:
- 109849
5 changes: 5 additions & 0 deletions docs/changelog/113276.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
pr: 113276
summary: Adding component template substitutions to the simulate ingest API
area: Ingest Node
type: enhancement
issues: []
6 changes: 6 additions & 0 deletions docs/changelog/113373.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
pr: 113373
summary: Implement `parseBytesRef` for `TimeSeriesRoutingHashFieldType`
area: TSDB
type: bug
issues:
- 112399
5 changes: 5 additions & 0 deletions docs/changelog/113385.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
pr: 113385
summary: Small performance improvement in h3 library
area: Geo
type: enhancement
issues: []
6 changes: 6 additions & 0 deletions docs/changelog/113499.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
pr: 113499
summary: Fix synthetic source for flattened field when used with `ignore_above`
area: Logs
type: bug
issues:
- 112044
4 changes: 2 additions & 2 deletions docs/internal/DistributedArchitectureGuide.md
Original file line number Diff line number Diff line change
Expand Up @@ -252,7 +252,7 @@ changes. The cloud service will add more resources to the cluster based on Elast
Elasticsearch by itself cannot automatically scale.

Autoscaling recommendations are tailored for the user [based on user defined policies][], composed of data
roles (hot, frozen, etc) and [deciders][]. There's a public [webinar on autoscaling][], as well as the
roles (hot, frozen, etc.) and [deciders][]. There's a public [webinar on autoscaling][], as well as the
public [Autoscaling APIs] docs.

Autoscaling's current implementation is based primary on storage requirements, as well as memory capacity
Expand Down Expand Up @@ -332,7 +332,7 @@ problems in the cluster. It uses [an algorithm defined here][]. Some examples ar
[an algorithm defined here]: https://github.com/elastic/elasticsearch/blob/v8.13.2/x-pack/plugin/autoscaling/src/main/java/org/elasticsearch/xpack/autoscaling/storage/ReactiveStorageDeciderService.java#L158-L176

The `ProactiveStorageDeciderService` maintains a forecast window that [defaults to 30 minutes][]. It only
runs on data streams (ILM, rollover, etc), not regular indexes. It looks at past [index changes][] that
runs on data streams (ILM, rollover, etc.), not regular indexes. It looks at past [index changes][] that
took place within the forecast window to [predict][] resources that will be needed shortly.

[defaults to 30 minutes]: https://github.com/elastic/elasticsearch/blob/v8.13.2/x-pack/plugin/autoscaling/src/main/java/org/elasticsearch/xpack/autoscaling/storage/ProactiveStorageDeciderService.java#L32
Expand Down
4 changes: 2 additions & 2 deletions docs/plugins/analysis-icu.asciidoc
Original file line number Diff line number Diff line change
Expand Up @@ -380,7 +380,7 @@ GET /my-index-000001/_search <3>
--------------------------

<1> The `name` field uses the `standard` analyzer, and so support full text queries.
<1> The `name` field uses the `standard` analyzer, and so supports full text queries.
<2> The `name.sort` field is an `icu_collation_keyword` field that will preserve the name as
a single token doc values, and applies the German ``phonebook'' order.
<3> An example query which searches the `name` field and sorts on the `name.sort` field.
Expand Down Expand Up @@ -467,7 +467,7 @@ differences.
`case_first`::

Possible values: `lower` or `upper`. Useful to control which case is sorted
first when case is not ignored for strength `tertiary`. The default depends on
first when the case is not ignored for strength `tertiary`. The default depends on
the collation.

`numeric`::
Expand Down
12 changes: 9 additions & 3 deletions docs/plugins/analysis-kuromoji.asciidoc
Original file line number Diff line number Diff line change
Expand Up @@ -86,7 +86,7 @@ The `kuromoji_iteration_mark` normalizes Japanese horizontal iteration marks

`normalize_kanji`::

Indicates whether kanji iteration marks should be normalize. Defaults to `true`.
Indicates whether kanji iteration marks should be normalized. Defaults to `true`.

`normalize_kana`::

Expand Down Expand Up @@ -133,6 +133,11 @@ unknown words. It can be set to:

Whether punctuation should be discarded from the output. Defaults to `true`.

`lenient`::

Whether duplicate entries in the `user_dictionary` should be deduplicated.
Defaults to `false`, in which case duplicate entries cause an error.

`user_dictionary`::
+
--
Expand Down Expand Up @@ -189,7 +194,7 @@ PUT kuromoji_sample
+
--
Additional expert user parameters `nbest_cost` and `nbest_examples` can be used
to include additional tokens that most likely according to the statistical model.
to include additional tokens that are most likely according to the statistical model.
If both parameters are used, the largest number of both is applied.

`nbest_cost`::
Expand Down Expand Up @@ -221,7 +226,8 @@ PUT kuromoji_sample
"type": "kuromoji_tokenizer",
"mode": "extended",
"discard_punctuation": "false",
"user_dictionary": "userdict_ja.txt"
"user_dictionary": "userdict_ja.txt",
"lenient": "true"
}
},
"analyzer": {
Expand Down
11 changes: 8 additions & 3 deletions docs/plugins/analysis-nori.asciidoc
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,11 @@ It can be set to:

Whether punctuation should be discarded from the output. Defaults to `true`.

`lenient`::

Whether duplicate entries in the `user_dictionary` should be deduplicated.
Defaults to `false`, in which case duplicate entries cause an error.

`user_dictionary`::
+
--
Expand Down Expand Up @@ -104,7 +109,8 @@ PUT nori_sample
"type": "nori_tokenizer",
"decompound_mode": "mixed",
"discard_punctuation": "false",
"user_dictionary": "userdict_ko.txt"
"user_dictionary": "userdict_ko.txt",
"lenient": "true"
}
},
"analyzer": {
Expand Down Expand Up @@ -299,7 +305,6 @@ Which responds with:
}
--------------------------------------------------


[[analysis-nori-speech]]
==== `nori_part_of_speech` token filter

Expand Down Expand Up @@ -447,7 +452,7 @@ Which responds with:
The `nori_number` token filter normalizes Korean numbers
to regular Arabic decimal numbers in half-width characters.

Korean numbers are often written using a combination of Hangul and Arabic numbers with various kinds punctuation.
Korean numbers are often written using a combination of Hangul and Arabic numbers with various kinds of punctuation.
For example, 3.2천 means 3200.
This filter does this kind of normalization and allows a search for 3200 to match 3.2천 in text,
but can also be used to make range facets based on the normalized numbers and so on.
Expand Down
2 changes: 1 addition & 1 deletion docs/plugins/development/creating-classic-plugins.asciidoc
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ will refuse to start in the presence of plugins with the incorrect
[discrete]
==== Classic plugin file structure

Classis plugins are ZIP files composed of JAR files and
Classic plugins are ZIP files composed of JAR files and
<<plugin-descriptor-file-{plugin-type},a metadata file called
`plugin-descriptor.properties`>>, a Java properties file that describes the
plugin.
Expand Down
Loading

0 comments on commit 8d35a71

Please sign in to comment.