Skip to content

Commit

Permalink
Merge branch '8.x' into fix/analysis-tokenizer-edgeNGram
Browse files Browse the repository at this point in the history
  • Loading branch information
elasticmachine authored Sep 26, 2024
2 parents cedec13 + 774a97f commit 8d35a71
Show file tree
Hide file tree
Showing 384 changed files with 10,586 additions and 2,810 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -25,13 +25,15 @@
import org.elasticsearch.compute.operator.EvalOperator;
import org.elasticsearch.compute.operator.Operator;
import org.elasticsearch.core.TimeValue;
import org.elasticsearch.xpack.esql.core.expression.Expression;
import org.elasticsearch.xpack.esql.core.expression.FieldAttribute;
import org.elasticsearch.xpack.esql.core.expression.Literal;
import org.elasticsearch.xpack.esql.core.expression.predicate.regex.RLikePattern;
import org.elasticsearch.xpack.esql.core.tree.Source;
import org.elasticsearch.xpack.esql.core.type.DataType;
import org.elasticsearch.xpack.esql.core.type.EsField;
import org.elasticsearch.xpack.esql.evaluator.EvalMapper;
import org.elasticsearch.xpack.esql.expression.function.scalar.conditional.Case;
import org.elasticsearch.xpack.esql.expression.function.scalar.date.DateTrunc;
import org.elasticsearch.xpack.esql.expression.function.scalar.math.Abs;
import org.elasticsearch.xpack.esql.expression.function.scalar.multivalue.MvMin;
Expand All @@ -53,6 +55,7 @@

import java.time.Duration;
import java.util.Arrays;
import java.util.List;
import java.util.Map;
import java.util.concurrent.TimeUnit;

Expand Down Expand Up @@ -91,6 +94,8 @@ public class EvalBenchmark {
"abs",
"add",
"add_double",
"case_1_eager",
"case_1_lazy",
"date_trunc",
"equal_to_const",
"long_equal_to_long",
Expand Down Expand Up @@ -125,6 +130,18 @@ private static EvalOperator.ExpressionEvaluator evaluator(String operation) {
layout(doubleField)
).get(driverContext);
}
case "case_1_eager", "case_1_lazy" -> {
FieldAttribute f1 = longField();
FieldAttribute f2 = longField();
Expression condition = new Equals(Source.EMPTY, f1, new Literal(Source.EMPTY, 1L, DataType.LONG));
Expression lhs = f1;
Expression rhs = f2;
if (operation.endsWith("lazy")) {
lhs = new Add(Source.EMPTY, lhs, new Literal(Source.EMPTY, 1L, DataType.LONG));
rhs = new Add(Source.EMPTY, rhs, new Literal(Source.EMPTY, 1L, DataType.LONG));
}
yield EvalMapper.toEvaluator(new Case(Source.EMPTY, condition, List.of(lhs, rhs)), layout(f1, f2)).get(driverContext);
}
case "date_trunc" -> {
FieldAttribute timestamp = new FieldAttribute(
Source.EMPTY,
Expand Down Expand Up @@ -216,6 +233,28 @@ private static void checkExpected(String operation, Page actual) {
}
}
}
case "case_1_eager" -> {
LongVector f1 = actual.<LongBlock>getBlock(0).asVector();
LongVector f2 = actual.<LongBlock>getBlock(1).asVector();
LongVector result = actual.<LongBlock>getBlock(2).asVector();
for (int i = 0; i < BLOCK_LENGTH; i++) {
long expected = f1.getLong(i) == 1 ? f1.getLong(i) : f2.getLong(i);
if (result.getLong(i) != expected) {
throw new AssertionError("[" + operation + "] expected [" + expected + "] but was [" + result.getLong(i) + "]");
}
}
}
case "case_1_lazy" -> {
LongVector f1 = actual.<LongBlock>getBlock(0).asVector();
LongVector f2 = actual.<LongBlock>getBlock(1).asVector();
LongVector result = actual.<LongBlock>getBlock(2).asVector();
for (int i = 0; i < BLOCK_LENGTH; i++) {
long expected = 1 + (f1.getLong(i) == 1 ? f1.getLong(i) : f2.getLong(i));
if (result.getLong(i) != expected) {
throw new AssertionError("[" + operation + "] expected [" + expected + "] but was [" + result.getLong(i) + "]");
}
}
}
case "date_trunc" -> {
LongVector v = actual.<LongBlock>getBlock(1).asVector();
long oneDay = TimeValue.timeValueHours(24).millis();
Expand Down Expand Up @@ -280,6 +319,15 @@ private static Page page(String operation) {
}
yield new Page(builder.build());
}
case "case_1_eager", "case_1_lazy" -> {
var f1 = blockFactory.newLongBlockBuilder(BLOCK_LENGTH);
var f2 = blockFactory.newLongBlockBuilder(BLOCK_LENGTH);
for (int i = 0; i < BLOCK_LENGTH; i++) {
f1.appendLong(i);
f2.appendLong(-i);
}
yield new Page(f1.build(), f2.build());
}
case "long_equal_to_long" -> {
var lhs = blockFactory.newLongBlockBuilder(BLOCK_LENGTH);
var rhs = blockFactory.newLongBlockBuilder(BLOCK_LENGTH);
Expand Down
5 changes: 5 additions & 0 deletions docs/changelog/111684.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
pr: 111684
summary: Write downloaded model parts async
area: Machine Learning
type: enhancement
issues: []
5 changes: 5 additions & 0 deletions docs/changelog/112295.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
pr: 112295
summary: "ESQL: Speed up CASE for some parameters"
area: ES|QL
type: enhancement
issues: []
6 changes: 6 additions & 0 deletions docs/changelog/112405.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
pr: 112405
summary: Improve date expression/remote handling in index names
area: Search
type: bug
issues:
- 112243
6 changes: 6 additions & 0 deletions docs/changelog/112723.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
pr: 112723
summary: Improve DateTime error handling and add some bad date tests
area: Search
type: bug
issues:
- 112190
5 changes: 5 additions & 0 deletions docs/changelog/112768.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
pr: 112768
summary: Deduplicate Kuromoji User Dictionary
area: Search
type: enhancement
issues: []
5 changes: 5 additions & 0 deletions docs/changelog/112895.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
pr: 112895
summary: (logger) change from error to warn for short circuiting user
area: Security
type: enhancement
issues: []
6 changes: 6 additions & 0 deletions docs/changelog/112972.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
pr: 112972
summary: "ILM: Add `total_shards_per_node` setting to searchable snapshot"
area: ILM+SLM
type: enhancement
issues:
- 112261
5 changes: 5 additions & 0 deletions docs/changelog/113013.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
pr: 113013
summary: Account for `DelayedBucket` before reduction
area: Aggregations
type: enhancement
issues: []
5 changes: 5 additions & 0 deletions docs/changelog/113051.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
pr: 113051
summary: Add Search Inference ID To Semantic Text Mapping
area: Mapping
type: enhancement
issues: []
5 changes: 5 additions & 0 deletions docs/changelog/113158.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
pr: 113158
summary: Adds a new Inference API for streaming responses back to the user.
area: Machine Learning
type: enhancement
issues: []
6 changes: 6 additions & 0 deletions docs/changelog/113183.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
pr: 113183
summary: "ESQL: TOP support for strings"
area: ES|QL
type: feature
issues:
- 109849
5 changes: 5 additions & 0 deletions docs/changelog/113276.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
pr: 113276
summary: Adding component template substitutions to the simulate ingest API
area: Ingest Node
type: enhancement
issues: []
6 changes: 6 additions & 0 deletions docs/changelog/113373.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
pr: 113373
summary: Implement `parseBytesRef` for `TimeSeriesRoutingHashFieldType`
area: TSDB
type: bug
issues:
- 112399
5 changes: 5 additions & 0 deletions docs/changelog/113385.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
pr: 113385
summary: Small performance improvement in h3 library
area: Geo
type: enhancement
issues: []
6 changes: 6 additions & 0 deletions docs/changelog/113499.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
pr: 113499
summary: Fix synthetic source for flattened field when used with `ignore_above`
area: Logs
type: bug
issues:
- 112044
4 changes: 2 additions & 2 deletions docs/internal/DistributedArchitectureGuide.md
Original file line number Diff line number Diff line change
Expand Up @@ -252,7 +252,7 @@ changes. The cloud service will add more resources to the cluster based on Elast
Elasticsearch by itself cannot automatically scale.

Autoscaling recommendations are tailored for the user [based on user defined policies][], composed of data
roles (hot, frozen, etc) and [deciders][]. There's a public [webinar on autoscaling][], as well as the
roles (hot, frozen, etc.) and [deciders][]. There's a public [webinar on autoscaling][], as well as the
public [Autoscaling APIs] docs.

Autoscaling's current implementation is based primary on storage requirements, as well as memory capacity
Expand Down Expand Up @@ -332,7 +332,7 @@ problems in the cluster. It uses [an algorithm defined here][]. Some examples ar
[an algorithm defined here]: https://github.com/elastic/elasticsearch/blob/v8.13.2/x-pack/plugin/autoscaling/src/main/java/org/elasticsearch/xpack/autoscaling/storage/ReactiveStorageDeciderService.java#L158-L176

The `ProactiveStorageDeciderService` maintains a forecast window that [defaults to 30 minutes][]. It only
runs on data streams (ILM, rollover, etc), not regular indexes. It looks at past [index changes][] that
runs on data streams (ILM, rollover, etc.), not regular indexes. It looks at past [index changes][] that
took place within the forecast window to [predict][] resources that will be needed shortly.

[defaults to 30 minutes]: https://github.com/elastic/elasticsearch/blob/v8.13.2/x-pack/plugin/autoscaling/src/main/java/org/elasticsearch/xpack/autoscaling/storage/ProactiveStorageDeciderService.java#L32
Expand Down
4 changes: 2 additions & 2 deletions docs/plugins/analysis-icu.asciidoc
Original file line number Diff line number Diff line change
Expand Up @@ -380,7 +380,7 @@ GET /my-index-000001/_search <3>
--------------------------

<1> The `name` field uses the `standard` analyzer, and so support full text queries.
<1> The `name` field uses the `standard` analyzer, and so supports full text queries.
<2> The `name.sort` field is an `icu_collation_keyword` field that will preserve the name as
a single token doc values, and applies the German ``phonebook'' order.
<3> An example query which searches the `name` field and sorts on the `name.sort` field.
Expand Down Expand Up @@ -467,7 +467,7 @@ differences.
`case_first`::

Possible values: `lower` or `upper`. Useful to control which case is sorted
first when case is not ignored for strength `tertiary`. The default depends on
first when the case is not ignored for strength `tertiary`. The default depends on
the collation.

`numeric`::
Expand Down
12 changes: 9 additions & 3 deletions docs/plugins/analysis-kuromoji.asciidoc
Original file line number Diff line number Diff line change
Expand Up @@ -86,7 +86,7 @@ The `kuromoji_iteration_mark` normalizes Japanese horizontal iteration marks

`normalize_kanji`::

Indicates whether kanji iteration marks should be normalize. Defaults to `true`.
Indicates whether kanji iteration marks should be normalized. Defaults to `true`.

`normalize_kana`::

Expand Down Expand Up @@ -133,6 +133,11 @@ unknown words. It can be set to:

Whether punctuation should be discarded from the output. Defaults to `true`.

`lenient`::

Whether duplicate entries in the `user_dictionary` should be deduplicated.
Defaults to `false`, in which case duplicate entries cause an error.

`user_dictionary`::
+
--
Expand Down Expand Up @@ -189,7 +194,7 @@ PUT kuromoji_sample
+
--
Additional expert user parameters `nbest_cost` and `nbest_examples` can be used
to include additional tokens that most likely according to the statistical model.
to include additional tokens that are most likely according to the statistical model.
If both parameters are used, the largest number of both is applied.

`nbest_cost`::
Expand Down Expand Up @@ -221,7 +226,8 @@ PUT kuromoji_sample
"type": "kuromoji_tokenizer",
"mode": "extended",
"discard_punctuation": "false",
"user_dictionary": "userdict_ja.txt"
"user_dictionary": "userdict_ja.txt",
"lenient": "true"
}
},
"analyzer": {
Expand Down
11 changes: 8 additions & 3 deletions docs/plugins/analysis-nori.asciidoc
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,11 @@ It can be set to:

Whether punctuation should be discarded from the output. Defaults to `true`.

`lenient`::

Whether duplicate entries in the `user_dictionary` should be deduplicated.
Defaults to `false`, in which case duplicate entries cause an error.

`user_dictionary`::
+
--
Expand Down Expand Up @@ -104,7 +109,8 @@ PUT nori_sample
"type": "nori_tokenizer",
"decompound_mode": "mixed",
"discard_punctuation": "false",
"user_dictionary": "userdict_ko.txt"
"user_dictionary": "userdict_ko.txt",
"lenient": "true"
}
},
"analyzer": {
Expand Down Expand Up @@ -299,7 +305,6 @@ Which responds with:
}
--------------------------------------------------


[[analysis-nori-speech]]
==== `nori_part_of_speech` token filter

Expand Down Expand Up @@ -447,7 +452,7 @@ Which responds with:
The `nori_number` token filter normalizes Korean numbers
to regular Arabic decimal numbers in half-width characters.

Korean numbers are often written using a combination of Hangul and Arabic numbers with various kinds punctuation.
Korean numbers are often written using a combination of Hangul and Arabic numbers with various kinds of punctuation.
For example, 3.2천 means 3200.
This filter does this kind of normalization and allows a search for 3200 to match 3.2천 in text,
but can also be used to make range facets based on the normalized numbers and so on.
Expand Down
2 changes: 1 addition & 1 deletion docs/plugins/development/creating-classic-plugins.asciidoc
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ will refuse to start in the presence of plugins with the incorrect
[discrete]
==== Classic plugin file structure

Classis plugins are ZIP files composed of JAR files and
Classic plugins are ZIP files composed of JAR files and
<<plugin-descriptor-file-{plugin-type},a metadata file called
`plugin-descriptor.properties`>>, a Java properties file that describes the
plugin.
Expand Down
Loading

0 comments on commit 8d35a71

Please sign in to comment.