diff --git a/docs/reference/aggregations/bucket/significantterms-aggregation.asciidoc b/docs/reference/aggregations/bucket/significantterms-aggregation.asciidoc index b8e85479d0d83..9615e14d4039c 100644 --- a/docs/reference/aggregations/bucket/significantterms-aggregation.asciidoc +++ b/docs/reference/aggregations/bucket/significantterms-aggregation.asciidoc @@ -374,7 +374,7 @@ Chi square behaves like mutual information and can be configured with the same p ===== Google normalized distance -Google normalized distance as described in "The Google Similarity Distance", Cilibrasi and Vitanyi, 2007 (https://arxiv.org/pdf/cs/0412098v3.pdf) can be used as significance score by adding the parameter +Google normalized distance as described in https://arxiv.org/pdf/cs/0412098v3.pdf["The Google Similarity Distance", Cilibrasi and Vitanyi, 2007] can be used as significance score by adding the parameter [source,js] -------------------------------------------------- @@ -408,7 +408,7 @@ Multiple observations are typically required to reinforce a view so it is recomm Roughly, `mutual_information` prefers high frequent terms even if they occur also frequently in the background. For example, in an analysis of natural language text this might lead to selection of stop words. `mutual_information` is unlikely to select very rare terms like misspellings. `gnd` prefers terms with a high co-occurrence and avoids selection of stopwords. It might be better suited for synonym detection. However, `gnd` has a tendency to select very rare terms that are, for example, a result of misspelling. `chi_square` and `jlh` are somewhat in-between. 
-It is hard to say which one of the different heuristics will be the best choice as it depends on what the significant terms are used for (see for example [Yang and Pedersen, "A Comparative Study on Feature Selection in Text Categorization", 1997](http://courses.ischool.berkeley.edu/i256/f06/papers/yang97comparative.pdf) for a study on using significant terms for feature selection for text classification). +It is hard to say which one of the different heuristics will be the best choice as it depends on what the significant terms are used for (see for example http://courses.ischool.berkeley.edu/i256/f06/papers/yang97comparative.pdf[Yang and Pedersen, "A Comparative Study on Feature Selection in Text Categorization", 1997] for a study on using significant terms for feature selection for text classification). If none of the above measures suits your usecase than another option is to implement a custom significance measure: diff --git a/rest-api-spec/src/yamlRestTest/resources/rest-api-spec/test/search.aggregation/90_sig_text.yml b/rest-api-spec/src/yamlRestTest/resources/rest-api-spec/test/search.aggregation/90_sig_text.yml deleted file mode 100644 index 673d19e04cf22..0000000000000 --- a/rest-api-spec/src/yamlRestTest/resources/rest-api-spec/test/search.aggregation/90_sig_text.yml +++ /dev/null @@ -1,151 +0,0 @@ ---- -"Default index": - - - do: - indices.create: - index: goodbad - body: - settings: - number_of_shards: "1" - mappings: - properties: - text: - type: text - fielddata: false - class: - type: keyword - - - do: - index: - index: goodbad - id: 1 - body: { text: "good", class: "good" } - - do: - index: - index: goodbad - id: 2 - body: { text: "good", class: "good" } - - do: - index: - index: goodbad - id: 3 - body: { text: "bad", class: "bad" } - - do: - index: - index: goodbad - id: 4 - body: { text: "bad", class: "bad" } - - do: - index: - index: goodbad - id: 5 - body: { text: "good bad", class: "good" } - - do: - index: - index: goodbad - id: 6 - body: { 
text: "good bad", class: "bad" } - - do: - index: - index: goodbad - id: 7 - body: { text: "bad", class: "bad" } - - - - - do: - indices.refresh: - index: [goodbad] - - - do: - search: - rest_total_hits_as_int: true - index: goodbad - - - match: {hits.total: 7} - - - do: - search: - rest_total_hits_as_int: true - index: goodbad - body: {"aggs": {"class": {"terms": {"field": "class"},"aggs": {"sig_text": {"significant_text": {"field": "text"}}}}}} - - - match: {aggregations.class.buckets.0.sig_text.buckets.0.key: "bad"} - - match: {aggregations.class.buckets.1.sig_text.buckets.0.key: "good"} - ---- -"Dedup noise": - - - do: - indices.create: - index: goodbad - body: - settings: - number_of_shards: "1" - mappings: - properties: - text: - type: text - fielddata: false - class: - type: keyword - - - do: - index: - index: goodbad - id: 1 - body: { text: "good noisewords1 g1 g2 g3 g4 g5 g6", class: "good" } - - do: - index: - index: goodbad - id: 2 - body: { text: "good noisewords2 g1 g2 g3 g4 g5 g6", class: "good" } - - do: - index: - index: goodbad - id: 3 - body: { text: "bad noisewords3 b1 b2 b3 b4 b5 b6", class: "bad" } - - do: - index: - index: goodbad - id: 4 - body: { text: "bad noisewords4 b1 b2 b3 b4 b5 b6", class: "bad" } - - do: - index: - index: goodbad - id: 5 - body: { text: "good bad noisewords5 gb1 gb2 gb3 gb4 gb5 gb6", class: "good" } - - do: - index: - index: goodbad - id: 6 - body: { text: "good bad noisewords6 gb1 gb2 gb3 gb4 gb5 gb6", class: "bad" } - - do: - index: - index: goodbad - id: 7 - body: { text: "bad noisewords7 b1 b2 b3 b4 b5 b6", class: "bad" } - - - - - do: - indices.refresh: - index: [goodbad] - - - do: - search: - rest_total_hits_as_int: true - index: goodbad - - - match: {hits.total: 7} - - - do: - search: - rest_total_hits_as_int: true - index: goodbad - body: {"aggs": {"class": {"terms": {"field": "class"},"aggs": {"sig_text": {"significant_text": {"field": "text", "filter_duplicate_text": true}}}}}} - - - match: 
{aggregations.class.buckets.0.sig_text.buckets.0.key: "bad"} - - length: { aggregations.class.buckets.0.sig_text.buckets: 1 } - - match: {aggregations.class.buckets.1.sig_text.buckets.0.key: "good"} - - length: { aggregations.class.buckets.1.sig_text.buckets: 1 } diff --git a/rest-api-spec/src/yamlRestTest/resources/rest-api-spec/test/search.aggregation/90_significant_text.yml b/rest-api-spec/src/yamlRestTest/resources/rest-api-spec/test/search.aggregation/90_significant_text.yml new file mode 100644 index 0000000000000..074d70d4f6369 --- /dev/null +++ b/rest-api-spec/src/yamlRestTest/resources/rest-api-spec/test/search.aggregation/90_significant_text.yml @@ -0,0 +1,537 @@ +setup: + - do: + indices.create: + index: goodbad + body: + settings: + number_of_shards: "1" + mappings: + properties: + text: + type: text + fielddata: false + class: + type: keyword + + - do: + bulk: + index: goodbad + refresh: true + body: + - '{ "index": {"_id": "1"} }' + - '{ "text": "good", "class": "good" }' + - '{ "index": {"_id": "2"} }' + - '{ "text": "good", "class": "good" }' + - '{ "index": {"_id": "3"} }' + - '{ "text": "bad", "class": "bad" }' + - '{ "index": {"_id": "4"} }' + - '{ "text": "bad", "class": "bad" }' + - '{ "index": {"_id": "5"} }' + - '{ "text": "good bad", "class": "good" }' + - '{ "index": {"_id": "6"} }' + - '{ "text": "good bad", "class": "bad" }' + - '{ "index": {"_id": "7"} }' + - '{ "text": "bad", "class": "bad" }' + +--- +simple: + - do: + search: + index: goodbad + rest_total_hits_as_int: true + body: + aggs: + class: + terms: + field: class + aggs: + sig_text: + significant_text: + field: text + - match: {hits.total: 7} + - length: {aggregations.class.buckets: 2} + - match: {aggregations.class.buckets.0.key: bad} + - length: { aggregations.class.buckets.1.sig_text.buckets: 1 } + - match: {aggregations.class.buckets.0.sig_text.buckets.0.key: bad} + - match: {aggregations.class.buckets.1.key: good} + - length: { 
aggregations.class.buckets.1.sig_text.buckets: 1 } + - match: {aggregations.class.buckets.1.sig_text.buckets.0.key: good} + +--- +"Dedup noise": + - do: + indices.create: + index: noisy + body: + settings: + number_of_shards: "1" + mappings: + properties: + text: + type: text + fielddata: false + class: + type: keyword + + - do: + bulk: + index: noisy + refresh: true + body: + - '{ "index": {"_id": "1"} }' + - '{ "text": "good noisewords1 g1 g2 g3 g4 g5 g6", "class": "good" }' + - '{ "index": {"_id": "2"} }' + - '{ "text": "good noisewords2 g1 g2 g3 g4 g5 g6", "class": "good" }' + - '{ "index": {"_id": "3"} }' + - '{ "text": "bad noisewords3 b1 b2 b3 b4 b5 b6", "class": "bad" }' + - '{ "index": {"_id": "4"} }' + - '{ "text": "bad noisewords4 b1 b2 b3 b4 b5 b6", "class": "bad" }' + - '{ "index": {"_id": "5"} }' + - '{ "text": "good bad noisewords5 gb1 gb2 gb3 gb4 gb5 gb6", "class": "good" }' + - '{ "index": {"_id": "6"} }' + - '{ "text": "good bad noisewords6 gb1 gb2 gb3 gb4 gb5 gb6", "class": "bad" }' + - '{ "index": {"_id": "7"} }' + - '{ "text": "bad noisewords7 b1 b2 b3 b4 b5 b6", "class": "bad" }' + + - do: + search: + rest_total_hits_as_int: true + index: noisy + body: + size: 0 + aggs: + class: + terms: + field: class + aggs: + sig_text: + significant_text: + field: text + filter_duplicate_text: true + - match: {hits.total: 7} + - length: {aggregations.class.buckets: 2} + - match: {aggregations.class.buckets.0.key: bad} + - length: { aggregations.class.buckets.0.sig_text.buckets: 1 } + - match: {aggregations.class.buckets.0.sig_text.buckets.0.key: bad} + - match: {aggregations.class.buckets.1.key: good} + - length: { aggregations.class.buckets.1.sig_text.buckets: 1 } + - match: {aggregations.class.buckets.1.sig_text.buckets.0.key: good} + +--- +profile: + - skip: + version: " - 7.99.99" + reason: extra profiling added in 8.0.0 to be backported to 7.14.0 + + - do: + search: + index: goodbad + rest_total_hits_as_int: true + body: + size: 0 + profile: true + 
aggs: + class: + terms: + field: class + aggs: + sig_text: + significant_text: + field: text + - match: {hits.total: 7} + - length: {aggregations.class.buckets: 2} + - match: {aggregations.class.buckets.0.key: bad} + - length: { aggregations.class.buckets.0.sig_text.buckets: 1 } + - match: {aggregations.class.buckets.0.sig_text.buckets.0.key: bad} + - match: {aggregations.class.buckets.1.key: good} + - length: { aggregations.class.buckets.1.sig_text.buckets: 1 } + - match: {aggregations.class.buckets.1.sig_text.buckets.0.key: good} + - match: { profile.shards.0.aggregations.0.description: class } + - match: { profile.shards.0.aggregations.0.children.0.type: MapStringTermsAggregator } + - match: { profile.shards.0.aggregations.0.children.0.description: sig_text } + - match: { profile.shards.0.aggregations.0.children.0.debug.collection_strategy: analyze text from _source } + - match: { profile.shards.0.aggregations.0.children.0.debug.result_strategy: significant_terms } + - match: { profile.shards.0.aggregations.0.children.0.debug.total_buckets: 4 } + - match: { profile.shards.0.aggregations.0.children.0.debug.values_fetched: 7 } + - match: { profile.shards.0.aggregations.0.children.0.debug.chars_fetched: 33 } + - gt: { profile.shards.0.aggregations.0.children.0.debug.extract_ns: 0 } + - match: { profile.shards.0.aggregations.0.children.0.debug.extract_count: 7 } + - gt: { profile.shards.0.aggregations.0.children.0.debug.collect_analyzed_ns: 0 } + - match: { profile.shards.0.aggregations.0.children.0.debug.collect_analyzed_count: 9 } + +--- +include: + - do: + search: + index: goodbad + rest_total_hits_as_int: true + body: + size: 0 + aggs: + class: + terms: + field: class + aggs: + sig_text: + significant_text: + field: text + include: bad + - match: {hits.total: 7} + - length: {aggregations.class.buckets: 2} + - match: {aggregations.class.buckets.0.key: bad} + - length: { aggregations.class.buckets.0.sig_text.buckets: 1 } + - match: 
{aggregations.class.buckets.0.sig_text.buckets.0.key: bad} + - match: {aggregations.class.buckets.1.key: good} + - length: { aggregations.class.buckets.1.sig_text.buckets: 0 } + +--- +exclude: + - do: + search: + index: goodbad + rest_total_hits_as_int: true + body: + size: 0 + aggs: + class: + terms: + field: class + aggs: + sig_text: + significant_text: + field: text + exclude: good + - match: {hits.total: 7} + - length: {aggregations.class.buckets: 2} + - match: {aggregations.class.buckets.0.key: bad} + - length: { aggregations.class.buckets.0.sig_text.buckets: 1 } + - match: {aggregations.class.buckets.0.sig_text.buckets.0.key: bad} + - match: {aggregations.class.buckets.1.key: good} + - length: { aggregations.class.buckets.1.sig_text.buckets: 0 } + +--- +min_doc_count: + - do: + search: + index: goodbad + rest_total_hits_as_int: true + body: + size: 0 + aggs: + class: + terms: + field: class + aggs: + sig_text: + significant_text: + field: text + min_doc_count: 4 + - match: {hits.total: 7} + - length: {aggregations.class.buckets: 2} + - match: {aggregations.class.buckets.0.key: bad} + - length: { aggregations.class.buckets.0.sig_text.buckets: 1 } + - match: {aggregations.class.buckets.0.sig_text.buckets.0.key: bad} + - match: {aggregations.class.buckets.1.key: good} + - length: { aggregations.class.buckets.1.sig_text.buckets: 0 } + +--- +size: + - do: + bulk: + index: goodbad + refresh: true + body: + - '{ "index": {"_id": "101"} }' + - '{ "text": "caterpillar eat snacks", "class": "good" }' + - '{ "index": {"_id": "102"} }' + - '{ "text": "caterpillar sick", "class": "good" }' + - '{ "index": {"_id": "103"} }' + - '{ "text": "caterpillar eat leaf", "class": "good" }' + - '{ "index": {"_id": "104"} }' + - '{ "text": "caterpillar build cocoon", "class": "good" }' + - '{ "index": {"_id": "105"} }' + - '{ "text": "caterpillar fly away", "class": "good" }' + + - do: + search: + index: goodbad + rest_total_hits_as_int: true + body: + size: 0 + aggs: + class: + 
terms: + field: class + aggs: + sig_text: + significant_text: + field: text + - match: {hits.total: 12} + - length: {aggregations.class.buckets: 2} + - match: {aggregations.class.buckets.0.key: good} + - length: { aggregations.class.buckets.0.sig_text.buckets: 2 } + - match: {aggregations.class.buckets.0.sig_text.buckets.0.key: caterpillar} + - match: {aggregations.class.buckets.0.sig_text.buckets.1.key: good} + - match: {aggregations.class.buckets.1.key: bad} + - length: { aggregations.class.buckets.1.sig_text.buckets: 1 } + - match: {aggregations.class.buckets.1.sig_text.buckets.0.key: bad} + + - do: + search: + index: goodbad + rest_total_hits_as_int: true + body: + size: 0 + aggs: + class: + terms: + field: class + aggs: + sig_text: + significant_text: + field: text + size: 1 + - match: {hits.total: 12} + - length: {aggregations.class.buckets: 2} + - match: {aggregations.class.buckets.0.key: good} + - length: { aggregations.class.buckets.0.sig_text.buckets: 1 } + - match: {aggregations.class.buckets.0.sig_text.buckets.0.key: caterpillar} + - match: {aggregations.class.buckets.1.key: bad} + - length: { aggregations.class.buckets.1.sig_text.buckets: 1 } + - match: {aggregations.class.buckets.1.sig_text.buckets.0.key: bad} + +--- +shard_size: + # We can't perform a great test for shard_size without lots of control over + # routing here and it isn't worth being that brittle. So we'll just test that + # we parse it.
+ - do: + search: + index: goodbad + rest_total_hits_as_int: true + body: + aggs: + class: + terms: + field: class + aggs: + sig_text: + significant_text: + field: text + size: 1 + shard_size: 1 + - match: {hits.total: 7} + - length: {aggregations.class.buckets: 2} + - match: {aggregations.class.buckets.0.key: bad} + - length: { aggregations.class.buckets.1.sig_text.buckets: 1 } + - match: {aggregations.class.buckets.0.sig_text.buckets.0.key: bad} + - match: {aggregations.class.buckets.1.key: good} + - length: { aggregations.class.buckets.1.sig_text.buckets: 1 } + - match: {aggregations.class.buckets.1.sig_text.buckets.0.key: good} + +--- +significance_heuristics: + - do: + bulk: + index: goodbad + refresh: true + body: + - '{ "index": {"_id": "101"} }' + - '{ "text": "caterpillar eat snacks", "class": "good" }' + - '{ "index": {"_id": "102"} }' + - '{ "text": "caterpillar sick", "class": "good" }' + - '{ "index": {"_id": "103"} }' + - '{ "text": "caterpillar eat leaf", "class": "good" }' + - '{ "index": {"_id": "104"} }' + - '{ "text": "caterpillar build cocoon", "class": "good" }' + - '{ "index": {"_id": "105"} }' + - '{ "text": "caterpillar fly away", "class": "good" }' + - '{ "index": {"_id": "107"} }' + - '{ "text": "caterpillar bad", "class": "bad" }' + - '{ "index": {"_id": "108"} }' + - '{ "text": "caterpillar very bad", "class": "bad" }' + - '{ "index": {"_id": "110"} }' + - '{ "text": "caterpillar will eat you", "class": "bad" }' + - '{ "index": {"_id": "110"} }' + - '{ "text": "caterpillar is the enemy", "class": "bad" }' + - '{ "index": {"_id": "113"} }' + - '{ "text": "good", "class": "good" }' + - '{ "index": {"_id": "114"} }' + - '{ "text": "good", "class": "good" }' + + - do: + search: + index: goodbad + rest_total_hits_as_int: true + body: + query: + match: + class: good + size: 0 + aggs: + sig_text: + significant_text: + field: text + gnd: {} + - match: {hits.total: 10} + - length: {aggregations.sig_text.buckets: 2} + - match: 
{aggregations.sig_text.buckets.0.key: good} + - match: {aggregations.sig_text.buckets.1.key: caterpillar} + + # mutual_information doesn't think `caterpillar` is significant because + # it shows up so much in the background set. + - do: + search: + index: goodbad + rest_total_hits_as_int: true + body: + query: + match: + class: good + size: 0 + aggs: + sig_text: + significant_text: + field: text + mutual_information: {} + - match: {hits.total: 10} + - length: {aggregations.sig_text.buckets: 1} + - match: {aggregations.sig_text.buckets.0.key: good} + +--- +background_filter: + - do: + indices.create: + index: goodbadugly + body: + settings: + number_of_shards: "1" + mappings: + properties: + text: + type: text + fielddata: false + class: + type: keyword + ugly: + type: boolean + + - do: + bulk: + index: goodbadugly + refresh: true + body: + - '{ "index": {"_id": "1"} }' + - '{ "text": "good", "class": "good", "ugly": true }' + - '{ "index": {"_id": "2"} }' + - '{ "text": "good", "class": "good", "ugly": true }' + - '{ "index": {"_id": "3"} }' + - '{ "text": "bad", "class": "bad" }' + - '{ "index": {"_id": "4"} }' + - '{ "text": "bad", "class": "bad" }' + - '{ "index": {"_id": "5"} }' + - '{ "text": "good bad", "class": "good" }' + - '{ "index": {"_id": "6"} }' + - '{ "text": "good bad", "class": "bad" }' + - '{ "index": {"_id": "7"} }' + - '{ "text": "bad", "class": "bad" }' + + - do: + search: + index: goodbadugly + rest_total_hits_as_int: true + body: + size: 0 + aggs: + class: + terms: + field: class + aggs: + sig_text: + significant_text: + field: text + # only use background frequency information from "ugly" + # documents. All "ugly" documents have the "good" text so + # "good" isn't significant at all!
+ background_filter: + match: + ugly: true + - match: {hits.total: 7} + - length: {aggregations.class.buckets: 2} + - match: {aggregations.class.buckets.0.key: bad} + - length: { aggregations.class.buckets.0.sig_text.buckets: 1 } + - match: {aggregations.class.buckets.0.sig_text.buckets.0.key: bad} + - match: {aggregations.class.buckets.1.key: good} + - length: { aggregations.class.buckets.1.sig_text.buckets: 0 } + +--- +copy_to: + # Tests the special configuration that `significant_text` needs in order to + # do sensible things with fields built with `copy_to`. + - do: + indices.create: + index: has_copy_to + body: + settings: + number_of_shards: "1" + mappings: + properties: + text: + type: text + fielddata: false + class: + type: keyword + a: + type: keyword + index: false + doc_values: false + copy_to: text + b: + type: keyword + index: false + doc_values: false + copy_to: text + + - do: + bulk: + index: has_copy_to + refresh: true + body: + - '{ "index": {"_id": "1"} }' + - '{ "a": "good", "class": "good" }' + - '{ "index": {"_id": "2"} }' + - '{ "b": "good", "class": "good" }' + - '{ "index": {"_id": "3"} }' + - '{ "a": "bad", "class": "bad" }' + - '{ "index": {"_id": "4"} }' + - '{ "b": "bad", "class": "bad" }' + - '{ "index": {"_id": "5"} }' + - '{ "a": "good", "b": "bad", "class": "good" }' + - '{ "index": {"_id": "6"} }' + - '{ "b": "good bad", "class": "bad" }' + - '{ "index": {"_id": "7"} }' + - '{ "a": "bad", "b": "", "class": "bad" }' + + - do: + search: + index: has_copy_to + rest_total_hits_as_int: true + body: + size: 0 + aggs: + class: + terms: + field: class + aggs: + sig_text: + significant_text: + field: text + source_fields: [a, b] + - match: {hits.total: 7} + - length: {aggregations.class.buckets: 2} + - match: {aggregations.class.buckets.0.key: bad} + - length: { aggregations.class.buckets.0.sig_text.buckets: 1 } + - match: {aggregations.class.buckets.0.sig_text.buckets.0.key: bad} + - match: {aggregations.class.buckets.1.key: good} + - length: { 
aggregations.class.buckets.1.sig_text.buckets: 1 } + - match: {aggregations.class.buckets.1.sig_text.buckets.0.key: good} diff --git a/server/src/internalClusterTest/java/org/elasticsearch/search/profile/aggregation/AggregationProfilerIT.java b/server/src/internalClusterTest/java/org/elasticsearch/search/profile/aggregation/AggregationProfilerIT.java index 720e0e3f509f9..2d52cb6d58aec 100644 --- a/server/src/internalClusterTest/java/org/elasticsearch/search/profile/aggregation/AggregationProfilerIT.java +++ b/server/src/internalClusterTest/java/org/elasticsearch/search/profile/aggregation/AggregationProfilerIT.java @@ -224,12 +224,7 @@ private void assertRemapTermsDebugInfo(ProfileResult termsAggResult) { assertThat(termsAggResult.getDebugInfo(), hasEntry(COLLECTION_STRAT, "remap using many bucket ords")); assertThat(termsAggResult.getDebugInfo(), hasEntry(RESULT_STRAT, "terms")); assertThat(termsAggResult.getDebugInfo(), hasEntry(HAS_FILTER, false)); - // TODO we only index single valued docs but the ordinals ends up with multi valued sometimes - assertThat( - termsAggResult.getDebugInfo().toString(), - (int) termsAggResult.getDebugInfo().get(SEGMENTS_WITH_SINGLE) + (int) termsAggResult.getDebugInfo().get(SEGMENTS_WITH_MULTI), - greaterThan(0) - ); + assertThat(termsAggResult.getDebugInfo().toString(), (int) termsAggResult.getDebugInfo().get(SEGMENTS_WITH_SINGLE), greaterThan(0)); } public void testMultiLevelProfileBreadthFirst() { diff --git a/server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/MapStringTermsAggregator.java b/server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/MapStringTermsAggregator.java index 105a5ed7657f7..532ce02a16d78 100644 --- a/server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/MapStringTermsAggregator.java +++ b/server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/MapStringTermsAggregator.java @@ -30,6 +30,7 @@ import 
org.elasticsearch.search.aggregations.bucket.terms.heuristic.SignificanceHeuristic; import org.elasticsearch.search.aggregations.support.AggregationContext; import org.elasticsearch.search.aggregations.support.ValuesSource; +import org.elasticsearch.search.aggregations.support.ValuesSourceConfig; import java.io.IOException; import java.util.Arrays; @@ -117,6 +118,8 @@ public InternalAggregation buildEmptyAggregation() { public void collectDebugInfo(BiConsumer add) { super.collectDebugInfo(add); add.accept("total_buckets", bucketOrds.size()); + add.accept("collection_strategy", collectorSource.describe()); + collectorSource.collectDebugInfo(add); add.accept("result_strategy", resultStrategy.describe()); } @@ -126,11 +129,30 @@ public void doClose() { } /** - * Abstaction on top of building collectors to fetch values. + * Abstraction on top of building collectors to fetch values so {@code terms}, + * {@code significant_terms}, and {@code significant_text} can share a bunch of + * aggregation code. */ public interface CollectorSource extends Releasable { + /** + * A description of the strategy to include in profile results. + */ + String describe(); + + /** + * Collect debug information to add to the profiling results. This will + * only be called if the aggregation is being profiled. + */ + void collectDebugInfo(BiConsumer add); + + /** + * Does this {@link CollectorSource} need queries to calculate the score? + */ boolean needsScores(); + /** + * Build the collector. + */ LeafBucketCollector getLeafCollector( IncludeExclude.StringFilter includeExclude, LeafReaderContext ctx, @@ -148,15 +170,23 @@ public interface CollectConsumer { * Fetch values from a {@link ValuesSource}. 
*/ public static class ValuesSourceCollectorSource implements CollectorSource { - private final ValuesSource valuesSource; + private final ValuesSourceConfig valuesSourceConfig; - public ValuesSourceCollectorSource(ValuesSource valuesSource) { - this.valuesSource = valuesSource; + public ValuesSourceCollectorSource(ValuesSourceConfig valuesSourceConfig) { + this.valuesSourceConfig = valuesSourceConfig; + } + + @Override + public String describe() { + return "from " + valuesSourceConfig.getDescription(); } + @Override + public void collectDebugInfo(BiConsumer add) {} + @Override public boolean needsScores() { - return valuesSource.needsScores(); + return valuesSourceConfig.getValuesSource().needsScores(); } @Override @@ -167,7 +197,7 @@ public LeafBucketCollector getLeafCollector( LongConsumer addRequestCircuitBreakerBytes, CollectConsumer consumer ) throws IOException { - SortedBinaryDocValues values = valuesSource.bytesValues(ctx); + SortedBinaryDocValues values = valuesSourceConfig.getValuesSource().bytesValues(ctx); return new LeafBucketCollectorBase(sub, values) { final BytesRefBuilder previous = new BytesRefBuilder(); diff --git a/server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/SignificantTermsAggregationBuilder.java b/server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/SignificantTermsAggregationBuilder.java index 4be4ff1727355..d640686a7bac7 100644 --- a/server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/SignificantTermsAggregationBuilder.java +++ b/server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/SignificantTermsAggregationBuilder.java @@ -42,7 +42,6 @@ public class SignificantTermsAggregationBuilder extends ValuesSourceAggregationB new ValuesSourceRegistry.RegistryKey<>(NAME, SignificantTermsAggregatorSupplier.class); static final ParseField BACKGROUND_FILTER = new ParseField("background_filter"); - static final ParseField HEURISTIC = new 
ParseField("significance_heuristic"); static final TermsAggregator.BucketCountThresholds DEFAULT_BUCKET_COUNT_THRESHOLDS = new TermsAggregator.BucketCountThresholds( 3, 0, 10, -1); diff --git a/server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/SignificantTermsAggregatorFactory.java b/server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/SignificantTermsAggregatorFactory.java index a6c0b7f623a72..9858b52e919e2 100644 --- a/server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/SignificantTermsAggregatorFactory.java +++ b/server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/SignificantTermsAggregatorFactory.java @@ -57,7 +57,7 @@ private static SignificantTermsAggregatorSupplier bytesSupplier() { @Override public Aggregator build(String name, AggregatorFactories factories, - ValuesSource valuesSource, + ValuesSourceConfig valuesSourceConfig, DocValueFormat format, TermsAggregator.BucketCountThresholds bucketCountThresholds, IncludeExclude includeExclude, @@ -73,7 +73,7 @@ public Aggregator build(String name, if (executionHint != null) { execution = ExecutionMode.fromString(executionHint, deprecationLogger); } - if (valuesSource instanceof ValuesSource.Bytes.WithOrdinals == false) { + if (valuesSourceConfig.hasOrdinals() == false) { execution = ExecutionMode.MAP; } if (execution == null) { @@ -86,7 +86,7 @@ public Aggregator build(String name, + "include/exclude clauses"); } - return execution.create(name, factories, valuesSource, format, bucketCountThresholds, includeExclude, context, parent, + return execution.create(name, factories, valuesSourceConfig, format, bucketCountThresholds, includeExclude, context, parent, significanceHeuristic, lookup, cardinality, metadata); } }; @@ -101,7 +101,7 @@ private static SignificantTermsAggregatorSupplier numericSupplier() { @Override public Aggregator build(String name, AggregatorFactories factories, - ValuesSource valuesSource, + ValuesSourceConfig 
valuesSourceConfig, DocValueFormat format, TermsAggregator.BucketCountThresholds bucketCountThresholds, IncludeExclude includeExclude, @@ -119,7 +119,7 @@ public Aggregator build(String name, "values for include/exclude clauses used to filter numeric fields"); } - ValuesSource.Numeric numericValuesSource = (ValuesSource.Numeric) valuesSource; + ValuesSource.Numeric numericValuesSource = (ValuesSource.Numeric) valuesSourceConfig.getValuesSource(); if (numericValuesSource.isFloatingPoint()) { throw new UnsupportedOperationException("No support for examining floating point numerics"); } @@ -218,7 +218,7 @@ protected Aggregator doCreateInternal( return aggregatorSupplier.build( name, factories, - config.getValuesSource(), + config, config.format(), bucketCountThresholds, includeExclude, @@ -239,7 +239,7 @@ public enum ExecutionMode { @Override Aggregator create(String name, AggregatorFactories factories, - ValuesSource valuesSource, + ValuesSourceConfig valuesSourceConfig, DocValueFormat format, TermsAggregator.BucketCountThresholds bucketCountThresholds, IncludeExclude includeExclude, @@ -254,7 +254,7 @@ Aggregator create(String name, return new MapStringTermsAggregator( name, factories, - new MapStringTermsAggregator.ValuesSourceCollectorSource(valuesSource), + new MapStringTermsAggregator.ValuesSourceCollectorSource(valuesSourceConfig), a -> a.new SignificantTermsResults(lookup, significanceHeuristic, cardinality), null, format, @@ -276,7 +276,7 @@ Aggregator create(String name, @Override Aggregator create(String name, AggregatorFactories factories, - ValuesSource valuesSource, + ValuesSourceConfig valuesSourceConfig, DocValueFormat format, TermsAggregator.BucketCountThresholds bucketCountThresholds, IncludeExclude includeExclude, @@ -298,7 +298,8 @@ Aggregator create(String name, remapGlobalOrd = false; } - ValuesSource.Bytes.WithOrdinals.FieldData ordinalsValuesSource = (ValuesSource.Bytes.WithOrdinals.FieldData) valuesSource; + 
ValuesSource.Bytes.WithOrdinals.FieldData ordinalsValuesSource = + (ValuesSource.Bytes.WithOrdinals.FieldData) valuesSourceConfig.getValuesSource(); SortedSetDocValues values = TermsAggregatorFactory.globalOrdsValues(context, ordinalsValuesSource); return new GlobalOrdinalsStringTermsAggregator( name, @@ -342,7 +343,7 @@ public static ExecutionMode fromString(String value, final DeprecationLogger dep abstract Aggregator create(String name, AggregatorFactories factories, - ValuesSource valuesSource, + ValuesSourceConfig valuesSourceConfig, DocValueFormat format, TermsAggregator.BucketCountThresholds bucketCountThresholds, IncludeExclude includeExclude, diff --git a/server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/SignificantTermsAggregatorSupplier.java b/server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/SignificantTermsAggregatorSupplier.java index a0e83ef5fef3e..a92279f5e20b5 100644 --- a/server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/SignificantTermsAggregatorSupplier.java +++ b/server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/SignificantTermsAggregatorSupplier.java @@ -13,7 +13,7 @@ import org.elasticsearch.search.aggregations.CardinalityUpperBound; import org.elasticsearch.search.aggregations.bucket.terms.heuristic.SignificanceHeuristic; import org.elasticsearch.search.aggregations.support.AggregationContext; -import org.elasticsearch.search.aggregations.support.ValuesSource; +import org.elasticsearch.search.aggregations.support.ValuesSourceConfig; import java.io.IOException; import java.util.Map; @@ -21,7 +21,7 @@ interface SignificantTermsAggregatorSupplier { Aggregator build(String name, AggregatorFactories factories, - ValuesSource valuesSource, + ValuesSourceConfig valuesSourceConfig, DocValueFormat format, TermsAggregator.BucketCountThresholds bucketCountThresholds, IncludeExclude includeExclude, diff --git 
a/server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/SignificantTextAggregatorFactory.java b/server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/SignificantTextAggregatorFactory.java index 6df39a3e26c27..6bb24b29e1783 100644 --- a/server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/SignificantTextAggregatorFactory.java +++ b/server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/SignificantTextAggregatorFactory.java @@ -36,14 +36,18 @@ import org.elasticsearch.search.aggregations.bucket.BucketUtils; import org.elasticsearch.search.aggregations.bucket.terms.IncludeExclude.StringFilter; import org.elasticsearch.search.aggregations.bucket.terms.MapStringTermsAggregator.CollectConsumer; +import org.elasticsearch.search.aggregations.bucket.terms.MapStringTermsAggregator.CollectorSource; import org.elasticsearch.search.aggregations.bucket.terms.TermsAggregator.BucketCountThresholds; import org.elasticsearch.search.aggregations.bucket.terms.heuristic.SignificanceHeuristic; import org.elasticsearch.search.aggregations.support.AggregationContext; import org.elasticsearch.search.lookup.SourceLookup; +import org.elasticsearch.search.profile.Timer; import java.io.IOException; import java.util.Iterator; +import java.util.List; import java.util.Map; +import java.util.function.BiConsumer; import java.util.function.LongConsumer; public class SignificantTextAggregatorFactory extends AggregatorFactory { @@ -66,7 +70,7 @@ public SignificantTextAggregatorFactory(String name, AggregatorFactory parent, AggregatorFactories.Builder subFactoriesBuilder, String fieldName, - String [] sourceFieldNames, + String[] sourceFieldNames, boolean filterDuplicateText, Map metadata) throws IOException { super(name, context, parent, subFactoriesBuilder, metadata); @@ -76,7 +80,7 @@ public SignificantTextAggregatorFactory(String name, if (supportsAgg(fieldType) == false) { throw new IllegalArgumentException("Field [" + 
fieldType.name() + "] has no analyzer, but SignificantText " + "requires an analyzed field"); - } + } String indexedFieldName = fieldType.name(); this.sourceFieldNames = sourceFieldNames == null ? new String[] {indexedFieldName} : sourceFieldNames; } else { @@ -89,7 +93,7 @@ public SignificantTextAggregatorFactory(String name, this.bucketCountThresholds = bucketCountThresholds; this.significanceHeuristic = significanceHeuristic; } - + protected Aggregator createUnmapped(Aggregator parent, Map metadata) throws IOException { final InternalAggregation aggregation = new UnmappedSignificantTerms(name, bucketCountThresholds.getRequiredSize(), bucketCountThresholds.getMinDocCount(), metadata); @@ -99,7 +103,7 @@ public InternalAggregation buildEmptyAggregation() { return aggregation; } }; - } + } private static boolean supportsAgg(MappedFieldType ft) { return ft.getTextSearchInfo() != TextSearchInfo.NONE @@ -109,11 +113,11 @@ private static boolean supportsAgg(MappedFieldType ft) { @Override protected Aggregator createInternal(Aggregator parent, CardinalityUpperBound cardinality, Map metadata) throws IOException { - + if (fieldType == null) { return createUnmapped(parent, metadata); } - + BucketCountThresholds bucketCountThresholds = new BucketCountThresholds(this.bucketCountThresholds); if (bucketCountThresholds.getShardSize() == SignificantTextAggregationBuilder.DEFAULT_BUCKET_COUNT_THRESHOLDS.getShardSize()) { // The user has not made a shardSize selection. @@ -133,21 +137,12 @@ protected Aggregator createInternal(Aggregator parent, CardinalityUpperBound car IncludeExclude.StringFilter incExcFilter = includeExclude == null ? 
null: includeExclude.convertToStringFilter(DocValueFormat.RAW); - MapStringTermsAggregator.CollectorSource collectorSource = new SignificantTextCollectorSource( - context.lookup().source(), - context.bigArrays(), - fieldType, - context.getIndexAnalyzer(f -> { - throw new IllegalArgumentException("No analyzer configured for field " + f); - }), - sourceFieldNames, - filterDuplicateText - ); + SignificanceLookup lookup = new SignificanceLookup(context, fieldType, DocValueFormat.RAW, backgroundFilter); return new MapStringTermsAggregator( name, factories, - collectorSource, + createCollectorSource(), a -> a.new SignificantTermsResults(lookup, significanceHeuristic, cardinality), null, DocValueFormat.RAW, @@ -162,12 +157,58 @@ protected Aggregator createInternal(Aggregator parent, CardinalityUpperBound car ); } + /** + * Create the {@link CollectorSource}, gathering some timing information + * if we're profiling. + *

+ * When profiling aggregations {@link LeafBucketCollector#collect(int, long)} method + * out of the box but our implementation of that method does three things that is + * useful to get timing for: + *

    + *
  • Fetch field values from {@code _source} + *
  • Analyze the field + *
  • Do all the normal {@code terms} agg stuff with its terms + *
+ *

+ * The most convenient way to measure all of these is to time the fetch and all + * the normal {@code terms} agg stuff. You can then subtract those timings from + * the overall collect time to get the analyze time. You can also get the total + * number of terms that we analyzed by looking at the invocation count on the + * {@code terms} agg stuff. + *

+ * While we're at it we count the number of values we fetch from source. + */ + private CollectorSource createCollectorSource() { + Analyzer analyzer = context.getIndexAnalyzer(f -> { + throw new IllegalArgumentException("No analyzer configured for field " + f); + }); + if (context.profiling()) { + return new ProfilingSignificantTextCollectorSource( + context.lookup().source(), + context.bigArrays(), + fieldType, + analyzer, + sourceFieldNames, + filterDuplicateText + ); + } + return new SignificantTextCollectorSource( + context.lookup().source(), + context.bigArrays(), + fieldType, + analyzer, + sourceFieldNames, + filterDuplicateText + ); + } + private static class SignificantTextCollectorSource implements MapStringTermsAggregator.CollectorSource { private final SourceLookup sourceLookup; private final BigArrays bigArrays; private final MappedFieldType fieldType; private final Analyzer analyzer; private final String[] sourceFieldNames; + private final BytesRefBuilder scratch = new BytesRefBuilder(); private ObjectArray dupSequenceSpotters; SignificantTextCollectorSource( @@ -186,6 +227,15 @@ private static class SignificantTextCollectorSource implements MapStringTermsAgg dupSequenceSpotters = filterDuplicateText ? 
bigArrays.newObjectArray(1) : null; } + @Override + public String describe() { + return "analyze " + fieldType.name() + " from _source"; + } + + @Override + public void collectDebugInfo(BiConsumer add) { + } + @Override public boolean needsScores() { return false; @@ -200,8 +250,6 @@ public LeafBucketCollector getLeafCollector( CollectConsumer consumer ) throws IOException { return new LeafBucketCollectorBase(sub, null) { - private final BytesRefBuilder scratch = new BytesRefBuilder(); - @Override public void collect(int doc, long owningBucketOrd) throws IOException { if (dupSequenceSpotters == null) { @@ -224,7 +272,7 @@ private void collectFromSource(int doc, long owningBucketOrd, DuplicateByteSeque try { for (String sourceField : sourceFieldNames) { - Iterator itr = sourceLookup.extractRawValues(sourceField).stream() + Iterator itr = extractRawValues(sourceField).stream() .map(obj -> { if (obj == null) { return null; @@ -236,63 +284,87 @@ private void collectFromSource(int doc, long owningBucketOrd, DuplicateByteSeque }) .iterator(); while (itr.hasNext()) { - TokenStream ts = analyzer.tokenStream(fieldType.name(), itr.next()); - processTokenStream(doc, owningBucketOrd, ts, inDocTerms, spotter); + String text = itr.next(); + TokenStream ts = analyzer.tokenStream(fieldType.name(), text); + processTokenStream( + includeExclude, + doc, + owningBucketOrd, + text, + ts, + inDocTerms, + spotter, + addRequestCircuitBreakerBytes, + sub, + consumer + ); } } } finally { Releasables.close(inDocTerms); } } + }; + } - private void processTokenStream( - int doc, - long owningBucketOrd, - TokenStream ts, - BytesRefHash inDocTerms, - DuplicateByteSequenceSpotter spotter - ) throws IOException { - long lastTrieSize = 0; + protected void processTokenStream( + StringFilter includeExclude, + int doc, + long owningBucketOrd, + String text, + TokenStream ts, + BytesRefHash inDocTerms, + DuplicateByteSequenceSpotter spotter, + LongConsumer addRequestCircuitBreakerBytes, + 
LeafBucketCollector sub, + CollectConsumer consumer + ) throws IOException { + long lastTrieSize = 0; + if (spotter != null) { + lastTrieSize = spotter.getEstimatedSizeInBytes(); + ts = new DeDuplicatingTokenFilter(ts, spotter); + } + CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class); + ts.reset(); + try { + while (ts.incrementToken()) { if (spotter != null) { - lastTrieSize = spotter.getEstimatedSizeInBytes(); - ts = new DeDuplicatingTokenFilter(ts, spotter); + long newTrieSize = spotter.getEstimatedSizeInBytes(); + long growth = newTrieSize - lastTrieSize; + // Only update the circuitbreaker after + if (growth > MEMORY_GROWTH_REPORTING_INTERVAL_BYTES) { + addRequestCircuitBreakerBytes.accept(growth); + lastTrieSize = newTrieSize; + } } - CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class); - ts.reset(); - try { - while (ts.incrementToken()) { - if (spotter != null) { - long newTrieSize = spotter.getEstimatedSizeInBytes(); - long growth = newTrieSize - lastTrieSize; - // Only update the circuitbreaker after - if (growth > MEMORY_GROWTH_REPORTING_INTERVAL_BYTES) { - addRequestCircuitBreakerBytes.accept(growth); - lastTrieSize = newTrieSize; - } - } - scratch.clear(); - scratch.copyChars(termAtt); - BytesRef bytes = scratch.get(); - if (includeExclude != null && false == includeExclude.accept(bytes)) { - continue; - } - if (inDocTerms.add(bytes) < 0) { - continue; - } - consumer.accept(sub, doc, owningBucketOrd, bytes); - } - } finally { - ts.close(); + scratch.clear(); + scratch.copyChars(termAtt); + BytesRef bytes = scratch.get(); + if (includeExclude != null && false == includeExclude.accept(bytes)) { + continue; } - if (spotter != null) { - long growth = spotter.getEstimatedSizeInBytes() - lastTrieSize; - if (growth > 0) { - addRequestCircuitBreakerBytes.accept(growth); - } + if (inDocTerms.add(bytes) < 0) { + continue; } + consumer.accept(sub, doc, owningBucketOrd, bytes); } - }; + } finally { + ts.close(); + } + if (spotter 
!= null) { + long growth = spotter.getEstimatedSizeInBytes() - lastTrieSize; + if (growth > 0) { + addRequestCircuitBreakerBytes.accept(growth); + } + } + } + + /** + * Extract values from {@code _source}. + */ + protected List extractRawValues(String field) { + return sourceLookup.extractRawValues(field); } @Override @@ -300,4 +372,79 @@ public void close() { Releasables.close(dupSequenceSpotters); } } + + private static class ProfilingSignificantTextCollectorSource extends SignificantTextCollectorSource { + private final Timer extract = new Timer(); + private final Timer collectAnalyzed = new Timer(); + private long valuesFetched; + private long charsFetched; + + private ProfilingSignificantTextCollectorSource( + SourceLookup sourceLookup, + BigArrays bigArrays, + MappedFieldType fieldType, + Analyzer analyzer, + String[] sourceFieldNames, + boolean filterDuplicateText + ) { + super(sourceLookup, bigArrays, fieldType, analyzer, sourceFieldNames, filterDuplicateText); + } + + @Override + protected void processTokenStream( + StringFilter includeExclude, + int doc, + long owningBucketOrd, + String text, + TokenStream ts, + BytesRefHash inDocTerms, + DuplicateByteSequenceSpotter spotter, + LongConsumer addRequestCircuitBreakerBytes, + LeafBucketCollector sub, + CollectConsumer consumer + ) throws IOException { + valuesFetched++; + charsFetched += text.length(); + super.processTokenStream( + includeExclude, + doc, + owningBucketOrd, + text, + ts, + inDocTerms, + spotter, + addRequestCircuitBreakerBytes, + sub, + (subCollector, d, o, bytes) -> { + collectAnalyzed.start(); + try { + consumer.accept(subCollector, d, o, bytes); + } finally { + collectAnalyzed.stop(); + } + } + ); + } + + @Override + protected List extractRawValues(String field) { + extract.start(); + try { + return super.extractRawValues(field); + } finally { + extract.stop(); + } + } + + @Override + public void collectDebugInfo(BiConsumer add) { + super.collectDebugInfo(add); + add.accept("extract_ns", 
extract.getApproximateTiming()); + add.accept("extract_count", extract.getCount()); + add.accept("collect_analyzed_ns", collectAnalyzed.getApproximateTiming()); + add.accept("collect_analyzed_count", collectAnalyzed.getCount()); + add.accept("values_fetched", valuesFetched); + add.accept("chars_fetched", charsFetched); + } + } } diff --git a/server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/TermsAggregatorFactory.java b/server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/TermsAggregatorFactory.java index e74a26255fda8..d8aaa34132e7a 100644 --- a/server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/TermsAggregatorFactory.java +++ b/server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/TermsAggregatorFactory.java @@ -325,7 +325,7 @@ Aggregator create(String name, return new MapStringTermsAggregator( name, factories, - new MapStringTermsAggregator.ValuesSourceCollectorSource(valuesSourceConfig.getValuesSource()), + new MapStringTermsAggregator.ValuesSourceCollectorSource(valuesSourceConfig), a -> a.new StandardTermsResults(valuesSourceConfig.getValuesSource()), order, valuesSourceConfig.format(),