From 8962f54f36ee9d0de3f43538c127bfc236975432 Mon Sep 17 00:00:00 2001 From: David Roberts Date: Wed, 5 Jun 2019 13:34:01 +0100 Subject: [PATCH 1/3] [ML] Add earliest and latest timestamps to field stats This change adds the earliest and latest timestamps into the field stats for fields of type "date" in the output of the ML find_file_structure endpoint. This will enable the cards for date fields in the file data visualizer in the UI to be made to look more similar to the cards for date fields in the index data visualizer in the UI. --- .../ml/filestructurefinder/FieldStats.java | 30 +++++- .../filestructurefinder/FieldStatsTests.java | 10 +- .../ml/apis/find-file-structure.asciidoc | 10 ++ .../ml/filestructurefinder/FieldStats.java | 57 +++++++++- .../filestructurefinder/FieldStatsTests.java | 10 +- .../DelimitedFileStructureFinder.java | 3 +- .../FieldStatsCalculator.java | 87 +++++++++++++-- .../FileStructureUtils.java | 9 +- .../GrokPatternCreator.java | 97 +++++++++-------- .../NdJsonFileStructureFinder.java | 3 +- .../TextLogFileStructureFinder.java | 11 +- .../XmlFileStructureFinder.java | 3 +- .../FieldStatsCalculatorTests.java | 102 +++++++++++++++--- .../FileStructureUtilsTests.java | 6 +- .../GrokPatternCreatorTests.java | 20 ++-- 15 files changed, 360 insertions(+), 98 deletions(-) diff --git a/client/rest-high-level/src/main/java/org/elasticsearch/client/ml/filestructurefinder/FieldStats.java b/client/rest-high-level/src/main/java/org/elasticsearch/client/ml/filestructurefinder/FieldStats.java index 4391d03f6d940..adb8e68393e82 100644 --- a/client/rest-high-level/src/main/java/org/elasticsearch/client/ml/filestructurefinder/FieldStats.java +++ b/client/rest-high-level/src/main/java/org/elasticsearch/client/ml/filestructurefinder/FieldStats.java @@ -38,12 +38,14 @@ public class FieldStats implements ToXContentObject { public static final ParseField MAX_VALUE = new ParseField("max_value"); public static final ParseField MEAN_VALUE = new ParseField("mean_value"); public static final ParseField MEDIAN_VALUE = new ParseField("median_value"); + public static final ParseField EARLIEST = new ParseField("earliest"); + public static final ParseField LATEST = new ParseField("latest"); public static final ParseField TOP_HITS = new ParseField("top_hits"); @SuppressWarnings("unchecked") public static final ConstructingObjectParser PARSER = new ConstructingObjectParser<>("field_stats", true, a -> new FieldStats((long) a[0], (int) a[1], (Double) a[2], (Double) a[3], (Double) a[4], (Double) a[5], - (List>) a[6])); + (String) a[6], (String) a[7], (List>) a[8])); static { PARSER.declareLong(ConstructingObjectParser.constructorArg(), COUNT); @@ -52,6 +54,8 @@ public class FieldStats implements ToXContentObject { PARSER.declareDouble(ConstructingObjectParser.optionalConstructorArg(), MAX_VALUE); PARSER.declareDouble(ConstructingObjectParser.optionalConstructorArg(), MEAN_VALUE); PARSER.declareDouble(ConstructingObjectParser.optionalConstructorArg(), MEDIAN_VALUE); + PARSER.declareString(ConstructingObjectParser.optionalConstructorArg(), EARLIEST); + PARSER.declareString(ConstructingObjectParser.optionalConstructorArg(), LATEST); PARSER.declareObjectArray(ConstructingObjectParser.optionalConstructorArg(), (p, c) -> p.mapOrdered(), TOP_HITS); } @@ -61,16 +65,20 @@ public class FieldStats implements ToXContentObject { private final Double maxValue; private final Double meanValue; private final Double medianValue; + private final String earliestTimestamp; + private final String latestTimestamp; private final List> topHits; FieldStats(long count, int cardinality, Double minValue, Double maxValue, Double meanValue, Double medianValue, - List> topHits) { + String earliestTimestamp, String latestTimestamp, List> topHits) { this.count = count; this.cardinality = cardinality; this.minValue = minValue; this.maxValue = maxValue; this.meanValue = meanValue; this.medianValue = medianValue; + this.earliestTimestamp = earliestTimestamp; + this.latestTimestamp = latestTimestamp; this.topHits = (topHits == null) ? Collections.emptyList() : Collections.unmodifiableList(topHits); } @@ -98,6 +106,14 @@ public Double getMedianValue() { return medianValue; } + public String getEarliestTimestamp() { + return earliestTimestamp; + } + + public String getLatestTimestamp() { + return latestTimestamp; + } + public List> getTopHits() { return topHits; } @@ -120,6 +136,12 @@ public XContentBuilder toXContent(XContentBuilder builder, ToXContent.Params par if (medianValue != null) { builder.field(MEDIAN_VALUE.getPreferredName(), toIntegerIfInteger(medianValue)); } + if (earliestTimestamp != null) { + builder.field(EARLIEST.getPreferredName(), earliestTimestamp); + } + if (latestTimestamp != null) { + builder.field(LATEST.getPreferredName(), latestTimestamp); + } if (topHits.isEmpty() == false) { builder.field(TOP_HITS.getPreferredName(), topHits); } @@ -140,7 +162,7 @@ static Number toIntegerIfInteger(double d) { @Override public int hashCode() { - return Objects.hash(count, cardinality, minValue, maxValue, meanValue, medianValue, topHits); + return Objects.hash(count, cardinality, minValue, maxValue, meanValue, medianValue, earliestTimestamp, latestTimestamp, topHits); } @Override @@ -161,6 +183,8 @@ public boolean equals(Object other) { Objects.equals(this.maxValue, that.maxValue) && Objects.equals(this.meanValue, that.meanValue) && Objects.equals(this.medianValue, that.medianValue) && + Objects.equals(this.earliestTimestamp, that.earliestTimestamp) && + Objects.equals(this.latestTimestamp, that.latestTimestamp) && Objects.equals(this.topHits, that.topHits); } } diff --git a/client/rest-high-level/src/test/java/org/elasticsearch/client/ml/filestructurefinder/FieldStatsTests.java b/client/rest-high-level/src/test/java/org/elasticsearch/client/ml/filestructurefinder/FieldStatsTests.java index daf6c4af90ddc..0d6e5e70290d9 100644 --- a/client/rest-high-level/src/test/java/org/elasticsearch/client/ml/filestructurefinder/FieldStatsTests.java +++ b/client/rest-high-level/src/test/java/org/elasticsearch/client/ml/filestructurefinder/FieldStatsTests.java @@ -43,6 +43,8 @@ static FieldStats createTestFieldStats() { Double maxValue = null; Double meanValue = null; Double medianValue = null; + String earliestTimestamp = null; + String latestTimestamp = null; boolean isMetric = randomBoolean(); if (isMetric) { if (randomBoolean()) { @@ -54,6 +56,12 @@ static FieldStats createTestFieldStats() { } meanValue = randomDouble(); medianValue = randomDouble(); + } else { + boolean isDate = randomBoolean(); + if (isDate) { + earliestTimestamp = randomAlphaOfLength(20); + latestTimestamp = randomAlphaOfLength(20); + } } List> topHits = new ArrayList<>(); @@ -68,7 +76,7 @@ static FieldStats createTestFieldStats() { topHits.add(topHit); } - return new FieldStats(count, cardinality, minValue, maxValue, meanValue, medianValue, topHits); + return new FieldStats(count, cardinality, minValue, maxValue, meanValue, medianValue, earliestTimestamp, latestTimestamp, topHits); } @Override diff --git a/docs/reference/ml/apis/find-file-structure.asciidoc b/docs/reference/ml/apis/find-file-structure.asciidoc index b0835ff9466bb..f3029635bf44e 100644 --- a/docs/reference/ml/apis/find-file-structure.asciidoc +++ b/docs/reference/ml/apis/find-file-structure.asciidoc @@ -445,6 +445,8 @@ If the request does not encounter errors, you receive the following result: "release_date" : { "count" : 24, "cardinality" : 20, + "earliest" : "1932-06-01", + "latest" : "2011-06-02", "top_hits" : [ { "value" : "1985-06-01", @@ -1152,6 +1154,8 @@ If the request does not encounter errors, you receive the following result: "tpep_dropoff_datetime" : { "count" : 19998, "cardinality" : 9066, + "earliest" : "2018-05-31 06:18:15", + "latest" : "2018-06-02 02:25:44", "top_hits" : [ { "value" : "2018-06-01 01:12:12", @@ -1198,6 +1202,8 @@ If the request does not encounter errors, you receive the following result: "tpep_pickup_datetime" : { "count" : 19998, "cardinality" : 8760, + "earliest" : "2018-05-31 06:08:31", + "latest" : "2018-06-02 01:21:21", "top_hits" : [ { "value" : "2018-06-01 00:01:23", @@ -1457,6 +1463,8 @@ this: "timestamp" : { "count" : 53, "cardinality" : 28, + "earliest" : "2018-09-27T14:39:28,518", + "latest" : "2018-09-27T14:39:37,012", "top_hits" : [ { "value" : "2018-09-27T14:39:29,859", @@ -1719,6 +1727,8 @@ this: "timestamp" : { "count" : 53, "cardinality" : 28, + "earliest" : "2018-09-27T14:39:28,518", + "latest" : "2018-09-27T14:39:37,012", "top_hits" : [ { "value" : "2018-09-27T14:39:29,859", diff --git a/x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ml/filestructurefinder/FieldStats.java b/x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ml/filestructurefinder/FieldStats.java index b371ca739bb6b..95abf16aa1444 100644 --- a/x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ml/filestructurefinder/FieldStats.java +++ b/x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ml/filestructurefinder/FieldStats.java @@ -5,7 +5,9 @@ */ package org.elasticsearch.xpack.core.ml.filestructurefinder; +import org.elasticsearch.Version; import org.elasticsearch.common.ParseField; +import org.elasticsearch.common.Strings; import org.elasticsearch.common.io.stream.StreamInput; import org.elasticsearch.common.io.stream.StreamOutput; import org.elasticsearch.common.io.stream.Writeable; @@ -27,12 +29,14 @@ public class FieldStats implements ToXContentObject, Writeable { static final ParseField MAX_VALUE = new ParseField("max_value"); static final ParseField MEAN_VALUE = new ParseField("mean_value"); static final ParseField MEDIAN_VALUE = new ParseField("median_value"); + static final ParseField EARLIEST = new ParseField("earliest"); + static final ParseField LATEST = new ParseField("latest"); static final ParseField TOP_HITS = new ParseField("top_hits"); @SuppressWarnings("unchecked") public static final ConstructingObjectParser PARSER = new ConstructingObjectParser<>("field_stats", false, a -> new FieldStats((long) a[0], (int) a[1], (Double) a[2], (Double) a[3], (Double) a[4], (Double) a[5], - (List>) a[6])); + (String) a[6], (String) a[7], (List>) a[8])); static { PARSER.declareLong(ConstructingObjectParser.constructorArg(), COUNT); @@ -41,6 +45,8 @@ public class FieldStats implements ToXContentObject, Writeable { PARSER.declareDouble(ConstructingObjectParser.optionalConstructorArg(), MAX_VALUE); PARSER.declareDouble(ConstructingObjectParser.optionalConstructorArg(), MEAN_VALUE); PARSER.declareDouble(ConstructingObjectParser.optionalConstructorArg(), MEDIAN_VALUE); + PARSER.declareString(ConstructingObjectParser.optionalConstructorArg(), EARLIEST); + PARSER.declareString(ConstructingObjectParser.optionalConstructorArg(), LATEST); PARSER.declareObjectArray(ConstructingObjectParser.optionalConstructorArg(), (p, c) -> p.mapOrdered(), TOP_HITS); } @@ -50,20 +56,33 @@ public class FieldStats implements ToXContentObject, Writeable { private final Double maxValue; private final Double meanValue; private final Double medianValue; + private final String earliestTimestamp; + private final String latestTimestamp; private final List> topHits; public FieldStats(long count, int cardinality, List> topHits) { - this(count, cardinality, null, null, null, null, topHits); + this(count, cardinality, null, null, null, null, null, null, topHits); + } + + public FieldStats(long count, int cardinality, String earliestTimestamp, String latestTimestamp, List> topHits) { + this(count, cardinality, null, null, null, null, earliestTimestamp, latestTimestamp, topHits); } public FieldStats(long count, int cardinality, Double minValue, Double maxValue, Double meanValue, Double medianValue, List> topHits) { + this(count, cardinality, minValue, maxValue, meanValue, medianValue, null, null, topHits); + } + + FieldStats(long count, int cardinality, Double minValue, Double maxValue, Double meanValue, Double medianValue, + String earliestTimestamp, String latestTimestamp, List> topHits) { this.count = count; this.cardinality = cardinality; this.minValue = minValue; this.maxValue = maxValue; this.meanValue = meanValue; this.medianValue = medianValue; + this.earliestTimestamp = earliestTimestamp; + this.latestTimestamp = latestTimestamp; this.topHits = (topHits == null) ? Collections.emptyList() : Collections.unmodifiableList(topHits); } @@ -74,6 +93,13 @@ public FieldStats(StreamInput in) throws IOException { maxValue = in.readOptionalDouble(); meanValue = in.readOptionalDouble(); medianValue = in.readOptionalDouble(); + if (in.getVersion().onOrAfter(Version.V_7_3_0)) { + earliestTimestamp = in.readOptionalString(); + latestTimestamp = in.readOptionalString(); + } else { + earliestTimestamp = null; + latestTimestamp = null; + } topHits = in.readList(StreamInput::readMap); } @@ -85,6 +111,10 @@ public void writeTo(StreamOutput out) throws IOException { out.writeOptionalDouble(maxValue); out.writeOptionalDouble(meanValue); out.writeOptionalDouble(medianValue); + if (out.getVersion().onOrAfter(Version.V_7_3_0)) { + out.writeOptionalString(earliestTimestamp); + out.writeOptionalString(latestTimestamp); + } out.writeCollection(topHits, StreamOutput::writeMap); } @@ -112,6 +142,14 @@ public Double getMedianValue() { return medianValue; } + public String getEarliestTimestamp() { + return earliestTimestamp; + } + + public String getLatestTimestamp() { + return latestTimestamp; + } + public List> getTopHits() { return topHits; } @@ -134,6 +172,12 @@ public XContentBuilder toXContent(XContentBuilder builder, Params params) throws if (medianValue != null) { builder.field(MEDIAN_VALUE.getPreferredName(), toIntegerIfInteger(medianValue)); } + if (earliestTimestamp != null) { + builder.field(EARLIEST.getPreferredName(), earliestTimestamp); + } + if (latestTimestamp != null) { + builder.field(LATEST.getPreferredName(), latestTimestamp); + } if (topHits.isEmpty() == false) { builder.field(TOP_HITS.getPreferredName(), topHits); } @@ -154,7 +198,7 @@ public static Number toIntegerIfInteger(double d) { @Override public int hashCode() { - return Objects.hash(count, cardinality, minValue, maxValue, meanValue, medianValue, topHits); + return Objects.hash(count, cardinality, minValue, maxValue, meanValue, medianValue, earliestTimestamp, latestTimestamp, topHits); } @Override @@ -175,6 +219,13 @@ public boolean equals(Object other) { Objects.equals(this.maxValue, that.maxValue) && Objects.equals(this.meanValue, that.meanValue) && Objects.equals(this.medianValue, that.medianValue) && + Objects.equals(this.earliestTimestamp, that.earliestTimestamp) && + Objects.equals(this.latestTimestamp, that.latestTimestamp) && Objects.equals(this.topHits, that.topHits); } + + @Override + public String toString() { + return Strings.toString(this); + } } diff --git a/x-pack/plugin/core/src/test/java/org/elasticsearch/xpack/core/ml/filestructurefinder/FieldStatsTests.java b/x-pack/plugin/core/src/test/java/org/elasticsearch/xpack/core/ml/filestructurefinder/FieldStatsTests.java index ec46d25edd409..889eae19387fc 100644 --- a/x-pack/plugin/core/src/test/java/org/elasticsearch/xpack/core/ml/filestructurefinder/FieldStatsTests.java +++ b/x-pack/plugin/core/src/test/java/org/elasticsearch/xpack/core/ml/filestructurefinder/FieldStatsTests.java @@ -30,6 +30,8 @@ static FieldStats createTestFieldStats() { Double maxValue = null; Double meanValue = null; Double medianValue = null; + String earliestTimestamp = null; + String latestTimestamp = null; boolean isMetric = randomBoolean(); if (isMetric) { if (randomBoolean()) { @@ -41,6 +43,12 @@ static FieldStats createTestFieldStats() { } meanValue = randomDouble(); medianValue = randomDouble(); + } else { + boolean isDate = randomBoolean(); + if (isDate) { + earliestTimestamp = randomAlphaOfLength(20); + latestTimestamp = randomAlphaOfLength(20); + } } List> topHits = new ArrayList<>(); @@ -55,7 +63,7 @@ static FieldStats createTestFieldStats() { topHits.add(topHit); } - return new FieldStats(count, cardinality, minValue, maxValue, meanValue, medianValue, topHits); + return new FieldStats(count, cardinality, minValue, maxValue, meanValue, medianValue, earliestTimestamp, latestTimestamp, topHits); } @Override diff --git a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/DelimitedFileStructureFinder.java b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/DelimitedFileStructureFinder.java index aa88905962638..91167fb12cfc9 100644 --- a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/DelimitedFileStructureFinder.java +++ b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/DelimitedFileStructureFinder.java @@ -159,8 +159,7 @@ static DelimitedFileStructureFinder makeDelimitedFileStructureFinder(List mappings = mappingsAndFieldStats.v1(); if (timeField != null) { - mappings.put(FileStructureUtils.DEFAULT_TIMESTAMP_FIELD, - Collections.singletonMap(FileStructureUtils.MAPPING_TYPE_SETTING, "date")); + mappings.put(FileStructureUtils.DEFAULT_TIMESTAMP_FIELD, FileStructureUtils.DATE_MAPPING_WITHOUT_FORMAT); } if (mappingsAndFieldStats.v2() != null) { diff --git a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/FieldStatsCalculator.java b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/FieldStatsCalculator.java index 40dff9116d7ab..e970b49f538d3 100644 --- a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/FieldStatsCalculator.java +++ b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/FieldStatsCalculator.java @@ -5,10 +5,15 @@ */ package org.elasticsearch.xpack.ml.filestructurefinder; +import org.elasticsearch.common.time.DateFormatter; +import org.elasticsearch.common.time.DateFormatters; +import org.elasticsearch.index.mapper.DateFieldMapper; import org.elasticsearch.xpack.core.ml.filestructurefinder.FieldStats; +import java.time.Instant; import java.util.ArrayList; import java.util.Collection; +import java.util.Collections; import java.util.Comparator; import java.util.LinkedHashMap; import java.util.List; @@ -26,8 +31,51 @@ public class FieldStatsCalculator { private long count; - private SortedMap countsByStringValue = new TreeMap<>(); - private SortedMap countsByNumericValue = new TreeMap<>(); + private SortedMap countsByStringValue; + private SortedMap countsByNumericValue; + private DateFormatter dateFormatter; + /** + * Parsed earliest and latest times. Some date formats may cause these to be + * wrong due to lack of information. For example, if the date format does not + * contain a year then these will be in 1970, and if there's no timezone in + * the format then these will be on the assumption the time was in UTC. However, + * since all the timestamps will be inaccurate in the same way the determination + * of the earliest and latest will still be correct. The trick then is to never + * print them out... + */ + private Instant earliestTimestamp; + private Instant latestTimestamp; + /** + * Earliest and latest times in the exact form they were present in the input, + * making the output immune to issues like not knowing the correct timezone + * or year when parsing. + */ + private String earliestTimeString; + private String latestTimeString; + + @SuppressWarnings("fallthrough") + public FieldStatsCalculator(Map mapping) { + + switch (mapping.get(FileStructureUtils.MAPPING_TYPE_SETTING)) { + case "byte": + case "short": + case "integer": + case "long": + case "half_float": + case "float": + case "double": + countsByNumericValue = new TreeMap<>(); + break; + case "date": + case "date_nanos": + String format = mapping.get(FileStructureUtils.MAPPING_FORMAT_SETTING); + dateFormatter = (format == null) ? DateFieldMapper.DEFAULT_DATE_TIME_FORMATTER : DateFormatter.forPattern(format); + // $FALL-THROUGH$ + default: + countsByStringValue = new TreeMap<>(); + break; + } + } /** * Add a collection of values to the calculator. @@ -41,14 +89,27 @@ public void accept(Collection fieldValues) { for (String fieldValue : fieldValues) { - countsByStringValue.compute(fieldValue, (k, v) -> (v == null) ? 1 : (1 + v)); - if (countsByNumericValue != null) { - try { countsByNumericValue.compute(Double.valueOf(fieldValue), (k, v) -> (v == null) ? 1 : (1 + v)); } catch (NumberFormatException e) { - countsByNumericValue = null; + // This should not happen in the usual context this class is used in within the file structure finder, + // as "double" should be big enough to hold any value that the file structure finder considers numeric + throw new IllegalArgumentException("Field with numeric mapping [" + fieldValue + "] could not be parsed as type double", + e); + } + } else { + countsByStringValue.compute(fieldValue, (k, v) -> (v == null) ? 1 : (1 + v)); + if (dateFormatter != null) { + Instant parsedTimestamp = DateFormatters.from(dateFormatter.parse(fieldValue)).toInstant(); + if (earliestTimestamp == null || earliestTimestamp.isAfter(parsedTimestamp)) { + earliestTimestamp = parsedTimestamp; + earliestTimeString = fieldValue; + } + if (latestTimestamp == null || latestTimestamp.isBefore(parsedTimestamp)) { + latestTimestamp = parsedTimestamp; + latestTimeString = fieldValue; + } } } } @@ -61,11 +122,17 @@ public void accept(Collection fieldValues) { */ public FieldStats calculate(int numTopHits) { - if (countsByNumericValue != null && countsByNumericValue.isEmpty() == false) { - return new FieldStats(count, countsByNumericValue.size(), countsByNumericValue.firstKey(), countsByNumericValue.lastKey(), - calculateMean(), calculateMedian(), findNumericTopHits(numTopHits)); + if (countsByNumericValue != null) { + if (countsByNumericValue.isEmpty()) { + assert count == 0; + return new FieldStats(count, 0, Collections.emptyList()); + } else { + assert count > 0; + return new FieldStats(count, countsByNumericValue.size(), countsByNumericValue.firstKey(), countsByNumericValue.lastKey(), + calculateMean(), calculateMedian(), findNumericTopHits(numTopHits)); + } } else { - return new FieldStats(count, countsByStringValue.size(), findStringTopHits(numTopHits)); + return new FieldStats(count, countsByStringValue.size(), earliestTimeString, latestTimeString, findStringTopHits(numTopHits)); } } diff --git a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/FileStructureUtils.java b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/FileStructureUtils.java index 90cc74c8d259c..e4945d3709860 100644 --- a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/FileStructureUtils.java +++ b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/FileStructureUtils.java @@ -29,6 +29,8 @@ public final class FileStructureUtils { public static final String MAPPING_TYPE_SETTING = "type"; public static final String MAPPING_FORMAT_SETTING = "format"; public static final String MAPPING_PROPERTIES_SETTING = "properties"; + public static final Map DATE_MAPPING_WITHOUT_FORMAT = + Collections.singletonMap(MAPPING_TYPE_SETTING, "date"); private static final int NUM_TOP_HITS = 10; // NUMBER Grok pattern doesn't support scientific notation, so we extend it @@ -231,7 +233,7 @@ static Tuple, FieldStats> guessMappingAndCalculateFieldStats Collection fieldValuesAsStrings = fieldValues.stream().map(Object::toString).collect(Collectors.toList()); Map mapping = guessScalarMapping(explanation, fieldName, fieldValuesAsStrings, timeoutChecker); timeoutChecker.check("mapping determination"); - return new Tuple<>(mapping, calculateFieldStats(fieldValuesAsStrings, timeoutChecker)); + return new Tuple<>(mapping, calculateFieldStats(mapping, fieldValuesAsStrings, timeoutChecker)); } private static Stream flatten(Object value) { @@ -323,13 +325,14 @@ else if (fieldValues.stream().allMatch(IP_GROK::match)) { /** * Calculate stats for a set of field values. + * @param mapping The mapping for the field. * @param fieldValues Values of the field for which field stats are to be calculated. * @param timeoutChecker Will abort the operation if its timeout is exceeded. * @return The stats calculated from the field values. */ - static FieldStats calculateFieldStats(Collection fieldValues, TimeoutChecker timeoutChecker) { + static FieldStats calculateFieldStats(Map mapping, Collection fieldValues, TimeoutChecker timeoutChecker) { - FieldStatsCalculator calculator = new FieldStatsCalculator(); + FieldStatsCalculator calculator = new FieldStatsCalculator(mapping); calculator.accept(fieldValues); timeoutChecker.check("field stats calculation"); return calculator.calculate(NUM_TOP_HITS); diff --git a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/GrokPatternCreator.java b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/GrokPatternCreator.java index 7a5c9a48f8757..cb2bfeb00f268 100644 --- a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/GrokPatternCreator.java +++ b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/GrokPatternCreator.java @@ -195,14 +195,16 @@ public void validateFullLineGrokPattern(String grokPattern, String timestampFiel /** * Build a Grok pattern that will match all of the sample messages in their entirety. * @param seedPatternName A pattern that has already been determined to match some portion of every sample message. - * @param seedFieldName The field name to be used for the portion of every sample message that the seed pattern matches. + * @param seedMapping The mapping for the seed field. + * @param seedFieldName The field name to be used for the portion of every sample message that the seed pattern matches. * @return The built Grok pattern. */ - public String createGrokPatternFromExamples(String seedPatternName, String seedFieldName) { + public String createGrokPatternFromExamples(String seedPatternName, Map seedMapping, String seedFieldName) { overallGrokPatternBuilder.setLength(0); - GrokPatternCandidate seedCandidate = new NoMappingGrokPatternCandidate(seedPatternName, seedFieldName, grokPatternDefinitions); + GrokPatternCandidate seedCandidate = new PrecalculatedMappingGrokPatternCandidate(seedPatternName, seedMapping, seedFieldName, + grokPatternDefinitions); processCandidateAndSplit(seedCandidate, true, sampleMessages, false, 0, false, 0); @@ -433,7 +435,7 @@ String processCaptures(List explanation, Map fieldNameC static class ValueOnlyGrokPatternCandidate implements GrokPatternCandidate { private final String grokPatternName; - private final String mappingType; + private final Map mapping; private final String fieldName; private final Grok grok; @@ -451,7 +453,8 @@ static class ValueOnlyGrokPatternCandidate implements GrokPatternCandidate { * @param fieldName Name of the field to extract from the match. */ ValueOnlyGrokPatternCandidate(String grokPatternName, String mappingType, String fieldName) { - this(grokPatternName, mappingType, fieldName, "\\b", "\\b", Grok.getBuiltinPatterns()); + this(grokPatternName, Collections.singletonMap(FileStructureUtils.MAPPING_TYPE_SETTING, mappingType), fieldName, + "\\b", "\\b", Grok.getBuiltinPatterns()); } /** @@ -462,7 +465,8 @@ static class ValueOnlyGrokPatternCandidate implements GrokPatternCandidate { */ ValueOnlyGrokPatternCandidate(String grokPatternName, String mappingType, String fieldName, Map grokPatternDefinitions) { - this(grokPatternName, mappingType, fieldName, "\\b", "\\b", grokPatternDefinitions); + this(grokPatternName, Collections.singletonMap(FileStructureUtils.MAPPING_TYPE_SETTING, mappingType), fieldName, + "\\b", "\\b", grokPatternDefinitions); } /** @@ -473,25 +477,28 @@ static class ValueOnlyGrokPatternCandidate implements GrokPatternCandidate { * @param postBreak Only consider the match if it's broken from the following text by this. */ ValueOnlyGrokPatternCandidate(String grokPatternName, String mappingType, String fieldName, String preBreak, String postBreak) { - this(grokPatternName, mappingType, fieldName, preBreak, postBreak, Grok.getBuiltinPatterns()); + this(grokPatternName, Collections.singletonMap(FileStructureUtils.MAPPING_TYPE_SETTING, mappingType), fieldName, + preBreak, postBreak, Grok.getBuiltinPatterns()); } /** * @param grokPatternName Name of the Grok pattern to try to match - must match one defined in Logstash. - * @param mappingType Data type for field in Elasticsearch mappings. + * @param mapping Elasticsearch mapping for the field. * @param fieldName Name of the field to extract from the match. * @param preBreak Only consider the match if it's broken from the previous text by this. * @param postBreak Only consider the match if it's broken from the following text by this. * @param grokPatternDefinitions Definitions of Grok patterns to be used. */ - ValueOnlyGrokPatternCandidate(String grokPatternName, String mappingType, String fieldName, String preBreak, String postBreak, - Map grokPatternDefinitions) { - this.grokPatternName = grokPatternName; - this.mappingType = mappingType; - this.fieldName = fieldName; + ValueOnlyGrokPatternCandidate(String grokPatternName, Map mapping, String fieldName, String preBreak, + String postBreak, Map grokPatternDefinitions) { + this.grokPatternName = Objects.requireNonNull(grokPatternName); + this.mapping = Collections.unmodifiableMap(mapping); + this.fieldName = Objects.requireNonNull(fieldName); // The (?m) here has the Ruby meaning, which is equivalent to (?s) in Java - grok = new Grok(grokPatternDefinitions, "(?m)%{DATA:" + PREFACE + "}" + preBreak + - "%{" + grokPatternName + ":" + VALUE + "}" + postBreak + "%{GREEDYDATA:" + EPILOGUE + "}", TimeoutChecker.watchdog); + grok = new Grok(grokPatternDefinitions, + "(?m)%{DATA:" + PREFACE + "}" + Objects.requireNonNull(preBreak) + + "%{" + grokPatternName + ":" + VALUE + "}" + Objects.requireNonNull(postBreak) + "%{GREEDYDATA:" + EPILOGUE + "}", + TimeoutChecker.watchdog); } @Override @@ -520,23 +527,24 @@ public String processCaptures(List explanation, Map fie epilogues.add(captures.getOrDefault(EPILOGUE, "").toString()); } String adjustedFieldName = buildFieldName(fieldNameCountStore, fieldName); - if (mappings != null) { - Map fullMappingType = Collections.singletonMap(FileStructureUtils.MAPPING_TYPE_SETTING, mappingType); - if ("date".equals(mappingType)) { - try { - fullMappingType = FileStructureUtils.findTimestampMapping(explanation, values, timeoutChecker); - } catch (IllegalArgumentException e) { - // This feels like it shouldn't happen, but there may be some obscure edge case - // where it does, and in production it will cause less frustration to just return - // a mapping type of "date" with no format than to fail the whole analysis - assert e == null : e.getMessage(); - } - timeoutChecker.check("mapping determination"); + Map adjustedMapping = mapping; + // If the mapping is type "date" with no format, try to adjust it to include the format + if (FileStructureUtils.DATE_MAPPING_WITHOUT_FORMAT.equals(adjustedMapping)) { + try { + adjustedMapping = FileStructureUtils.findTimestampMapping(explanation, values, timeoutChecker); + } catch (IllegalArgumentException e) { + // This feels like it shouldn't happen, but there may be some obscure edge case + // where it does, and in production it will cause less frustration to just return + // a mapping type of "date" with no format than to fail the whole analysis + assert e == null : e.getMessage(); } - mappings.put(adjustedFieldName, fullMappingType); + timeoutChecker.check("mapping determination"); + } + if (mappings != null) { + mappings.put(adjustedFieldName, adjustedMapping); } if (fieldStats != null) { - fieldStats.put(adjustedFieldName, FileStructureUtils.calculateFieldStats(values, timeoutChecker)); + fieldStats.put(adjustedFieldName, FileStructureUtils.calculateFieldStats(adjustedMapping, values, timeoutChecker)); } return "%{" + grokPatternName + ":" + adjustedFieldName + "}"; } @@ -598,13 +606,13 @@ public String processCaptures(List explanation, Map fie timeoutChecker.check("full message Grok pattern field extraction"); } String adjustedFieldName = buildFieldName(fieldNameCountStore, fieldName); + Map mapping = FileStructureUtils.guessScalarMapping(explanation, adjustedFieldName, values, timeoutChecker); + timeoutChecker.check("mapping determination"); if (mappings != null) { - mappings.put(adjustedFieldName, - FileStructureUtils.guessScalarMapping(explanation, adjustedFieldName, values, timeoutChecker)); - timeoutChecker.check("mapping determination"); + mappings.put(adjustedFieldName, mapping); } if (fieldStats != null) { - fieldStats.put(adjustedFieldName, FileStructureUtils.calculateFieldStats(values, timeoutChecker)); + fieldStats.put(adjustedFieldName, FileStructureUtils.calculateFieldStats(mapping, values, timeoutChecker)); } return "\\b" + fieldName + "=%{USER:" + adjustedFieldName + "}"; } @@ -613,10 +621,11 @@ public String processCaptures(List explanation, Map fie /** * A Grok pattern candidate that matches a single named Grok pattern but will not update mappings. */ - static class NoMappingGrokPatternCandidate extends ValueOnlyGrokPatternCandidate { + static class PrecalculatedMappingGrokPatternCandidate extends ValueOnlyGrokPatternCandidate { - NoMappingGrokPatternCandidate(String grokPatternName, String fieldName, Map grokPatternDefinitions) { - super(grokPatternName, null, fieldName, grokPatternDefinitions); + PrecalculatedMappingGrokPatternCandidate(String grokPatternName, Map mapping, String fieldName, + Map grokPatternDefinitions) { + super(grokPatternName, mapping, fieldName, "\\b", "\\b", grokPatternDefinitions); } @Override @@ -710,16 +719,16 @@ public Tuple processMatch(List explanation, Collection> valuesForField : valuesPerField.entrySet()) { String fieldName = valuesForField.getKey(); - if (mappings != null) { - // Exclude the time field because that will be dropped and replaced with @timestamp - if (fieldName.equals(timeField) == false) { - mappings.put(fieldName, - FileStructureUtils.guessScalarMapping(explanation, fieldName, valuesForField.getValue(), timeoutChecker)); - timeoutChecker.check("mapping determination"); - } + Map mapping = + FileStructureUtils.guessScalarMapping(explanation, fieldName, valuesForField.getValue(), timeoutChecker); + timeoutChecker.check("mapping determination"); + // Exclude the time field because that will be dropped and replaced with @timestamp + if (mappings != null && fieldName.equals(timeField) == false) { + mappings.put(fieldName, mapping); } if (fieldStats != null) { - fieldStats.put(fieldName, FileStructureUtils.calculateFieldStats(valuesForField.getValue(), timeoutChecker)); + fieldStats.put(fieldName, + FileStructureUtils.calculateFieldStats(mapping, valuesForField.getValue(), timeoutChecker)); } } } diff --git a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/NdJsonFileStructureFinder.java b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/NdJsonFileStructureFinder.java index 116de8f7679d2..1b405eb685fa2 100644 --- a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/NdJsonFileStructureFinder.java +++ b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/NdJsonFileStructureFinder.java @@ -70,8 +70,7 @@ static NdJsonFileStructureFinder makeNdJsonFileStructureFinder(List expl SortedMap mappings = mappingsAndFieldStats.v1(); if (timeField != null) { - mappings.put(FileStructureUtils.DEFAULT_TIMESTAMP_FIELD, - Collections.singletonMap(FileStructureUtils.MAPPING_TYPE_SETTING, "date")); + mappings.put(FileStructureUtils.DEFAULT_TIMESTAMP_FIELD, FileStructureUtils.DATE_MAPPING_WITHOUT_FORMAT); } if (mappingsAndFieldStats.v2() != null) { diff --git a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/TextLogFileStructureFinder.java b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/TextLogFileStructureFinder.java index 86b1d79b8b66b..e47d045dd257e 100644 --- a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/TextLogFileStructureFinder.java +++ b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/TextLogFileStructureFinder.java @@ -108,12 +108,13 @@ static TextLogFileStructureFinder makeTextLogFileStructureFinder(List ex .setNumMessagesAnalyzed(sampleMessages.size()) .setMultilineStartPattern(multiLineRegex); + Map messageMapping = Collections.singletonMap(FileStructureUtils.MAPPING_TYPE_SETTING, "text"); SortedMap mappings = new TreeMap<>(); - mappings.put("message", Collections.singletonMap(FileStructureUtils.MAPPING_TYPE_SETTING, "text")); - mappings.put(FileStructureUtils.DEFAULT_TIMESTAMP_FIELD, Collections.singletonMap(FileStructureUtils.MAPPING_TYPE_SETTING, "date")); + mappings.put("message", messageMapping); + mappings.put(FileStructureUtils.DEFAULT_TIMESTAMP_FIELD, FileStructureUtils.DATE_MAPPING_WITHOUT_FORMAT); SortedMap fieldStats = new TreeMap<>(); - fieldStats.put("message", FileStructureUtils.calculateFieldStats(sampleMessages, timeoutChecker)); + fieldStats.put("message", FileStructureUtils.calculateFieldStats(messageMapping, sampleMessages, timeoutChecker)); Map customGrokPatternDefinitions = timestampFormatFinder.getCustomGrokPatternDefinitions(); GrokPatternCreator grokPatternCreator = new GrokPatternCreator(explanation, sampleMessages, mappings, fieldStats, @@ -136,8 +137,8 @@ static TextLogFileStructureFinder makeTextLogFileStructureFinder(List ex if (interimTimestampField == null) { interimTimestampField = "timestamp"; } - grokPattern = - grokPatternCreator.createGrokPatternFromExamples(timestampFormatFinder.getGrokPatternName(), interimTimestampField); + grokPattern = grokPatternCreator.createGrokPatternFromExamples(timestampFormatFinder.getGrokPatternName(), + timestampFormatFinder.getEsDateMappingTypeWithFormat(), interimTimestampField); } } diff --git a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/XmlFileStructureFinder.java b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/XmlFileStructureFinder.java index d2572b7fd2085..91fc61bcbd4b4 100644 --- a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/XmlFileStructureFinder.java +++ b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/XmlFileStructureFinder.java @@ -120,8 +120,7 @@ static XmlFileStructureFinder makeXmlFileStructureFinder(List explanatio SortedMap outerMappings = new TreeMap<>(); outerMappings.put(topLevelTag, secondLevelProperties); if (timeField != null) { - outerMappings.put(FileStructureUtils.DEFAULT_TIMESTAMP_FIELD, - Collections.singletonMap(FileStructureUtils.MAPPING_TYPE_SETTING, "date")); + outerMappings.put(FileStructureUtils.DEFAULT_TIMESTAMP_FIELD, FileStructureUtils.DATE_MAPPING_WITHOUT_FORMAT); } FileStructure structure = structureBuilder diff --git a/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/filestructurefinder/FieldStatsCalculatorTests.java b/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/filestructurefinder/FieldStatsCalculatorTests.java index 30445a4a77c10..4efaf64bd092c 100644 --- a/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/filestructurefinder/FieldStatsCalculatorTests.java +++ b/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/filestructurefinder/FieldStatsCalculatorTests.java @@ -10,14 +10,19 @@ import java.util.Arrays; import java.util.Collections; import java.util.DoubleSummaryStatistics; +import java.util.HashMap; import java.util.List; import java.util.Map; public class FieldStatsCalculatorTests extends FileStructureTestCase { + private static final Map LONG = Collections.singletonMap(FileStructureUtils.MAPPING_TYPE_SETTING, "long"); + private static final Map DOUBLE = Collections.singletonMap(FileStructureUtils.MAPPING_TYPE_SETTING, "double"); + private static final Map KEYWORD = Collections.singletonMap(FileStructureUtils.MAPPING_TYPE_SETTING, "keyword"); + public void testMean() { - FieldStatsCalculator calculator = new FieldStatsCalculator(); + FieldStatsCalculator calculator = new FieldStatsCalculator(DOUBLE); calculator.accept(Arrays.asList("1", "3.5", "2.5", "9")); @@ -26,7 +31,7 @@ public void testMean() { public void testMedianGivenOddCount() { - FieldStatsCalculator calculator = new FieldStatsCalculator(); + FieldStatsCalculator calculator = new FieldStatsCalculator(LONG); calculator.accept(Arrays.asList("3", "23", "-1", "5", "1000")); @@ -35,7 +40,7 @@ public void testMedianGivenOddCount() { public void testMedianGivenOddCountMinimal() { - FieldStatsCalculator calculator = new FieldStatsCalculator(); + FieldStatsCalculator calculator = new FieldStatsCalculator(LONG); calculator.accept(Collections.singletonList("3")); @@ -44,7 +49,7 @@ public void testMedianGivenOddCountMinimal() { public void testMedianGivenEvenCountMiddleValuesDifferent() { - FieldStatsCalculator calculator = new FieldStatsCalculator(); + FieldStatsCalculator calculator = new FieldStatsCalculator(LONG); calculator.accept(Arrays.asList("3", "23", "-1", "5", "1000", "6")); @@ -53,7 +58,7 @@ public void testMedianGivenEvenCountMiddleValuesDifferent() { public void testMedianGivenEvenCountMiddleValuesSame() { - FieldStatsCalculator calculator = new FieldStatsCalculator(); + FieldStatsCalculator calculator = new FieldStatsCalculator(LONG); calculator.accept(Arrays.asList("3", "23", "-1", "5", "1000", "5")); @@ -62,7 +67,7 @@ public void testMedianGivenEvenCountMiddleValuesSame() { public void testMedianGivenEvenCountMinimal() { - FieldStatsCalculator calculator = new FieldStatsCalculator(); + FieldStatsCalculator calculator = new FieldStatsCalculator(LONG); calculator.accept(Arrays.asList("4", "4")); @@ -71,7 +76,7 @@ public void testMedianGivenEvenCountMinimal() { public void testTopHitsNumeric() { - FieldStatsCalculator calculator = new FieldStatsCalculator(); + FieldStatsCalculator calculator = new FieldStatsCalculator(DOUBLE); calculator.accept(Arrays.asList("4", "4", "7", "4", "6", "5.2", "6", "5.2", "16", "4", "5.2")); @@ -88,7 +93,7 @@ public void testTopHitsNumeric() { public void testTopHitsString() { - FieldStatsCalculator calculator = new FieldStatsCalculator(); + FieldStatsCalculator calculator = new FieldStatsCalculator(KEYWORD); calculator.accept(Arrays.asList("s", "s", "d", "s", "f", "x", "f", "x", "n", "s", "x")); @@ -105,7 +110,8 @@ public void testTopHitsString() { public void testCalculateGivenEmpty() { - FieldStatsCalculator calculator = new FieldStatsCalculator(); + FieldStatsCalculator calculator = + new FieldStatsCalculator(randomFrom(Arrays.asList(LONG, DOUBLE, KEYWORD, FileStructureUtils.DATE_MAPPING_WITHOUT_FORMAT))); calculator.accept(Collections.emptyList()); @@ -117,12 +123,15 @@ public void testCalculateGivenEmpty() { assertNull(stats.getMaxValue()); assertNull(stats.getMeanValue()); assertNull(stats.getMedianValue()); + assertNull(stats.getEarliestTimestamp()); + assertNull(stats.getLatestTimestamp()); + assertEquals(0, stats.getTopHits().size()); } public void testCalculateGivenNumericField() { - FieldStatsCalculator calculator = new FieldStatsCalculator(); + FieldStatsCalculator calculator = new FieldStatsCalculator(DOUBLE); calculator.accept(Arrays.asList("4.5", "4.5", "7", "4.5", "6", "5", "6", "5", "25", "4.5", "5")); @@ -134,6 +143,8 @@ public void testCalculateGivenNumericField() { assertEquals(25.0, stats.getMaxValue(), 1e-10); assertEquals(7.0, stats.getMeanValue(), 1e-10); assertEquals(5.0, stats.getMedianValue(), 1e-10); + assertNull(stats.getEarliestTimestamp()); + assertNull(stats.getLatestTimestamp()); List> topHits = stats.getTopHits(); @@ -148,7 +159,7 @@ public void testCalculateGivenNumericField() { public void testCalculateGivenStringField() { - FieldStatsCalculator calculator = new FieldStatsCalculator(); + FieldStatsCalculator calculator = new FieldStatsCalculator(KEYWORD); calculator.accept(Arrays.asList("s", "s", "d", "s", "f", "x", "f", "x", "n", "s", "x")); @@ -160,6 +171,8 @@ public void testCalculateGivenStringField() { assertNull(stats.getMaxValue()); assertNull(stats.getMeanValue()); assertNull(stats.getMedianValue()); + assertNull(stats.getEarliestTimestamp()); + assertNull(stats.getLatestTimestamp()); List> topHits = stats.getTopHits(); @@ -174,7 +187,7 @@ public void testCalculateGivenStringField() { public void testCalculateGivenMixedField() { - FieldStatsCalculator calculator = new FieldStatsCalculator(); + FieldStatsCalculator calculator = new FieldStatsCalculator(KEYWORD); calculator.accept(Arrays.asList("4", "4", "d", "4", "f", "x", "f", "x", "16", "4", "x")); @@ -186,6 +199,8 @@ public void testCalculateGivenMixedField() { assertNull(stats.getMaxValue()); assertNull(stats.getMeanValue()); assertNull(stats.getMedianValue()); + assertNull(stats.getEarliestTimestamp()); + assertNull(stats.getLatestTimestamp()); List> topHits = stats.getTopHits(); @@ -198,10 +213,71 @@ public void testCalculateGivenMixedField() { assertEquals(2, topHits.get(2).get("count")); } + public void testGivenDateFieldWithoutFormat() { + + FieldStatsCalculator calculator = new FieldStatsCalculator(FileStructureUtils.DATE_MAPPING_WITHOUT_FORMAT); + + calculator.accept(Arrays.asList("2018-10-08T10:49:16.642", "2018-10-08T10:49:16.642", "2018-10-08T10:49:16.642", + "2018-09-08T11:12:13.789", "2019-01-28T01:02:03.456", "2018-09-08T11:12:13.789")); + + FieldStats stats = calculator.calculate(3); + + assertEquals(6L, stats.getCount()); + assertEquals(3, stats.getCardinality()); + assertNull(stats.getMinValue()); + assertNull(stats.getMaxValue()); + assertNull(stats.getMeanValue()); + assertNull(stats.getMedianValue()); + assertEquals("2018-09-08T11:12:13.789", stats.getEarliestTimestamp()); + assertEquals("2019-01-28T01:02:03.456", stats.getLatestTimestamp()); + + List> topHits = stats.getTopHits(); + + assertEquals(3, topHits.size()); + assertEquals("2018-10-08T10:49:16.642", topHits.get(0).get("value")); + assertEquals(3, topHits.get(0).get("count")); + assertEquals("2018-09-08T11:12:13.789", topHits.get(1).get("value")); + assertEquals(2, topHits.get(1).get("count")); + assertEquals("2019-01-28T01:02:03.456", topHits.get(2).get("value")); + assertEquals(1, topHits.get(2).get("count")); + } + + public void testGivenDateFieldWithFormat() { + + Map dateMapping = new HashMap<>(); + dateMapping.put(FileStructureUtils.MAPPING_TYPE_SETTING, "date"); + dateMapping.put(FileStructureUtils.MAPPING_FORMAT_SETTING, "M/dd/yyyy h:mma"); + FieldStatsCalculator calculator = new FieldStatsCalculator(dateMapping); + + calculator.accept(Arrays.asList("10/08/2018 10:49AM", "10/08/2018 10:49AM", "10/08/2018 10:49AM", + "9/08/2018 11:12AM", "1/28/2019 1:02AM", "9/08/2018 11:12AM")); + + FieldStats stats = calculator.calculate(3); + + assertEquals(6L, stats.getCount()); + assertEquals(3, stats.getCardinality()); + assertNull(stats.getMinValue()); + assertNull(stats.getMaxValue()); + assertNull(stats.getMeanValue()); + assertNull(stats.getMedianValue()); + assertEquals("9/08/2018 11:12AM", stats.getEarliestTimestamp()); + assertEquals("1/28/2019 1:02AM", stats.getLatestTimestamp()); + + List> topHits = stats.getTopHits(); + + assertEquals(3, topHits.size()); + assertEquals("10/08/2018 10:49AM", topHits.get(0).get("value")); + assertEquals(3, topHits.get(0).get("count")); + assertEquals("9/08/2018 11:12AM", topHits.get(1).get("value")); + assertEquals(2, topHits.get(1).get("count")); + assertEquals("1/28/2019 1:02AM", topHits.get(2).get("value")); + assertEquals(1, topHits.get(2).get("count")); + } + public void testJavaStatsEquivalence() { DoubleSummaryStatistics summaryStatistics = new DoubleSummaryStatistics(); - FieldStatsCalculator calculator = new FieldStatsCalculator(); + FieldStatsCalculator calculator = new FieldStatsCalculator(DOUBLE); for (int numValues = randomIntBetween(1000, 10000); numValues > 0; --numValues) { diff --git a/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/filestructurefinder/FileStructureUtilsTests.java b/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/filestructurefinder/FileStructureUtilsTests.java index c0adccd0eb477..91568573c9d71 100644 --- a/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/filestructurefinder/FileStructureUtilsTests.java +++ b/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/filestructurefinder/FileStructureUtilsTests.java @@ -323,8 +323,7 @@ public void testGuessMappingsAndCalculateFieldStats() { sample2.put("nothing", null); Tuple, SortedMap> mappingsAndFieldStats = - FileStructureUtils.guessMappingsAndCalculateFieldStats(explanation, Arrays.asList(sample1, sample2), - NOOP_TIMEOUT_CHECKER); + FileStructureUtils.guessMappingsAndCalculateFieldStats(explanation, Arrays.asList(sample1, sample2), NOOP_TIMEOUT_CHECKER); assertNotNull(mappingsAndFieldStats); Map mappings = mappingsAndFieldStats.v1(); @@ -341,7 +340,8 @@ public void testGuessMappingsAndCalculateFieldStats() { assertNotNull(fieldStats); assertEquals(3, fieldStats.size()); assertEquals(new FieldStats(2, 2, makeTopHits("not a time", 1, "whatever", 1)), fieldStats.get("foo")); - assertEquals(new FieldStats(2, 2, makeTopHits("2018-05-24 17:28:31,735", 1, "2018-05-29 11:53:02,837", 1)), fieldStats.get("time")); + assertEquals(new FieldStats(2, 2, "2018-05-24 17:28:31,735", "2018-05-29 11:53:02,837", + makeTopHits("2018-05-24 17:28:31,735", 1, "2018-05-29 11:53:02,837", 1)), fieldStats.get("time")); assertEquals(new FieldStats(2, 2, 17.0, 42.0, 29.5, 29.5, makeTopHits(17, 1, 42, 1)), fieldStats.get("bar")); assertNull(fieldStats.get("nothing")); } diff --git a/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/filestructurefinder/GrokPatternCreatorTests.java b/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/filestructurefinder/GrokPatternCreatorTests.java index 7e6363602dcdd..967c6d42921e4 100644 --- a/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/filestructurefinder/GrokPatternCreatorTests.java +++ b/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/filestructurefinder/GrokPatternCreatorTests.java @@ -203,7 +203,8 @@ public void testCreateGrokPatternFromExamplesGivenNamedLogs() { assertEquals("%{SYSLOGTIMESTAMP:timestamp} .*? .*?\\[%{INT:field}\\]: %{LOGLEVEL:loglevel} \\(.*? .*? .*?\\) .*? " + "%{QUOTEDSTRING:field2}: %{IP:ipaddress}#%{INT:field3}", - grokPatternCreator.createGrokPatternFromExamples("SYSLOGTIMESTAMP", "timestamp")); + grokPatternCreator.createGrokPatternFromExamples("SYSLOGTIMESTAMP", FileStructureUtils.DATE_MAPPING_WITHOUT_FORMAT, + "timestamp")); assertEquals(5, mappings.size()); assertEquals(Collections.singletonMap(FileStructureUtils.MAPPING_TYPE_SETTING, "long"), mappings.get("field")); assertEquals(Collections.singletonMap(FileStructureUtils.MAPPING_TYPE_SETTING, "keyword"), mappings.get("loglevel")); @@ -229,7 +230,8 @@ public void testCreateGrokPatternFromExamplesGivenCatalinaLogs() { NOOP_TIMEOUT_CHECKER); assertEquals("%{CATALINA_DATESTAMP:timestamp} .*? .*?\\n%{LOGLEVEL:loglevel}: .*", - grokPatternCreator.createGrokPatternFromExamples("CATALINA_DATESTAMP", "timestamp")); + grokPatternCreator.createGrokPatternFromExamples("CATALINA_DATESTAMP", FileStructureUtils.DATE_MAPPING_WITHOUT_FORMAT, + "timestamp")); assertEquals(1, mappings.size()); assertEquals(Collections.singletonMap(FileStructureUtils.MAPPING_TYPE_SETTING, "keyword"), mappings.get("loglevel")); } @@ -253,7 +255,8 @@ public void testCreateGrokPatternFromExamplesGivenMultiTimestampLogs() { assertEquals("%{INT:field}\\t%{TIMESTAMP_ISO8601:timestamp}\\t%{TIMESTAMP_ISO8601:extra_timestamp}\\t%{INT:field2}\\t.*?\\t" + "%{IP:ipaddress}\\t.*?\\t%{LOGLEVEL:loglevel}\\t.*", - grokPatternCreator.createGrokPatternFromExamples("TIMESTAMP_ISO8601", "timestamp")); + grokPatternCreator.createGrokPatternFromExamples("TIMESTAMP_ISO8601", FileStructureUtils.DATE_MAPPING_WITHOUT_FORMAT, + "timestamp")); assertEquals(5, mappings.size()); assertEquals(Collections.singletonMap(FileStructureUtils.MAPPING_TYPE_SETTING, "long"), mappings.get("field")); Map expectedDateMapping = new HashMap<>(); @@ -284,7 +287,8 @@ public void testCreateGrokPatternFromExamplesGivenMultiTimestampLogsAndIndetermi assertEquals("%{INT:field}\\t%{TIMESTAMP_ISO8601:timestamp}\\t%{DATESTAMP:extra_timestamp}\\t%{INT:field2}\\t.*?\\t" + "%{IP:ipaddress}\\t.*?\\t%{LOGLEVEL:loglevel}\\t.*", - grokPatternCreator.createGrokPatternFromExamples("TIMESTAMP_ISO8601", "timestamp")); + grokPatternCreator.createGrokPatternFromExamples("TIMESTAMP_ISO8601", FileStructureUtils.DATE_MAPPING_WITHOUT_FORMAT, + "timestamp")); assertEquals(5, mappings.size()); assertEquals(Collections.singletonMap(FileStructureUtils.MAPPING_TYPE_SETTING, "long"), mappings.get("field")); Map expectedDateMapping = new HashMap<>(); @@ -314,9 +318,12 @@ public void testCreateGrokPatternFromExamplesGivenMultiTimestampLogsAndCustomDef Collections.singletonMap("CUSTOM_TIMESTAMP", "%{MONTHNUM}/%{MONTHDAY}/%{YEAR} %{HOUR}:%{MINUTE}(?:AM|PM)"), NOOP_TIMEOUT_CHECKER); + Map customMapping = new HashMap<>(); + customMapping.put(FileStructureUtils.MAPPING_TYPE_SETTING, "date"); + customMapping.put(FileStructureUtils.MAPPING_FORMAT_SETTING, "M/dd/yyyy h:mma"); assertEquals("%{INT:field}\\t%{CUSTOM_TIMESTAMP:timestamp}\\t%{TIMESTAMP_ISO8601:extra_timestamp}\\t%{INT:field2}\\t.*?\\t" + "%{IP:ipaddress}\\t.*?\\t%{LOGLEVEL:loglevel}\\t.*", - grokPatternCreator.createGrokPatternFromExamples("CUSTOM_TIMESTAMP", "timestamp")); + grokPatternCreator.createGrokPatternFromExamples("CUSTOM_TIMESTAMP", customMapping, "timestamp")); assertEquals(5, mappings.size()); assertEquals(Collections.singletonMap(FileStructureUtils.MAPPING_TYPE_SETTING, "long"), mappings.get("field")); Map expectedDateMapping = new HashMap<>(); @@ -347,7 +354,8 @@ public void testCreateGrokPatternFromExamplesGivenTimestampAndTimeWithoutDate() assertEquals("%{INT:field}\\t%{TIMESTAMP_ISO8601:timestamp}\\t%{TIME:time}\\t%{INT:field2}\\t.*?\\t" + "%{IP:ipaddress}\\t.*?\\t%{LOGLEVEL:loglevel}\\t.*", - grokPatternCreator.createGrokPatternFromExamples("TIMESTAMP_ISO8601", "timestamp")); + grokPatternCreator.createGrokPatternFromExamples("TIMESTAMP_ISO8601", FileStructureUtils.DATE_MAPPING_WITHOUT_FORMAT, + "timestamp")); assertEquals(5, mappings.size()); assertEquals(Collections.singletonMap(FileStructureUtils.MAPPING_TYPE_SETTING, "long"), mappings.get("field")); assertEquals(Collections.singletonMap(FileStructureUtils.MAPPING_TYPE_SETTING, "keyword"), mappings.get("time")); From 1f4b2d5545afc14164cf4b817e27320696f90ffd Mon Sep 17 00:00:00 2001 From: David Roberts Date: Wed, 5 Jun 2019 13:54:41 +0100 Subject: [PATCH 2/3] Fixing YAML test --- .../resources/rest-api-spec/test/ml/find_file_structure.yml | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/x-pack/plugin/src/test/resources/rest-api-spec/test/ml/find_file_structure.yml b/x-pack/plugin/src/test/resources/rest-api-spec/test/ml/find_file_structure.yml index a9634605aaac2..cf175961b1ef9 100644 --- a/x-pack/plugin/src/test/resources/rest-api-spec/test/ml/find_file_structure.yml +++ b/x-pack/plugin/src/test/resources/rest-api-spec/test/ml/find_file_structure.yml @@ -56,7 +56,8 @@ setup: - match: { field_stats.sourcetype.cardinality: 1 } - match: { field_stats.time.count: 3 } - match: { field_stats.time.cardinality: 3 } - - match: { field_stats.time.cardinality: 3 } + - match: { field_stats.time.earliest: "1403481600" } + - match: { field_stats.time.latest: "1403481800" } - is_false: explanation --- @@ -116,5 +117,6 @@ setup: - match: { field_stats.sourcetype.cardinality: 1 } - match: { field_stats.time.count: 3 } - match: { field_stats.time.cardinality: 3 } - - match: { field_stats.time.cardinality: 3 } + - match: { field_stats.time.earliest: "1403481600" } + - match: { field_stats.time.latest: "1403481800" } - match: { explanation.0: "Using specified character encoding [UTF-8]" } From fc0264b42eb26c1d222998893f4b50fc12407d0f Mon Sep 17 00:00:00 2001 From: David Roberts Date: Wed, 5 Jun 2019 15:40:12 +0100 Subject: [PATCH 3/3] Address review feedback --- .../xpack/ml/filestructurefinder/FieldStatsCalculator.java | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/FieldStatsCalculator.java b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/FieldStatsCalculator.java index e970b49f538d3..39bf613165e2e 100644 --- a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/FieldStatsCalculator.java +++ b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/FieldStatsCalculator.java @@ -53,7 +53,6 @@ public class FieldStatsCalculator { private String earliestTimeString; private String latestTimeString; - @SuppressWarnings("fallthrough") public FieldStatsCalculator(Map mapping) { switch (mapping.get(FileStructureUtils.MAPPING_TYPE_SETTING)) { @@ -70,7 +69,9 @@ public FieldStatsCalculator(Map mapping) { case "date_nanos": String format = mapping.get(FileStructureUtils.MAPPING_FORMAT_SETTING); dateFormatter = (format == null) ? DateFieldMapper.DEFAULT_DATE_TIME_FORMATTER : DateFormatter.forPattern(format); - // $FALL-THROUGH$ + // Dates are treated like strings for top hits + countsByStringValue = new TreeMap<>(); + break; default: countsByStringValue = new TreeMap<>(); break;