diff --git a/client/rest-high-level/src/main/java/org/elasticsearch/client/ml/filestructurefinder/FieldStats.java b/client/rest-high-level/src/main/java/org/elasticsearch/client/ml/filestructurefinder/FieldStats.java index 4391d03f6d940..adb8e68393e82 100644 --- a/client/rest-high-level/src/main/java/org/elasticsearch/client/ml/filestructurefinder/FieldStats.java +++ b/client/rest-high-level/src/main/java/org/elasticsearch/client/ml/filestructurefinder/FieldStats.java @@ -38,12 +38,14 @@ public class FieldStats implements ToXContentObject { public static final ParseField MAX_VALUE = new ParseField("max_value"); public static final ParseField MEAN_VALUE = new ParseField("mean_value"); public static final ParseField MEDIAN_VALUE = new ParseField("median_value"); + public static final ParseField EARLIEST = new ParseField("earliest"); + public static final ParseField LATEST = new ParseField("latest"); public static final ParseField TOP_HITS = new ParseField("top_hits"); @SuppressWarnings("unchecked") public static final ConstructingObjectParser PARSER = new ConstructingObjectParser<>("field_stats", true, a -> new FieldStats((long) a[0], (int) a[1], (Double) a[2], (Double) a[3], (Double) a[4], (Double) a[5], - (List>) a[6])); + (String) a[6], (String) a[7], (List>) a[8])); static { PARSER.declareLong(ConstructingObjectParser.constructorArg(), COUNT); @@ -52,6 +54,8 @@ public class FieldStats implements ToXContentObject { PARSER.declareDouble(ConstructingObjectParser.optionalConstructorArg(), MAX_VALUE); PARSER.declareDouble(ConstructingObjectParser.optionalConstructorArg(), MEAN_VALUE); PARSER.declareDouble(ConstructingObjectParser.optionalConstructorArg(), MEDIAN_VALUE); + PARSER.declareString(ConstructingObjectParser.optionalConstructorArg(), EARLIEST); + PARSER.declareString(ConstructingObjectParser.optionalConstructorArg(), LATEST); PARSER.declareObjectArray(ConstructingObjectParser.optionalConstructorArg(), (p, c) -> p.mapOrdered(), TOP_HITS); } @@ -61,16 +65,20 @@ public class FieldStats implements ToXContentObject { private final Double maxValue; private final Double meanValue; private final Double medianValue; + private final String earliestTimestamp; + private final String latestTimestamp; private final List> topHits; FieldStats(long count, int cardinality, Double minValue, Double maxValue, Double meanValue, Double medianValue, - List> topHits) { + String earliestTimestamp, String latestTimestamp, List> topHits) { this.count = count; this.cardinality = cardinality; this.minValue = minValue; this.maxValue = maxValue; this.meanValue = meanValue; this.medianValue = medianValue; + this.earliestTimestamp = earliestTimestamp; + this.latestTimestamp = latestTimestamp; this.topHits = (topHits == null) ? Collections.emptyList() : Collections.unmodifiableList(topHits); } @@ -98,6 +106,14 @@ public Double getMedianValue() { return medianValue; } + public String getEarliestTimestamp() { + return earliestTimestamp; + } + + public String getLatestTimestamp() { + return latestTimestamp; + } + public List> getTopHits() { return topHits; } @@ -120,6 +136,12 @@ public XContentBuilder toXContent(XContentBuilder builder, ToXContent.Params par if (medianValue != null) { builder.field(MEDIAN_VALUE.getPreferredName(), toIntegerIfInteger(medianValue)); } + if (earliestTimestamp != null) { + builder.field(EARLIEST.getPreferredName(), earliestTimestamp); + } + if (latestTimestamp != null) { + builder.field(LATEST.getPreferredName(), latestTimestamp); + } if (topHits.isEmpty() == false) { builder.field(TOP_HITS.getPreferredName(), topHits); } @@ -140,7 +162,7 @@ static Number toIntegerIfInteger(double d) { @Override public int hashCode() { - return Objects.hash(count, cardinality, minValue, maxValue, meanValue, medianValue, topHits); + return Objects.hash(count, cardinality, minValue, maxValue, meanValue, medianValue, earliestTimestamp, latestTimestamp, topHits); } @Override @@ -161,6 +183,8 @@ public boolean equals(Object other) { Objects.equals(this.maxValue, that.maxValue) && Objects.equals(this.meanValue, that.meanValue) && Objects.equals(this.medianValue, that.medianValue) && + Objects.equals(this.earliestTimestamp, that.earliestTimestamp) && + Objects.equals(this.latestTimestamp, that.latestTimestamp) && Objects.equals(this.topHits, that.topHits); } } diff --git a/client/rest-high-level/src/test/java/org/elasticsearch/client/ml/filestructurefinder/FieldStatsTests.java b/client/rest-high-level/src/test/java/org/elasticsearch/client/ml/filestructurefinder/FieldStatsTests.java index daf6c4af90ddc..0d6e5e70290d9 100644 --- a/client/rest-high-level/src/test/java/org/elasticsearch/client/ml/filestructurefinder/FieldStatsTests.java +++ b/client/rest-high-level/src/test/java/org/elasticsearch/client/ml/filestructurefinder/FieldStatsTests.java @@ -43,6 +43,8 @@ static FieldStats createTestFieldStats() { Double maxValue = null; Double meanValue = null; Double medianValue = null; + String earliestTimestamp = null; + String latestTimestamp = null; boolean isMetric = randomBoolean(); if (isMetric) { if (randomBoolean()) { @@ -54,6 +56,12 @@ static FieldStats createTestFieldStats() { } meanValue = randomDouble(); medianValue = randomDouble(); + } else { + boolean isDate = randomBoolean(); + if (isDate) { + earliestTimestamp = randomAlphaOfLength(20); + latestTimestamp = randomAlphaOfLength(20); + } } List> topHits = new ArrayList<>(); @@ -68,7 +76,7 @@ static FieldStats createTestFieldStats() { topHits.add(topHit); } - return new FieldStats(count, cardinality, minValue, maxValue, meanValue, medianValue, topHits); + return new FieldStats(count, cardinality, minValue, maxValue, meanValue, medianValue, earliestTimestamp, latestTimestamp, topHits); } @Override diff --git a/docs/reference/ml/apis/find-file-structure.asciidoc b/docs/reference/ml/apis/find-file-structure.asciidoc index b0835ff9466bb..f3029635bf44e 100644 --- a/docs/reference/ml/apis/find-file-structure.asciidoc +++ b/docs/reference/ml/apis/find-file-structure.asciidoc @@ -445,6 +445,8 @@ If the request does not encounter errors, you receive the following result: "release_date" : { "count" : 24, "cardinality" : 20, + "earliest" : "1932-06-01", + "latest" : "2011-06-02", "top_hits" : [ { "value" : "1985-06-01", @@ -1152,6 +1154,8 @@ If the request does not encounter errors, you receive the following result: "tpep_dropoff_datetime" : { "count" : 19998, "cardinality" : 9066, + "earliest" : "2018-05-31 06:18:15", + "latest" : "2018-06-02 02:25:44", "top_hits" : [ { "value" : "2018-06-01 01:12:12", @@ -1198,6 +1202,8 @@ If the request does not encounter errors, you receive the following result: "tpep_pickup_datetime" : { "count" : 19998, "cardinality" : 8760, + "earliest" : "2018-05-31 06:08:31", + "latest" : "2018-06-02 01:21:21", "top_hits" : [ { "value" : "2018-06-01 00:01:23", @@ -1457,6 +1463,8 @@ this: "timestamp" : { "count" : 53, "cardinality" : 28, + "earliest" : "2018-09-27T14:39:28,518", + "latest" : "2018-09-27T14:39:37,012", "top_hits" : [ { "value" : "2018-09-27T14:39:29,859", @@ -1719,6 +1727,8 @@ this: "timestamp" : { "count" : 53, "cardinality" : 28, + "earliest" : "2018-09-27T14:39:28,518", + "latest" : "2018-09-27T14:39:37,012", "top_hits" : [ { "value" : "2018-09-27T14:39:29,859", diff --git a/x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ml/filestructurefinder/FieldStats.java b/x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ml/filestructurefinder/FieldStats.java index b371ca739bb6b..95abf16aa1444 100644 --- a/x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ml/filestructurefinder/FieldStats.java +++ b/x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ml/filestructurefinder/FieldStats.java @@ -5,7 +5,9 @@ */ package org.elasticsearch.xpack.core.ml.filestructurefinder; +import org.elasticsearch.Version; import org.elasticsearch.common.ParseField; +import org.elasticsearch.common.Strings; import org.elasticsearch.common.io.stream.StreamInput; import org.elasticsearch.common.io.stream.StreamOutput; import org.elasticsearch.common.io.stream.Writeable; @@ -27,12 +29,14 @@ public class FieldStats implements ToXContentObject, Writeable { static final ParseField MAX_VALUE = new ParseField("max_value"); static final ParseField MEAN_VALUE = new ParseField("mean_value"); static final ParseField MEDIAN_VALUE = new ParseField("median_value"); + static final ParseField EARLIEST = new ParseField("earliest"); + static final ParseField LATEST = new ParseField("latest"); static final ParseField TOP_HITS = new ParseField("top_hits"); @SuppressWarnings("unchecked") public static final ConstructingObjectParser PARSER = new ConstructingObjectParser<>("field_stats", false, a -> new FieldStats((long) a[0], (int) a[1], (Double) a[2], (Double) a[3], (Double) a[4], (Double) a[5], - (List>) a[6])); + (String) a[6], (String) a[7], (List>) a[8])); static { PARSER.declareLong(ConstructingObjectParser.constructorArg(), COUNT); @@ -41,6 +45,8 @@ public class FieldStats implements ToXContentObject, Writeable { PARSER.declareDouble(ConstructingObjectParser.optionalConstructorArg(), MAX_VALUE); PARSER.declareDouble(ConstructingObjectParser.optionalConstructorArg(), MEAN_VALUE); PARSER.declareDouble(ConstructingObjectParser.optionalConstructorArg(), MEDIAN_VALUE); + PARSER.declareString(ConstructingObjectParser.optionalConstructorArg(), EARLIEST); + PARSER.declareString(ConstructingObjectParser.optionalConstructorArg(), LATEST); PARSER.declareObjectArray(ConstructingObjectParser.optionalConstructorArg(), (p, c) -> p.mapOrdered(), TOP_HITS); } @@ -50,20 +56,33 @@ public class FieldStats implements ToXContentObject, Writeable { private final Double maxValue; private final Double meanValue; private final Double medianValue; + private final String earliestTimestamp; + private final String latestTimestamp; private final List> topHits; public FieldStats(long count, int cardinality, List> topHits) { - this(count, cardinality, null, null, null, null, topHits); + this(count, cardinality, null, null, null, null, null, null, topHits); + } + + public FieldStats(long count, int cardinality, String earliestTimestamp, String latestTimestamp, List> topHits) { + this(count, cardinality, null, null, null, null, earliestTimestamp, latestTimestamp, topHits); } public FieldStats(long count, int cardinality, Double minValue, Double maxValue, Double meanValue, Double medianValue, List> topHits) { + this(count, cardinality, minValue, maxValue, meanValue, medianValue, null, null, topHits); + } + + FieldStats(long count, int cardinality, Double minValue, Double maxValue, Double meanValue, Double medianValue, + String earliestTimestamp, String latestTimestamp, List> topHits) { this.count = count; this.cardinality = cardinality; this.minValue = minValue; this.maxValue = maxValue; this.meanValue = meanValue; this.medianValue = medianValue; + this.earliestTimestamp = earliestTimestamp; + this.latestTimestamp = latestTimestamp; this.topHits = (topHits == null) ? Collections.emptyList() : Collections.unmodifiableList(topHits); } @@ -74,6 +93,13 @@ public FieldStats(StreamInput in) throws IOException { maxValue = in.readOptionalDouble(); meanValue = in.readOptionalDouble(); medianValue = in.readOptionalDouble(); + if (in.getVersion().onOrAfter(Version.V_7_3_0)) { + earliestTimestamp = in.readOptionalString(); + latestTimestamp = in.readOptionalString(); + } else { + earliestTimestamp = null; + latestTimestamp = null; + } topHits = in.readList(StreamInput::readMap); } @@ -85,6 +111,10 @@ public void writeTo(StreamOutput out) throws IOException { out.writeOptionalDouble(maxValue); out.writeOptionalDouble(meanValue); out.writeOptionalDouble(medianValue); + if (out.getVersion().onOrAfter(Version.V_7_3_0)) { + out.writeOptionalString(earliestTimestamp); + out.writeOptionalString(latestTimestamp); + } out.writeCollection(topHits, StreamOutput::writeMap); } @@ -112,6 +142,14 @@ public Double getMedianValue() { return medianValue; } + public String getEarliestTimestamp() { + return earliestTimestamp; + } + + public String getLatestTimestamp() { + return latestTimestamp; + } + public List> getTopHits() { return topHits; } @@ -134,6 +172,12 @@ public XContentBuilder toXContent(XContentBuilder builder, Params params) throws if (medianValue != null) { builder.field(MEDIAN_VALUE.getPreferredName(), toIntegerIfInteger(medianValue)); } + if (earliestTimestamp != null) { + builder.field(EARLIEST.getPreferredName(), earliestTimestamp); + } + if (latestTimestamp != null) { + builder.field(LATEST.getPreferredName(), latestTimestamp); + } if (topHits.isEmpty() == false) { builder.field(TOP_HITS.getPreferredName(), topHits); } @@ -154,7 +198,7 @@ public static Number toIntegerIfInteger(double d) { @Override public int hashCode() { - return Objects.hash(count, cardinality, minValue, maxValue, meanValue, medianValue, topHits); + return Objects.hash(count, cardinality, minValue, maxValue, meanValue, medianValue, earliestTimestamp, latestTimestamp, topHits); } @Override @@ -175,6 +219,13 @@ public boolean equals(Object other) { Objects.equals(this.maxValue, that.maxValue) && Objects.equals(this.meanValue, that.meanValue) && Objects.equals(this.medianValue, that.medianValue) && + Objects.equals(this.earliestTimestamp, that.earliestTimestamp) && + Objects.equals(this.latestTimestamp, that.latestTimestamp) && Objects.equals(this.topHits, that.topHits); } + + @Override + public String toString() { + return Strings.toString(this); + } } diff --git a/x-pack/plugin/core/src/test/java/org/elasticsearch/xpack/core/ml/filestructurefinder/FieldStatsTests.java b/x-pack/plugin/core/src/test/java/org/elasticsearch/xpack/core/ml/filestructurefinder/FieldStatsTests.java index ec46d25edd409..889eae19387fc 100644 --- a/x-pack/plugin/core/src/test/java/org/elasticsearch/xpack/core/ml/filestructurefinder/FieldStatsTests.java +++ b/x-pack/plugin/core/src/test/java/org/elasticsearch/xpack/core/ml/filestructurefinder/FieldStatsTests.java @@ -30,6 +30,8 @@ static FieldStats createTestFieldStats() { Double maxValue = null; Double meanValue = null; Double medianValue = null; + String earliestTimestamp = null; + String latestTimestamp = null; boolean isMetric = randomBoolean(); if (isMetric) { if (randomBoolean()) { @@ -41,6 +43,12 @@ static FieldStats createTestFieldStats() { } meanValue = randomDouble(); medianValue = randomDouble(); + } else { + boolean isDate = randomBoolean(); + if (isDate) { + earliestTimestamp = randomAlphaOfLength(20); + latestTimestamp = randomAlphaOfLength(20); + } } List> topHits = new ArrayList<>(); @@ -55,7 +63,7 @@ static FieldStats createTestFieldStats() { topHits.add(topHit); } - return new FieldStats(count, cardinality, minValue, maxValue, meanValue, medianValue, topHits); + return new FieldStats(count, cardinality, minValue, maxValue, meanValue, medianValue, earliestTimestamp, latestTimestamp, topHits); } @Override diff --git a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/DelimitedFileStructureFinder.java b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/DelimitedFileStructureFinder.java index aa88905962638..91167fb12cfc9 100644 --- a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/DelimitedFileStructureFinder.java +++ b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/DelimitedFileStructureFinder.java @@ -159,8 +159,7 @@ static DelimitedFileStructureFinder makeDelimitedFileStructureFinder(List mappings = mappingsAndFieldStats.v1(); if (timeField != null) { - mappings.put(FileStructureUtils.DEFAULT_TIMESTAMP_FIELD, - Collections.singletonMap(FileStructureUtils.MAPPING_TYPE_SETTING, "date")); + mappings.put(FileStructureUtils.DEFAULT_TIMESTAMP_FIELD, FileStructureUtils.DATE_MAPPING_WITHOUT_FORMAT); } if (mappingsAndFieldStats.v2() != null) { diff --git a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/FieldStatsCalculator.java b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/FieldStatsCalculator.java index 40dff9116d7ab..39bf613165e2e 100644 --- a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/FieldStatsCalculator.java +++ b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/FieldStatsCalculator.java @@ -5,10 +5,15 @@ */ package org.elasticsearch.xpack.ml.filestructurefinder; +import org.elasticsearch.common.time.DateFormatter; +import org.elasticsearch.common.time.DateFormatters; +import org.elasticsearch.index.mapper.DateFieldMapper; import org.elasticsearch.xpack.core.ml.filestructurefinder.FieldStats; +import java.time.Instant; import java.util.ArrayList; import java.util.Collection; +import java.util.Collections; import java.util.Comparator; import java.util.LinkedHashMap; import java.util.List; @@ -26,8 +31,52 @@ public class FieldStatsCalculator { private long count; - private SortedMap countsByStringValue = new TreeMap<>(); - private SortedMap countsByNumericValue = new TreeMap<>(); + private SortedMap countsByStringValue; + private SortedMap countsByNumericValue; + private DateFormatter dateFormatter; + /** + * Parsed earliest and latest times. Some date formats may cause these to be + * wrong due to lack of information. For example, if the date format does not + * contain a year then these will be in 1970, and if there's no timezone in + * the format then these will be on the assumption the time was in UTC. However, + * since all the timestamps will be inaccurate in the same way the determination + * of the earliest and latest will still be correct. The trick then is to never + * print them out... + */ + private Instant earliestTimestamp; + private Instant latestTimestamp; + /** + * Earliest and latest times in the exact form they were present in the input, + * making the output immune to issues like not knowing the correct timezone + * or year when parsing. + */ + private String earliestTimeString; + private String latestTimeString; + + public FieldStatsCalculator(Map mapping) { + + switch (mapping.get(FileStructureUtils.MAPPING_TYPE_SETTING)) { + case "byte": + case "short": + case "integer": + case "long": + case "half_float": + case "float": + case "double": + countsByNumericValue = new TreeMap<>(); + break; + case "date": + case "date_nanos": + String format = mapping.get(FileStructureUtils.MAPPING_FORMAT_SETTING); + dateFormatter = (format == null) ? DateFieldMapper.DEFAULT_DATE_TIME_FORMATTER : DateFormatter.forPattern(format); + // Dates are treated like strings for top hits + countsByStringValue = new TreeMap<>(); + break; + default: + countsByStringValue = new TreeMap<>(); + break; + } + } /** * Add a collection of values to the calculator. @@ -41,14 +90,27 @@ public void accept(Collection fieldValues) { for (String fieldValue : fieldValues) { - countsByStringValue.compute(fieldValue, (k, v) -> (v == null) ? 1 : (1 + v)); - if (countsByNumericValue != null) { - try { countsByNumericValue.compute(Double.valueOf(fieldValue), (k, v) -> (v == null) ? 1 : (1 + v)); } catch (NumberFormatException e) { - countsByNumericValue = null; + // This should not happen in the usual context this class is used in within the file structure finder, + // as "double" should be big enough to hold any value that the file structure finder considers numeric + throw new IllegalArgumentException("Field with numeric mapping [" + fieldValue + "] could not be parsed as type double", + e); + } + } else { + countsByStringValue.compute(fieldValue, (k, v) -> (v == null) ? 1 : (1 + v)); + if (dateFormatter != null) { + Instant parsedTimestamp = DateFormatters.from(dateFormatter.parse(fieldValue)).toInstant(); + if (earliestTimestamp == null || earliestTimestamp.isAfter(parsedTimestamp)) { + earliestTimestamp = parsedTimestamp; + earliestTimeString = fieldValue; + } + if (latestTimestamp == null || latestTimestamp.isBefore(parsedTimestamp)) { + latestTimestamp = parsedTimestamp; + latestTimeString = fieldValue; + } } } } @@ -61,11 +123,17 @@ public void accept(Collection fieldValues) { */ public FieldStats calculate(int numTopHits) { - if (countsByNumericValue != null && countsByNumericValue.isEmpty() == false) { - return new FieldStats(count, countsByNumericValue.size(), countsByNumericValue.firstKey(), countsByNumericValue.lastKey(), - calculateMean(), calculateMedian(), findNumericTopHits(numTopHits)); + if (countsByNumericValue != null) { + if (countsByNumericValue.isEmpty()) { + assert count == 0; + return new FieldStats(count, 0, Collections.emptyList()); + } else { + assert count > 0; + return new FieldStats(count, countsByNumericValue.size(), countsByNumericValue.firstKey(), countsByNumericValue.lastKey(), + calculateMean(), calculateMedian(), findNumericTopHits(numTopHits)); + } } else { - return new FieldStats(count, countsByStringValue.size(), findStringTopHits(numTopHits)); + return new FieldStats(count, countsByStringValue.size(), earliestTimeString, latestTimeString, findStringTopHits(numTopHits)); } } diff --git a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/FileStructureUtils.java b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/FileStructureUtils.java index 90cc74c8d259c..e4945d3709860 100644 --- a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/FileStructureUtils.java +++ b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/FileStructureUtils.java @@ -29,6 +29,8 @@ public final class FileStructureUtils { public static final String MAPPING_TYPE_SETTING = "type"; public static final String MAPPING_FORMAT_SETTING = "format"; public static final String MAPPING_PROPERTIES_SETTING = "properties"; + public static final Map DATE_MAPPING_WITHOUT_FORMAT = + Collections.singletonMap(MAPPING_TYPE_SETTING, "date"); private static final int NUM_TOP_HITS = 10; // NUMBER Grok pattern doesn't support scientific notation, so we extend it @@ -231,7 +233,7 @@ static Tuple, FieldStats> guessMappingAndCalculateFieldStats Collection fieldValuesAsStrings = fieldValues.stream().map(Object::toString).collect(Collectors.toList()); Map mapping = guessScalarMapping(explanation, fieldName, fieldValuesAsStrings, timeoutChecker); timeoutChecker.check("mapping determination"); - return new Tuple<>(mapping, calculateFieldStats(fieldValuesAsStrings, timeoutChecker)); + return new Tuple<>(mapping, calculateFieldStats(mapping, fieldValuesAsStrings, timeoutChecker)); } private static Stream flatten(Object value) { @@ -323,13 +325,14 @@ else if (fieldValues.stream().allMatch(IP_GROK::match)) { /** * Calculate stats for a set of field values. + * @param mapping The mapping for the field. * @param fieldValues Values of the field for which field stats are to be calculated. * @param timeoutChecker Will abort the operation if its timeout is exceeded. * @return The stats calculated from the field values. */ - static FieldStats calculateFieldStats(Collection fieldValues, TimeoutChecker timeoutChecker) { + static FieldStats calculateFieldStats(Map mapping, Collection fieldValues, TimeoutChecker timeoutChecker) { - FieldStatsCalculator calculator = new FieldStatsCalculator(); + FieldStatsCalculator calculator = new FieldStatsCalculator(mapping); calculator.accept(fieldValues); timeoutChecker.check("field stats calculation"); return calculator.calculate(NUM_TOP_HITS); diff --git a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/GrokPatternCreator.java b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/GrokPatternCreator.java index 7a5c9a48f8757..cb2bfeb00f268 100644 --- a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/GrokPatternCreator.java +++ b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/GrokPatternCreator.java @@ -195,14 +195,16 @@ public void validateFullLineGrokPattern(String grokPattern, String timestampFiel /** * Build a Grok pattern that will match all of the sample messages in their entirety. * @param seedPatternName A pattern that has already been determined to match some portion of every sample message. - * @param seedFieldName The field name to be used for the portion of every sample message that the seed pattern matches. + * @param seedMapping The mapping for the seed field. + * @param seedFieldName The field name to be used for the portion of every sample message that the seed pattern matches. * @return The built Grok pattern. */ - public String createGrokPatternFromExamples(String seedPatternName, String seedFieldName) { + public String createGrokPatternFromExamples(String seedPatternName, Map seedMapping, String seedFieldName) { overallGrokPatternBuilder.setLength(0); - GrokPatternCandidate seedCandidate = new NoMappingGrokPatternCandidate(seedPatternName, seedFieldName, grokPatternDefinitions); + GrokPatternCandidate seedCandidate = new PrecalculatedMappingGrokPatternCandidate(seedPatternName, seedMapping, seedFieldName, + grokPatternDefinitions); processCandidateAndSplit(seedCandidate, true, sampleMessages, false, 0, false, 0); @@ -433,7 +435,7 @@ String processCaptures(List explanation, Map fieldNameC static class ValueOnlyGrokPatternCandidate implements GrokPatternCandidate { private final String grokPatternName; - private final String mappingType; + private final Map mapping; private final String fieldName; private final Grok grok; @@ -451,7 +453,8 @@ static class ValueOnlyGrokPatternCandidate implements GrokPatternCandidate { * @param fieldName Name of the field to extract from the match. */ ValueOnlyGrokPatternCandidate(String grokPatternName, String mappingType, String fieldName) { - this(grokPatternName, mappingType, fieldName, "\\b", "\\b", Grok.getBuiltinPatterns()); + this(grokPatternName, Collections.singletonMap(FileStructureUtils.MAPPING_TYPE_SETTING, mappingType), fieldName, + "\\b", "\\b", Grok.getBuiltinPatterns()); } /** @@ -462,7 +465,8 @@ static class ValueOnlyGrokPatternCandidate implements GrokPatternCandidate { */ ValueOnlyGrokPatternCandidate(String grokPatternName, String mappingType, String fieldName, Map grokPatternDefinitions) { - this(grokPatternName, mappingType, fieldName, "\\b", "\\b", grokPatternDefinitions); + this(grokPatternName, Collections.singletonMap(FileStructureUtils.MAPPING_TYPE_SETTING, mappingType), fieldName, + "\\b", "\\b", grokPatternDefinitions); } /** @@ -473,25 +477,28 @@ static class ValueOnlyGrokPatternCandidate implements GrokPatternCandidate { * @param postBreak Only consider the match if it's broken from the following text by this. */ ValueOnlyGrokPatternCandidate(String grokPatternName, String mappingType, String fieldName, String preBreak, String postBreak) { - this(grokPatternName, mappingType, fieldName, preBreak, postBreak, Grok.getBuiltinPatterns()); + this(grokPatternName, Collections.singletonMap(FileStructureUtils.MAPPING_TYPE_SETTING, mappingType), fieldName, + preBreak, postBreak, Grok.getBuiltinPatterns()); } /** * @param grokPatternName Name of the Grok pattern to try to match - must match one defined in Logstash. - * @param mappingType Data type for field in Elasticsearch mappings. + * @param mapping Elasticsearch mapping for the field. * @param fieldName Name of the field to extract from the match. * @param preBreak Only consider the match if it's broken from the previous text by this. * @param postBreak Only consider the match if it's broken from the following text by this. * @param grokPatternDefinitions Definitions of Grok patterns to be used. */ - ValueOnlyGrokPatternCandidate(String grokPatternName, String mappingType, String fieldName, String preBreak, String postBreak, - Map grokPatternDefinitions) { - this.grokPatternName = grokPatternName; - this.mappingType = mappingType; - this.fieldName = fieldName; + ValueOnlyGrokPatternCandidate(String grokPatternName, Map mapping, String fieldName, String preBreak, + String postBreak, Map grokPatternDefinitions) { + this.grokPatternName = Objects.requireNonNull(grokPatternName); + this.mapping = Collections.unmodifiableMap(mapping); + this.fieldName = Objects.requireNonNull(fieldName); // The (?m) here has the Ruby meaning, which is equivalent to (?s) in Java - grok = new Grok(grokPatternDefinitions, "(?m)%{DATA:" + PREFACE + "}" + preBreak + - "%{" + grokPatternName + ":" + VALUE + "}" + postBreak + "%{GREEDYDATA:" + EPILOGUE + "}", TimeoutChecker.watchdog); + grok = new Grok(grokPatternDefinitions, + "(?m)%{DATA:" + PREFACE + "}" + Objects.requireNonNull(preBreak) + + "%{" + grokPatternName + ":" + VALUE + "}" + Objects.requireNonNull(postBreak) + "%{GREEDYDATA:" + EPILOGUE + "}", + TimeoutChecker.watchdog); } @Override @@ -520,23 +527,24 @@ public String processCaptures(List explanation, Map fie epilogues.add(captures.getOrDefault(EPILOGUE, "").toString()); } String adjustedFieldName = buildFieldName(fieldNameCountStore, fieldName); - if (mappings != null) { - Map fullMappingType = Collections.singletonMap(FileStructureUtils.MAPPING_TYPE_SETTING, mappingType); - if ("date".equals(mappingType)) { - try { - fullMappingType = FileStructureUtils.findTimestampMapping(explanation, values, timeoutChecker); - } catch (IllegalArgumentException e) { - // This feels like it shouldn't happen, but there may be some obscure edge case - // where it does, and in production it will cause less frustration to just return - // a mapping type of "date" with no format than to fail the whole analysis - assert e == null : e.getMessage(); - } - timeoutChecker.check("mapping determination"); + Map adjustedMapping = mapping; + // If the mapping is type "date" with no format, try to adjust it to include the format + if (FileStructureUtils.DATE_MAPPING_WITHOUT_FORMAT.equals(adjustedMapping)) { + try { + adjustedMapping = FileStructureUtils.findTimestampMapping(explanation, values, timeoutChecker); + } catch (IllegalArgumentException e) { + // This feels like it shouldn't happen, but there may be some obscure edge case + // where it does, and in production it will cause less frustration to just return + // a mapping type of "date" with no format than to fail the whole analysis + assert e == null : e.getMessage(); } - mappings.put(adjustedFieldName, fullMappingType); + timeoutChecker.check("mapping determination"); + } + if (mappings != null) { + mappings.put(adjustedFieldName, adjustedMapping); } if (fieldStats != null) { - fieldStats.put(adjustedFieldName, FileStructureUtils.calculateFieldStats(values, timeoutChecker)); + fieldStats.put(adjustedFieldName, FileStructureUtils.calculateFieldStats(adjustedMapping, values, timeoutChecker)); } return "%{" + grokPatternName + ":" + adjustedFieldName + "}"; } @@ -598,13 +606,13 @@ public String processCaptures(List explanation, Map fie timeoutChecker.check("full message Grok pattern field extraction"); } String adjustedFieldName = buildFieldName(fieldNameCountStore, fieldName); + Map mapping = FileStructureUtils.guessScalarMapping(explanation, adjustedFieldName, values, timeoutChecker); + timeoutChecker.check("mapping determination"); if (mappings != null) { - mappings.put(adjustedFieldName, - FileStructureUtils.guessScalarMapping(explanation, adjustedFieldName, values, timeoutChecker)); - timeoutChecker.check("mapping determination"); + mappings.put(adjustedFieldName, mapping); } if (fieldStats != null) { - fieldStats.put(adjustedFieldName, FileStructureUtils.calculateFieldStats(values, timeoutChecker)); + fieldStats.put(adjustedFieldName, FileStructureUtils.calculateFieldStats(mapping, values, timeoutChecker)); } return "\\b" + fieldName + "=%{USER:" + adjustedFieldName + "}"; } @@ -613,10 +621,11 @@ public String processCaptures(List explanation, Map fie /** * A Grok pattern candidate that matches a single named Grok pattern but will not update mappings. */ - static class NoMappingGrokPatternCandidate extends ValueOnlyGrokPatternCandidate { + static class PrecalculatedMappingGrokPatternCandidate extends ValueOnlyGrokPatternCandidate { - NoMappingGrokPatternCandidate(String grokPatternName, String fieldName, Map grokPatternDefinitions) { - super(grokPatternName, null, fieldName, grokPatternDefinitions); + PrecalculatedMappingGrokPatternCandidate(String grokPatternName, Map mapping, String fieldName, + Map grokPatternDefinitions) { + super(grokPatternName, mapping, fieldName, "\\b", "\\b", grokPatternDefinitions); } @Override @@ -710,16 +719,16 @@ public Tuple processMatch(List explanation, Collection> valuesForField : valuesPerField.entrySet()) { String fieldName = valuesForField.getKey(); - if (mappings != null) { - // Exclude the time field because that will be dropped and replaced with @timestamp - if (fieldName.equals(timeField) == false) { - mappings.put(fieldName, - FileStructureUtils.guessScalarMapping(explanation, fieldName, valuesForField.getValue(), timeoutChecker)); - timeoutChecker.check("mapping determination"); - } + Map mapping = + FileStructureUtils.guessScalarMapping(explanation, fieldName, valuesForField.getValue(), timeoutChecker); + timeoutChecker.check("mapping determination"); + // Exclude the time field because that will be dropped and replaced with @timestamp + if (mappings != null && fieldName.equals(timeField) == false) { + mappings.put(fieldName, mapping); } if (fieldStats != null) { - fieldStats.put(fieldName, FileStructureUtils.calculateFieldStats(valuesForField.getValue(), timeoutChecker)); + fieldStats.put(fieldName, + FileStructureUtils.calculateFieldStats(mapping, valuesForField.getValue(), timeoutChecker)); } } } diff --git a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/NdJsonFileStructureFinder.java b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/NdJsonFileStructureFinder.java index 116de8f7679d2..1b405eb685fa2 100644 --- a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/NdJsonFileStructureFinder.java +++ b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/NdJsonFileStructureFinder.java @@ -70,8 +70,7 @@ static NdJsonFileStructureFinder makeNdJsonFileStructureFinder(List expl SortedMap mappings = mappingsAndFieldStats.v1(); if (timeField != null) { - mappings.put(FileStructureUtils.DEFAULT_TIMESTAMP_FIELD, - Collections.singletonMap(FileStructureUtils.MAPPING_TYPE_SETTING, "date")); + mappings.put(FileStructureUtils.DEFAULT_TIMESTAMP_FIELD, FileStructureUtils.DATE_MAPPING_WITHOUT_FORMAT); } if (mappingsAndFieldStats.v2() != null) { diff --git a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/TextLogFileStructureFinder.java b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/TextLogFileStructureFinder.java index 86b1d79b8b66b..e47d045dd257e 100644 --- a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/TextLogFileStructureFinder.java +++ b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/TextLogFileStructureFinder.java @@ -108,12 +108,13 @@ static TextLogFileStructureFinder makeTextLogFileStructureFinder(List ex .setNumMessagesAnalyzed(sampleMessages.size()) .setMultilineStartPattern(multiLineRegex); + Map messageMapping = Collections.singletonMap(FileStructureUtils.MAPPING_TYPE_SETTING, "text"); SortedMap mappings = new TreeMap<>(); - mappings.put("message", Collections.singletonMap(FileStructureUtils.MAPPING_TYPE_SETTING, "text")); - mappings.put(FileStructureUtils.DEFAULT_TIMESTAMP_FIELD, Collections.singletonMap(FileStructureUtils.MAPPING_TYPE_SETTING, "date")); + mappings.put("message", messageMapping); + mappings.put(FileStructureUtils.DEFAULT_TIMESTAMP_FIELD, FileStructureUtils.DATE_MAPPING_WITHOUT_FORMAT); SortedMap fieldStats = new TreeMap<>(); - fieldStats.put("message", FileStructureUtils.calculateFieldStats(sampleMessages, timeoutChecker)); + fieldStats.put("message", FileStructureUtils.calculateFieldStats(messageMapping, sampleMessages, timeoutChecker)); Map customGrokPatternDefinitions = timestampFormatFinder.getCustomGrokPatternDefinitions(); GrokPatternCreator grokPatternCreator = new GrokPatternCreator(explanation, sampleMessages, mappings, fieldStats, @@ -136,8 +137,8 @@ static TextLogFileStructureFinder makeTextLogFileStructureFinder(List ex if (interimTimestampField == null) { interimTimestampField = "timestamp"; } - grokPattern = - grokPatternCreator.createGrokPatternFromExamples(timestampFormatFinder.getGrokPatternName(), interimTimestampField); + grokPattern = grokPatternCreator.createGrokPatternFromExamples(timestampFormatFinder.getGrokPatternName(), + timestampFormatFinder.getEsDateMappingTypeWithFormat(), interimTimestampField); } } diff --git a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/XmlFileStructureFinder.java b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/XmlFileStructureFinder.java index d2572b7fd2085..91fc61bcbd4b4 100644 --- a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/XmlFileStructureFinder.java +++ b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/XmlFileStructureFinder.java @@ -120,8 +120,7 @@ static XmlFileStructureFinder makeXmlFileStructureFinder(List explanatio SortedMap outerMappings = new TreeMap<>(); outerMappings.put(topLevelTag, secondLevelProperties); if (timeField != null) { - outerMappings.put(FileStructureUtils.DEFAULT_TIMESTAMP_FIELD, - Collections.singletonMap(FileStructureUtils.MAPPING_TYPE_SETTING, "date")); + outerMappings.put(FileStructureUtils.DEFAULT_TIMESTAMP_FIELD, FileStructureUtils.DATE_MAPPING_WITHOUT_FORMAT); } FileStructure structure = structureBuilder diff --git a/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/filestructurefinder/FieldStatsCalculatorTests.java b/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/filestructurefinder/FieldStatsCalculatorTests.java index 30445a4a77c10..4efaf64bd092c 100644 --- a/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/filestructurefinder/FieldStatsCalculatorTests.java +++ b/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/filestructurefinder/FieldStatsCalculatorTests.java @@ -10,14 +10,19 @@ import java.util.Arrays; import java.util.Collections; import java.util.DoubleSummaryStatistics; +import java.util.HashMap; import java.util.List; import java.util.Map; public class FieldStatsCalculatorTests extends FileStructureTestCase { + private static final Map LONG = Collections.singletonMap(FileStructureUtils.MAPPING_TYPE_SETTING, "long"); + private static final Map DOUBLE = Collections.singletonMap(FileStructureUtils.MAPPING_TYPE_SETTING, "double"); + private static final Map KEYWORD = Collections.singletonMap(FileStructureUtils.MAPPING_TYPE_SETTING, "keyword"); + public void testMean() { - FieldStatsCalculator calculator = new FieldStatsCalculator(); + FieldStatsCalculator calculator = new FieldStatsCalculator(DOUBLE); calculator.accept(Arrays.asList("1", "3.5", "2.5", "9")); @@ -26,7 +31,7 @@ public void testMean() { public void testMedianGivenOddCount() { - FieldStatsCalculator calculator = new FieldStatsCalculator(); + FieldStatsCalculator calculator = new FieldStatsCalculator(LONG); calculator.accept(Arrays.asList("3", "23", "-1", "5", "1000")); @@ -35,7 +40,7 @@ public void testMedianGivenOddCount() { public void testMedianGivenOddCountMinimal() { - FieldStatsCalculator calculator = new FieldStatsCalculator(); + FieldStatsCalculator calculator = new FieldStatsCalculator(LONG); calculator.accept(Collections.singletonList("3")); @@ -44,7 +49,7 @@ public void testMedianGivenOddCountMinimal() { public void testMedianGivenEvenCountMiddleValuesDifferent() { - FieldStatsCalculator calculator = new FieldStatsCalculator(); + FieldStatsCalculator calculator = new FieldStatsCalculator(LONG); calculator.accept(Arrays.asList("3", "23", "-1", "5", "1000", "6")); @@ -53,7 +58,7 @@ public void testMedianGivenEvenCountMiddleValuesDifferent() { public void testMedianGivenEvenCountMiddleValuesSame() { - FieldStatsCalculator calculator = new FieldStatsCalculator(); + FieldStatsCalculator calculator = new FieldStatsCalculator(LONG); calculator.accept(Arrays.asList("3", "23", "-1", "5", "1000", "5")); @@ -62,7 +67,7 @@ public void testMedianGivenEvenCountMiddleValuesSame() { public void testMedianGivenEvenCountMinimal() { - FieldStatsCalculator calculator = new FieldStatsCalculator(); + FieldStatsCalculator calculator = new FieldStatsCalculator(LONG); calculator.accept(Arrays.asList("4", "4")); @@ -71,7 +76,7 @@ public void testMedianGivenEvenCountMinimal() { public void testTopHitsNumeric() { - FieldStatsCalculator calculator = new FieldStatsCalculator(); + FieldStatsCalculator calculator = new FieldStatsCalculator(DOUBLE); calculator.accept(Arrays.asList("4", "4", "7", "4", "6", "5.2", "6", "5.2", "16", "4", "5.2")); @@ -88,7 +93,7 @@ public void testTopHitsNumeric() { public void testTopHitsString() { - FieldStatsCalculator calculator = new FieldStatsCalculator(); + FieldStatsCalculator calculator = new FieldStatsCalculator(KEYWORD); calculator.accept(Arrays.asList("s", "s", "d", "s", "f", "x", "f", "x", "n", "s", "x")); @@ -105,7 +110,8 @@ public void testTopHitsString() { public void testCalculateGivenEmpty() { - FieldStatsCalculator calculator = new FieldStatsCalculator(); + FieldStatsCalculator calculator = + new FieldStatsCalculator(randomFrom(Arrays.asList(LONG, DOUBLE, KEYWORD, FileStructureUtils.DATE_MAPPING_WITHOUT_FORMAT))); calculator.accept(Collections.emptyList()); @@ -117,12 +123,15 @@ public void testCalculateGivenEmpty() { assertNull(stats.getMaxValue()); assertNull(stats.getMeanValue()); assertNull(stats.getMedianValue()); + assertNull(stats.getEarliestTimestamp()); + assertNull(stats.getLatestTimestamp()); + assertEquals(0, stats.getTopHits().size()); } public void testCalculateGivenNumericField() { - FieldStatsCalculator calculator = new FieldStatsCalculator(); + FieldStatsCalculator calculator = new FieldStatsCalculator(DOUBLE); calculator.accept(Arrays.asList("4.5", "4.5", "7", "4.5", "6", "5", "6", "5", "25", "4.5", "5")); @@ -134,6 +143,8 @@ public void testCalculateGivenNumericField() { assertEquals(25.0, stats.getMaxValue(), 1e-10); assertEquals(7.0, stats.getMeanValue(), 1e-10); assertEquals(5.0, stats.getMedianValue(), 1e-10); + assertNull(stats.getEarliestTimestamp()); + assertNull(stats.getLatestTimestamp()); List> topHits = stats.getTopHits(); @@ -148,7 +159,7 @@ public void testCalculateGivenNumericField() { public void testCalculateGivenStringField() { - FieldStatsCalculator calculator = new FieldStatsCalculator(); + FieldStatsCalculator calculator = new FieldStatsCalculator(KEYWORD); calculator.accept(Arrays.asList("s", "s", "d", "s", "f", "x", "f", "x", "n", "s", "x")); @@ -160,6 +171,8 @@ public void testCalculateGivenStringField() { assertNull(stats.getMaxValue()); assertNull(stats.getMeanValue()); assertNull(stats.getMedianValue()); + assertNull(stats.getEarliestTimestamp()); + assertNull(stats.getLatestTimestamp()); List> topHits = stats.getTopHits(); @@ -174,7 +187,7 @@ public void testCalculateGivenStringField() { public void testCalculateGivenMixedField() { - FieldStatsCalculator calculator = new FieldStatsCalculator(); + FieldStatsCalculator calculator = new FieldStatsCalculator(KEYWORD); calculator.accept(Arrays.asList("4", "4", "d", "4", "f", "x", "f", "x", "16", "4", "x")); @@ -186,6 +199,8 @@ public void testCalculateGivenMixedField() { assertNull(stats.getMaxValue()); assertNull(stats.getMeanValue()); assertNull(stats.getMedianValue()); + assertNull(stats.getEarliestTimestamp()); + assertNull(stats.getLatestTimestamp()); List> topHits = stats.getTopHits(); @@ -198,10 +213,71 @@ public void testCalculateGivenMixedField() { assertEquals(2, topHits.get(2).get("count")); } + public void testGivenDateFieldWithoutFormat() { + + FieldStatsCalculator calculator = new FieldStatsCalculator(FileStructureUtils.DATE_MAPPING_WITHOUT_FORMAT); + + calculator.accept(Arrays.asList("2018-10-08T10:49:16.642", "2018-10-08T10:49:16.642", "2018-10-08T10:49:16.642", + "2018-09-08T11:12:13.789", "2019-01-28T01:02:03.456", "2018-09-08T11:12:13.789")); + + FieldStats stats = calculator.calculate(3); + + assertEquals(6L, stats.getCount()); + assertEquals(3, stats.getCardinality()); + assertNull(stats.getMinValue()); + assertNull(stats.getMaxValue()); + assertNull(stats.getMeanValue()); + assertNull(stats.getMedianValue()); + assertEquals("2018-09-08T11:12:13.789", stats.getEarliestTimestamp()); + assertEquals("2019-01-28T01:02:03.456", stats.getLatestTimestamp()); + + List> topHits = stats.getTopHits(); + + assertEquals(3, topHits.size()); + assertEquals("2018-10-08T10:49:16.642", topHits.get(0).get("value")); + assertEquals(3, topHits.get(0).get("count")); + assertEquals("2018-09-08T11:12:13.789", topHits.get(1).get("value")); + assertEquals(2, topHits.get(1).get("count")); + assertEquals("2019-01-28T01:02:03.456", topHits.get(2).get("value")); + assertEquals(1, topHits.get(2).get("count")); + } + + public void testGivenDateFieldWithFormat() { + + Map dateMapping = new HashMap<>(); + dateMapping.put(FileStructureUtils.MAPPING_TYPE_SETTING, "date"); + dateMapping.put(FileStructureUtils.MAPPING_FORMAT_SETTING, "M/dd/yyyy h:mma"); + FieldStatsCalculator calculator = new FieldStatsCalculator(dateMapping); + + calculator.accept(Arrays.asList("10/08/2018 10:49AM", "10/08/2018 10:49AM", "10/08/2018 10:49AM", + "9/08/2018 11:12AM", "1/28/2019 1:02AM", "9/08/2018 11:12AM")); + + FieldStats stats = calculator.calculate(3); + + assertEquals(6L, stats.getCount()); + assertEquals(3, stats.getCardinality()); + assertNull(stats.getMinValue()); + assertNull(stats.getMaxValue()); + assertNull(stats.getMeanValue()); + assertNull(stats.getMedianValue()); + assertEquals("9/08/2018 11:12AM", stats.getEarliestTimestamp()); + assertEquals("1/28/2019 1:02AM", stats.getLatestTimestamp()); + + List> topHits = stats.getTopHits(); + + assertEquals(3, topHits.size()); + assertEquals("10/08/2018 10:49AM", topHits.get(0).get("value")); + assertEquals(3, topHits.get(0).get("count")); + assertEquals("9/08/2018 11:12AM", topHits.get(1).get("value")); + assertEquals(2, topHits.get(1).get("count")); + assertEquals("1/28/2019 1:02AM", topHits.get(2).get("value")); + assertEquals(1, topHits.get(2).get("count")); + } + public void testJavaStatsEquivalence() { DoubleSummaryStatistics summaryStatistics = new DoubleSummaryStatistics(); - FieldStatsCalculator calculator = new FieldStatsCalculator(); + FieldStatsCalculator calculator = new FieldStatsCalculator(DOUBLE); for (int numValues = randomIntBetween(1000, 10000); numValues > 0; --numValues) { diff --git a/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/filestructurefinder/FileStructureUtilsTests.java b/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/filestructurefinder/FileStructureUtilsTests.java index c0adccd0eb477..91568573c9d71 100644 --- a/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/filestructurefinder/FileStructureUtilsTests.java +++ b/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/filestructurefinder/FileStructureUtilsTests.java @@ -323,8 +323,7 @@ public void testGuessMappingsAndCalculateFieldStats() { sample2.put("nothing", null); Tuple, SortedMap> mappingsAndFieldStats = - FileStructureUtils.guessMappingsAndCalculateFieldStats(explanation, Arrays.asList(sample1, sample2), - NOOP_TIMEOUT_CHECKER); + FileStructureUtils.guessMappingsAndCalculateFieldStats(explanation, Arrays.asList(sample1, sample2), NOOP_TIMEOUT_CHECKER); assertNotNull(mappingsAndFieldStats); Map mappings = mappingsAndFieldStats.v1(); @@ -341,7 +340,8 @@ public void testGuessMappingsAndCalculateFieldStats() { assertNotNull(fieldStats); assertEquals(3, fieldStats.size()); assertEquals(new FieldStats(2, 2, makeTopHits("not a time", 1, "whatever", 1)), fieldStats.get("foo")); - assertEquals(new FieldStats(2, 2, makeTopHits("2018-05-24 17:28:31,735", 1, "2018-05-29 11:53:02,837", 1)), fieldStats.get("time")); + assertEquals(new FieldStats(2, 2, "2018-05-24 17:28:31,735", "2018-05-29 11:53:02,837", + makeTopHits("2018-05-24 17:28:31,735", 1, "2018-05-29 11:53:02,837", 1)), fieldStats.get("time")); assertEquals(new FieldStats(2, 2, 17.0, 42.0, 29.5, 29.5, makeTopHits(17, 1, 42, 1)), fieldStats.get("bar")); assertNull(fieldStats.get("nothing")); } diff --git a/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/filestructurefinder/GrokPatternCreatorTests.java b/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/filestructurefinder/GrokPatternCreatorTests.java index 7e6363602dcdd..967c6d42921e4 100644 --- a/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/filestructurefinder/GrokPatternCreatorTests.java +++ b/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/filestructurefinder/GrokPatternCreatorTests.java @@ -203,7 +203,8 @@ public void testCreateGrokPatternFromExamplesGivenNamedLogs() { assertEquals("%{SYSLOGTIMESTAMP:timestamp} .*? .*?\\[%{INT:field}\\]: %{LOGLEVEL:loglevel} \\(.*? .*? .*?\\) .*? " + "%{QUOTEDSTRING:field2}: %{IP:ipaddress}#%{INT:field3}", - grokPatternCreator.createGrokPatternFromExamples("SYSLOGTIMESTAMP", "timestamp")); + grokPatternCreator.createGrokPatternFromExamples("SYSLOGTIMESTAMP", FileStructureUtils.DATE_MAPPING_WITHOUT_FORMAT, + "timestamp")); assertEquals(5, mappings.size()); assertEquals(Collections.singletonMap(FileStructureUtils.MAPPING_TYPE_SETTING, "long"), mappings.get("field")); assertEquals(Collections.singletonMap(FileStructureUtils.MAPPING_TYPE_SETTING, "keyword"), mappings.get("loglevel")); @@ -229,7 +230,8 @@ public void testCreateGrokPatternFromExamplesGivenCatalinaLogs() { NOOP_TIMEOUT_CHECKER); assertEquals("%{CATALINA_DATESTAMP:timestamp} .*? .*?\\n%{LOGLEVEL:loglevel}: .*", - grokPatternCreator.createGrokPatternFromExamples("CATALINA_DATESTAMP", "timestamp")); + grokPatternCreator.createGrokPatternFromExamples("CATALINA_DATESTAMP", FileStructureUtils.DATE_MAPPING_WITHOUT_FORMAT, + "timestamp")); assertEquals(1, mappings.size()); assertEquals(Collections.singletonMap(FileStructureUtils.MAPPING_TYPE_SETTING, "keyword"), mappings.get("loglevel")); } @@ -253,7 +255,8 @@ public void testCreateGrokPatternFromExamplesGivenMultiTimestampLogs() { assertEquals("%{INT:field}\\t%{TIMESTAMP_ISO8601:timestamp}\\t%{TIMESTAMP_ISO8601:extra_timestamp}\\t%{INT:field2}\\t.*?\\t" + "%{IP:ipaddress}\\t.*?\\t%{LOGLEVEL:loglevel}\\t.*", - grokPatternCreator.createGrokPatternFromExamples("TIMESTAMP_ISO8601", "timestamp")); + grokPatternCreator.createGrokPatternFromExamples("TIMESTAMP_ISO8601", FileStructureUtils.DATE_MAPPING_WITHOUT_FORMAT, + "timestamp")); assertEquals(5, mappings.size()); assertEquals(Collections.singletonMap(FileStructureUtils.MAPPING_TYPE_SETTING, "long"), mappings.get("field")); Map expectedDateMapping = new HashMap<>(); @@ -284,7 +287,8 @@ public void testCreateGrokPatternFromExamplesGivenMultiTimestampLogsAndIndetermi assertEquals("%{INT:field}\\t%{TIMESTAMP_ISO8601:timestamp}\\t%{DATESTAMP:extra_timestamp}\\t%{INT:field2}\\t.*?\\t" + "%{IP:ipaddress}\\t.*?\\t%{LOGLEVEL:loglevel}\\t.*", - grokPatternCreator.createGrokPatternFromExamples("TIMESTAMP_ISO8601", "timestamp")); + grokPatternCreator.createGrokPatternFromExamples("TIMESTAMP_ISO8601", FileStructureUtils.DATE_MAPPING_WITHOUT_FORMAT, + "timestamp")); assertEquals(5, mappings.size()); assertEquals(Collections.singletonMap(FileStructureUtils.MAPPING_TYPE_SETTING, "long"), mappings.get("field")); Map expectedDateMapping = new HashMap<>(); @@ -314,9 +318,12 @@ public void testCreateGrokPatternFromExamplesGivenMultiTimestampLogsAndCustomDef Collections.singletonMap("CUSTOM_TIMESTAMP", "%{MONTHNUM}/%{MONTHDAY}/%{YEAR} %{HOUR}:%{MINUTE}(?:AM|PM)"), NOOP_TIMEOUT_CHECKER); + Map customMapping = new HashMap<>(); + customMapping.put(FileStructureUtils.MAPPING_TYPE_SETTING, "date"); + customMapping.put(FileStructureUtils.MAPPING_FORMAT_SETTING, "M/dd/yyyy h:mma"); assertEquals("%{INT:field}\\t%{CUSTOM_TIMESTAMP:timestamp}\\t%{TIMESTAMP_ISO8601:extra_timestamp}\\t%{INT:field2}\\t.*?\\t" + "%{IP:ipaddress}\\t.*?\\t%{LOGLEVEL:loglevel}\\t.*", - grokPatternCreator.createGrokPatternFromExamples("CUSTOM_TIMESTAMP", "timestamp")); + grokPatternCreator.createGrokPatternFromExamples("CUSTOM_TIMESTAMP", customMapping, "timestamp")); assertEquals(5, mappings.size()); assertEquals(Collections.singletonMap(FileStructureUtils.MAPPING_TYPE_SETTING, "long"), mappings.get("field")); Map expectedDateMapping = new HashMap<>(); @@ -347,7 +354,8 @@ public void testCreateGrokPatternFromExamplesGivenTimestampAndTimeWithoutDate() assertEquals("%{INT:field}\\t%{TIMESTAMP_ISO8601:timestamp}\\t%{TIME:time}\\t%{INT:field2}\\t.*?\\t" + "%{IP:ipaddress}\\t.*?\\t%{LOGLEVEL:loglevel}\\t.*", - grokPatternCreator.createGrokPatternFromExamples("TIMESTAMP_ISO8601", "timestamp")); + grokPatternCreator.createGrokPatternFromExamples("TIMESTAMP_ISO8601", FileStructureUtils.DATE_MAPPING_WITHOUT_FORMAT, + "timestamp")); assertEquals(5, mappings.size()); assertEquals(Collections.singletonMap(FileStructureUtils.MAPPING_TYPE_SETTING, "long"), mappings.get("field")); assertEquals(Collections.singletonMap(FileStructureUtils.MAPPING_TYPE_SETTING, "keyword"), mappings.get("time")); diff --git a/x-pack/plugin/src/test/resources/rest-api-spec/test/ml/find_file_structure.yml b/x-pack/plugin/src/test/resources/rest-api-spec/test/ml/find_file_structure.yml index a9634605aaac2..cf175961b1ef9 100644 --- a/x-pack/plugin/src/test/resources/rest-api-spec/test/ml/find_file_structure.yml +++ b/x-pack/plugin/src/test/resources/rest-api-spec/test/ml/find_file_structure.yml @@ -56,7 +56,8 @@ setup: - match: { field_stats.sourcetype.cardinality: 1 } - match: { field_stats.time.count: 3 } - match: { field_stats.time.cardinality: 3 } - - match: { field_stats.time.cardinality: 3 } + - match: { field_stats.time.earliest: "1403481600" } + - match: { field_stats.time.latest: "1403481800" } - is_false: explanation --- @@ -116,5 +117,6 @@ setup: - match: { field_stats.sourcetype.cardinality: 1 } - match: { field_stats.time.count: 3 } - match: { field_stats.time.cardinality: 3 } - - match: { field_stats.time.cardinality: 3 } + - match: { field_stats.time.earliest: "1403481600" } + - match: { field_stats.time.latest: "1403481800" } - match: { explanation.0: "Using specified character encoding [UTF-8]" }