[ML] Add support for date_nanos fields in find_file_structure

Now that elastic#61324 is merged, the find_file_structure endpoint can suggest using date_nanos fields for timestamps where the timestamp format provides greater than millisecond precision.
droberts195 · Sep 7, 2020 · 80d1da5 · 80d1da5
1 parent b08f121
commit 80d1da5
Show file tree

Hide file tree

Showing 9 changed files with 145 additions and 22 deletions.
diff --git a/...ain/java/org/elasticsearch/xpack/ml/filestructurefinder/DelimitedFileStructureFinder.java b/...ain/java/org/elasticsearch/xpack/ml/filestructurefinder/DelimitedFileStructureFinder.java
@@ -149,14 +149,15 @@ static DelimitedFileStructureFinder makeDelimitedFileStructureFinder(List<String
                 .setJavaTimestampFormats(timeField.v2().getJavaTimestampFormats())
                 .setNeedClientTimezone(needClientTimeZone)
                 .setIngestPipeline(FileStructureUtils.makeIngestPipelineDefinition(null, Collections.emptyMap(), csvProcessorSettings,
-                    mappings, timeField.v1(), timeField.v2().getJavaTimestampFormats(), needClientTimeZone))
+                    mappings, timeField.v1(), timeField.v2().getJavaTimestampFormats(), needClientTimeZone,
+                    timeField.v2().needNanosecondPrecision()))
                 .setMultilineStartPattern(makeMultilineStartPattern(explanation, columnNamesList, maxLinesPerMessage, delimiterPattern,
                     quotePattern, mappings, timeField.v1(), timeField.v2()));
 
-            mappings.put(FileStructureUtils.DEFAULT_TIMESTAMP_FIELD, FileStructureUtils.DATE_MAPPING_WITHOUT_FORMAT);
+            mappings.put(FileStructureUtils.DEFAULT_TIMESTAMP_FIELD, timeField.v2().getEsDateMappingTypeWithoutFormat());
         } else {
             structureBuilder.setIngestPipeline(FileStructureUtils.makeIngestPipelineDefinition(null, Collections.emptyMap(),
-                csvProcessorSettings, mappings, null, null, false));
+                csvProcessorSettings, mappings, null, null, false, false));
             structureBuilder.setMultilineStartPattern(makeMultilineStartPattern(explanation, columnNamesList, maxLinesPerMessage,
                 delimiterPattern, quotePattern, mappings, null, null));
         }

diff --git a/...n/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/FileStructureUtils.java b/...n/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/FileStructureUtils.java
@@ -36,6 +36,7 @@ public final class FileStructureUtils {
     public static final String MAPPING_PROPERTIES_SETTING = "properties";
     public static final Map<String, String> DATE_MAPPING_WITHOUT_FORMAT =
         Collections.singletonMap(MAPPING_TYPE_SETTING, "date");
+    public static final String NANOSECOND_DATE_OUTPUT_FORMAT = "yyyy-MM-dd'T'HH:mm:ss.SSSSSSSSSXXX";
     public static final Set<String> CONVERTIBLE_TYPES =
         Collections.unmodifiableSet(Sets.newHashSet("integer", "long", "float", "double", "boolean"));
 
@@ -397,13 +398,14 @@ static boolean isMoreLikelyTextThanKeyword(String str) {
      * @param timestampFormats Timestamp formats to be used for parsing {@code timestampField}.
      *                         May be <code>null</code> if {@code timestampField} is also <code>null</code>.
      * @param needClientTimezone Is the timezone of the client supplying data to ingest required to uniquely parse the timestamp?
+     * @param needNanosecondPrecision Does the timestamp have more than millisecond accuracy?
      * @return The ingest pipeline definition, or <code>null</code> if none is required.
      */
     public static Map<String, Object> makeIngestPipelineDefinition(String grokPattern, Map<String, String> customGrokPatternDefinitions,
                                                                    Map<String, Object> csvProcessorSettings,
                                                                    Map<String, Object> mappingsForConversions,
                                                                    String timestampField, List<String> timestampFormats,
-                                                                   boolean needClientTimezone) {
+                                                                   boolean needClientTimezone, boolean needNanosecondPrecision) {
 
         if (grokPattern == null && csvProcessorSettings == null && timestampField == null) {
             return null;
@@ -437,6 +439,9 @@ public static Map<String, Object> makeIngestPipelineDefinition(String grokPatter
                 dateProcessorSettings.put("timezone", "{{ " + BEAT_TIMEZONE_FIELD + " }}");
             }
             dateProcessorSettings.put("formats", timestampFormats);
+            if (needNanosecondPrecision) {
+                dateProcessorSettings.put("output_format", NANOSECOND_DATE_OUTPUT_FORMAT);
+            }
             processors.add(Collections.singletonMap("date", dateProcessorSettings));
         }
 

diff --git a/...c/main/java/org/elasticsearch/xpack/ml/filestructurefinder/NdJsonFileStructureFinder.java b/...c/main/java/org/elasticsearch/xpack/ml/filestructurefinder/NdJsonFileStructureFinder.java
@@ -64,15 +64,16 @@ static NdJsonFileStructureFinder makeNdJsonFileStructureFinder(List<String> expl
                 .setIngestPipeline(FileStructureUtils.makeIngestPipelineDefinition(null, Collections.emptyMap(), null,
                     // Note: no convert processors are added based on mappings for NDJSON input
                     // because it's reasonable that _source matches the supplied JSON precisely
-                    Collections.emptyMap(), timeField.v1(), timeField.v2().getJavaTimestampFormats(), needClientTimeZone));
+                    Collections.emptyMap(), timeField.v1(), timeField.v2().getJavaTimestampFormats(), needClientTimeZone,
+                    timeField.v2().needNanosecondPrecision()));
         }
 
         Tuple<SortedMap<String, Object>, SortedMap<String, FieldStats>> mappingsAndFieldStats =
             FileStructureUtils.guessMappingsAndCalculateFieldStats(explanation, sampleRecords, timeoutChecker);
 
-        SortedMap<String, Object> mappings = mappingsAndFieldStats.v1();
+        Map<String, Object> mappings = mappingsAndFieldStats.v1();
         if (timeField != null) {
-            mappings.put(FileStructureUtils.DEFAULT_TIMESTAMP_FIELD, FileStructureUtils.DATE_MAPPING_WITHOUT_FORMAT);
+            mappings.put(FileStructureUtils.DEFAULT_TIMESTAMP_FIELD, timeField.v2().getEsDateMappingTypeWithoutFormat());
         }
 
         if (mappingsAndFieldStats.v2() != null) {

diff --git a/.../main/java/org/elasticsearch/xpack/ml/filestructurefinder/TextLogFileStructureFinder.java b/.../main/java/org/elasticsearch/xpack/ml/filestructurefinder/TextLogFileStructureFinder.java
@@ -111,7 +111,7 @@ static TextLogFileStructureFinder makeTextLogFileStructureFinder(List<String> ex
         Map<String, String> messageMapping = Collections.singletonMap(FileStructureUtils.MAPPING_TYPE_SETTING, "text");
         SortedMap<String, Object> mappings = new TreeMap<>();
         mappings.put("message", messageMapping);
-        mappings.put(FileStructureUtils.DEFAULT_TIMESTAMP_FIELD, FileStructureUtils.DATE_MAPPING_WITHOUT_FORMAT);
+        mappings.put(FileStructureUtils.DEFAULT_TIMESTAMP_FIELD, timestampFormatFinder.getEsDateMappingTypeWithoutFormat());
 
         SortedMap<String, FieldStats> fieldStats = new TreeMap<>();
         fieldStats.put("message", FileStructureUtils.calculateFieldStats(messageMapping, sampleMessages, timeoutChecker));
@@ -151,7 +151,8 @@ static TextLogFileStructureFinder makeTextLogFileStructureFinder(List<String> ex
             .setNeedClientTimezone(needClientTimeZone)
             .setGrokPattern(grokPattern)
             .setIngestPipeline(FileStructureUtils.makeIngestPipelineDefinition(grokPattern, customGrokPatternDefinitions, null, mappings,
-                interimTimestampField, timestampFormatFinder.getJavaTimestampFormats(), needClientTimeZone))
+                interimTimestampField, timestampFormatFinder.getJavaTimestampFormats(), needClientTimeZone,
+                timestampFormatFinder.needNanosecondPrecision()))
             .setMappings(mappings)
             .setFieldStats(fieldStats)
             .setExplanation(explanation)

diff --git a/...l/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/TimestampFormatFinder.java b/...l/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/TimestampFormatFinder.java
@@ -53,6 +53,8 @@ public final class TimestampFormatFinder {
     private static final Logger logger = LogManager.getLogger(TimestampFormatFinder.class);
     private static final String PUNCTUATION_THAT_NEEDS_ESCAPING_IN_REGEX = "\\|()[]{}^$.*?";
     private static final String FRACTIONAL_SECOND_SEPARATORS = ":.,";
+    private static final Pattern FRACTIONAL_SECOND_INTERPRETER =
+        Pattern.compile("([" + FRACTIONAL_SECOND_SEPARATORS + "])(\\d{3,9})($|[Z+-])");
     private static final char INDETERMINATE_FIELD_PLACEHOLDER = '?';
     // The ? characters in this must match INDETERMINATE_FIELD_PLACEHOLDER
     // above, but they're literals in this regex to aid readability
@@ -702,6 +704,20 @@ public List<String> getJavaTimestampFormats() {
             (matchedFormats.size() > 1) ? matchedFormats.get(0) : null);
     }
 
+    /**
+     * Reports whether any relevant timestamp match was flagged as having nanosecond precision.
+     * This is needed to decide between "date" and "date_nanos" as the index mapping type.
+     * <p>
+     * When more than one format has been matched, only matches whose format can be merged with
+     * the first matched format are considered. Returns <code>false</code> when no formats have
+     * been matched at all.
+     * @return Do the observed timestamps require nanosecond precision to store accurately?
+     */
+    public boolean needNanosecondPrecision() {
+        if (matchedFormats.isEmpty()) {
+            // If errorOnNoTimestamp is set and we get here it means no samples have been added, which is likely a programmer mistake
+            assert errorOnNoTimestamp == false;
+            return false;
+        }
+        return matches.stream().filter(match -> matchedFormats.size() < 2 || matchedFormats.get(0).canMergeWith(match.timestampFormat))
+            .anyMatch(match -> match.hasNanosecondPrecision);
+    }
+
     /**
      * Given a list of timestamp formats that might contain indeterminate day/month parts,
      * return the corresponding pattern with the placeholders replaced with concrete
@@ -947,6 +963,14 @@ public boolean hasTimezoneDependentParsing() {
             .anyMatch(match -> match.hasTimezoneDependentParsing);
     }
 
+    /**
+     * Builds the mapping settings for a date field whose mapping does not include a format.
+     * The @timestamp field will always have been parsed into epoch format,
+     * so we just need to know if it has nanosecond resolution or not.
+     * @return A single-entry map from the mapping type setting to "date_nanos" if
+     *         {@link #needNanosecondPrecision()} returns <code>true</code>, otherwise to "date".
+     */
+    public Map<String, String> getEsDateMappingTypeWithoutFormat() {
+        return Collections.singletonMap(FileStructureUtils.MAPPING_TYPE_SETTING, needNanosecondPrecision() ? "date_nanos" : "date");
+    }
+
     /**
      * Sometimes Elasticsearch mappings for dates need to include the format.
      * This method returns appropriate mappings settings: at minimum "type" : "date",
@@ -959,7 +983,7 @@ public Map<String, String> getEsDateMappingTypeWithFormat() {
             return Collections.singletonMap(FileStructureUtils.MAPPING_TYPE_SETTING, "keyword");
         }
         Map<String, String> mapping = new LinkedHashMap<>();
-        mapping.put(FileStructureUtils.MAPPING_TYPE_SETTING, "date");
+        mapping.put(FileStructureUtils.MAPPING_TYPE_SETTING, needNanosecondPrecision() ? "date_nanos" : "date");
         String formats = javaTimestampFormats.stream().map(format -> {
             switch (format) {
                 case "ISO8601":
@@ -1233,6 +1257,7 @@ static final class TimestampMatch {
         final int secondIndeterminateDateNumber;
 
         final boolean hasTimezoneDependentParsing;
+        final boolean hasNanosecondPrecision;
 
         /**
          * Text that came after the timestamp in the matched field/message.
@@ -1250,6 +1275,8 @@ static final class TimestampMatch {
             this.secondIndeterminateDateNumber = indeterminateDateNumbers[1];
             this.hasTimezoneDependentParsing = requiresTimezoneDependentParsing(timestampFormat.rawJavaTimestampFormats.get(0),
                 matchedDate);
+            this.hasNanosecondPrecision = matchHasNanosecondPrecision(timestampFormat.rawJavaTimestampFormats.get(0),
+                matchedDate);
             this.epilogue = Objects.requireNonNull(epilogue);
         }
 
@@ -1259,6 +1286,7 @@ static final class TimestampMatch {
             this.firstIndeterminateDateNumber = toCopyExceptFormat.firstIndeterminateDateNumber;
             this.secondIndeterminateDateNumber = toCopyExceptFormat.secondIndeterminateDateNumber;
             this.hasTimezoneDependentParsing = toCopyExceptFormat.hasTimezoneDependentParsing;
+            this.hasNanosecondPrecision = toCopyExceptFormat.hasNanosecondPrecision;
             this.epilogue = toCopyExceptFormat.epilogue;
         }
 
@@ -1285,6 +1313,38 @@ static boolean requiresTimezoneDependentParsing(String format, String matchedDat
             }
         }
 
+        static boolean matchHasNanosecondPrecision(String format, String matchedDate) { // is more than ms precision needed?
+            switch (format) {
+                case "ISO8601": // inspect the matched text itself rather than the format
+                    Matcher matcher = FRACTIONAL_SECOND_INTERPRETER.matcher(matchedDate);
+                    return matcher.find() && matcher.group(2).length() > 3; // group 2 = fractional digits; true if more than 3
+                case "UNIX_MS":
+                case "UNIX":
+                    return false; // these formats never report nanosecond precision
+                case "TAI64N":
+                    return true; // this format always reports nanosecond precision
+                default: // any other format string: true if it has a run of more than 3 unquoted 'S' letters
+                    boolean notQuoted = true;
+                    int consecutiveSs = 0;
+                    for (int pos = 0; pos < format.length(); ++pos) {
+                        char curChar = format.charAt(pos);
+                        if (curChar == '\'') { // a single quote toggles quoted-literal state and ends any run of S's
+                            notQuoted = !notQuoted;
+                            consecutiveSs = 0;
+                        } else if (notQuoted) {
+                            if (curChar == 'S') {
+                                if (++consecutiveSs > 3) { // fourth consecutive S found
+                                    return true;
+                                }
+                            } else {
+                                consecutiveSs = 0; // any other unquoted character breaks the run
+                            }
+                        }
+                    }
+                    return false;
+            }
+        }
+
         static int[] parseIndeterminateDateNumbers(String matchedDate, List<String> rawJavaTimestampFormats) {
             int[] indeterminateDateNumbers = { -1, -1 };
 
@@ -1368,7 +1428,6 @@ public String toString() {
      */
     static final class CandidateTimestampFormat {
 
-        private static final Pattern FRACTIONAL_SECOND_INTERPRETER = Pattern.compile("([" + FRACTIONAL_SECOND_SEPARATORS + "])(\\d{3,9})$");
         // This means that in the case of a literal Z, XXX is preferred
         private static final Pattern TRAILING_OFFSET_WITHOUT_COLON_FINDER = Pattern.compile("[+-]\\d{4}$");
 

diff --git a/.../src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/XmlFileStructureFinder.java b/.../src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/XmlFileStructureFinder.java
@@ -104,7 +104,7 @@ static XmlFileStructureFinder makeXmlFileStructureFinder(List<String> explanatio
                 .setNeedClientTimezone(needClientTimeZone)
                 .setIngestPipeline(FileStructureUtils.makeIngestPipelineDefinition(null, Collections.emptyMap(), null,
                     Collections.emptyMap(), topLevelTag + "." + timeField.v1(), timeField.v2().getJavaTimestampFormats(),
-                    needClientTimeZone));
+                    needClientTimeZone, timeField.v2().needNanosecondPrecision()));
         }
 
         Tuple<SortedMap<String, Object>, SortedMap<String, FieldStats>> mappingsAndFieldStats =
@@ -114,14 +114,14 @@ static XmlFileStructureFinder makeXmlFileStructureFinder(List<String> explanatio
             structureBuilder.setFieldStats(mappingsAndFieldStats.v2());
         }
 
-        SortedMap<String, Object> innerMappings = mappingsAndFieldStats.v1();
+        Map<String, Object> innerMappings = mappingsAndFieldStats.v1();
         Map<String, Object> secondLevelProperties = new LinkedHashMap<>();
         secondLevelProperties.put(FileStructureUtils.MAPPING_TYPE_SETTING, "object");
         secondLevelProperties.put(FileStructureUtils.MAPPING_PROPERTIES_SETTING, innerMappings);
         SortedMap<String, Object> outerMappings = new TreeMap<>();
         outerMappings.put(topLevelTag, secondLevelProperties);
         if (timeField != null) {
-            outerMappings.put(FileStructureUtils.DEFAULT_TIMESTAMP_FIELD, FileStructureUtils.DATE_MAPPING_WITHOUT_FORMAT);
+            outerMappings.put(FileStructureUtils.DEFAULT_TIMESTAMP_FIELD, timeField.v2().getEsDateMappingTypeWithoutFormat());
         }
 
         FileStructure structure = structureBuilder