diff --git a/arrow-csv/src/reader/mod.rs b/arrow-csv/src/reader/mod.rs index 36f80ec90a95..4ab040abf4dc 100644 --- a/arrow-csv/src/reader/mod.rs +++ b/arrow-csv/src/reader/mod.rs @@ -133,7 +133,7 @@ use arrow_schema::*; use chrono::{TimeZone, Utc}; use csv::StringRecord; use lazy_static::lazy_static; -use regex::{Regex, RegexSet}; +use regex::Regex; use std::fmt::{self, Debug}; use std::fs::File; use std::io::{BufRead, BufReader as StdBufReader, Read, Seek, SeekFrom}; @@ -145,16 +145,24 @@ use arrow_array::timezone::Tz; lazy_static! { /// Order should match [`InferredDataType`] - static ref REGEX_SET: RegexSet = RegexSet::new([ - r"(?i)^(true)$|^(false)$(?-i)", //BOOLEAN - r"^-?(\d+)$", //INTEGER - r"^-?((\d*\.\d+|\d+\.\d*)([eE][-+]?\d+)?|\d+([eE][-+]?\d+))$", //DECIMAL - r"^\d{4}-\d\d-\d\d$", //DATE32 - r"^\d{4}-\d\d-\d\d[T ]\d\d:\d\d:\d\d(?:[^\d\.].*)?$", //Timestamp(Second) - r"^\d{4}-\d\d-\d\d[T ]\d\d:\d\d:\d\d\.\d{1,3}(?:[^\d].*)?$", //Timestamp(Millisecond) - r"^\d{4}-\d\d-\d\d[T ]\d\d:\d\d:\d\d\.\d{1,6}(?:[^\d].*)?$", //Timestamp(Microsecond) - r"^\d{4}-\d\d-\d\d[T ]\d\d:\d\d:\d\d\.\d{1,9}(?:[^\d].*)?$", //Timestamp(Nanosecond) - ]).unwrap(); + static ref INFER_TYPE_FUNC: [fn(&str) -> bool; 8] = [ + |s| s.eq_ignore_ascii_case("true") || s.eq_ignore_ascii_case("false"), // Boolean + |s| s.parse::<i64>().is_ok(), // Integer + // TODO: replace regex with functions. 
+ |s| DECIMAL_REGEX.is_match(s), // Decimal + |s| DATE32_REGEX.is_match(s), // Date32 + |s| TIMESTAMP_SECOND_REGEX.is_match(s), // Timestamp(Second) + |s| TIMESTAMP_MILLISECOND_REGEX.is_match(s), // Timestamp(Millisecond) + |s| TIMESTAMP_MICROSECOND_REGEX.is_match(s), // Timestamp(Microsecond) + |s| TIMESTAMP_NANOSECOND_REGEX.is_match(s), // Timestamp(Nanosecond) + ]; + + static ref DECIMAL_REGEX: Regex = Regex::new(r"(^-?((\d*\.\d+|\d+\.\d*)([eE][-+]?\d+)?|\d+([eE][-+]?\d+))$)|(^-?(\d+)$)").unwrap(); + static ref DATE32_REGEX: Regex = Regex::new(r"^\d{4}-\d\d-\d\d$").unwrap(); + static ref TIMESTAMP_SECOND_REGEX: Regex = Regex::new(r"^\d{4}-\d\d-\d\d[T ]\d\d:\d\d:\d\d(?:[^\d\.].*)?$").unwrap(); + static ref TIMESTAMP_MILLISECOND_REGEX: Regex = Regex::new(r"^\d{4}-\d\d-\d\d[T ]\d\d:\d\d:\d\d\.\d{1,3}(?:[^\d].*)?$").unwrap(); + static ref TIMESTAMP_MICROSECOND_REGEX: Regex = Regex::new(r"^\d{4}-\d\d-\d\d[T ]\d\d:\d\d:\d\d\.\d{1,6}(?:[^\d].*)?$").unwrap(); + static ref TIMESTAMP_NANOSECOND_REGEX: Regex = Regex::new(r"^\d{4}-\d\d-\d\d[T ]\d\d:\d\d:\d\d\.\d{1,9}(?:[^\d].*)?$").unwrap(); } /// A wrapper over `Option` to check if the value is `NULL`. @@ -214,7 +222,7 @@ impl InferredDataType { fn update(&mut self, string: &str) { self.packed |= if string.starts_with('"') { 1 << 8 // Utf8 - } else if let Some(m) = REGEX_SET.matches(string).into_iter().next() { + } else if let Some(m) = INFER_TYPE_FUNC.iter().position(|f| f(string)) { 1 << m } else { 1 << 8 // Utf8