Skip to content

Commit

Permalink
refactor: use functions to infer types in csv
Browse files Browse the repository at this point in the history
  • Loading branch information
CookiePieWw committed Sep 30, 2024
1 parent f0e39cc commit 981316f
Showing 1 changed file with 20 additions and 12 deletions.
32 changes: 20 additions & 12 deletions arrow-csv/src/reader/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -133,7 +133,7 @@ use arrow_schema::*;
use chrono::{TimeZone, Utc};
use csv::StringRecord;
use lazy_static::lazy_static;
use regex::{Regex, RegexSet};
use regex::Regex;
use std::fmt::{self, Debug};
use std::fs::File;
use std::io::{BufRead, BufReader as StdBufReader, Read, Seek, SeekFrom};
Expand All @@ -145,16 +145,24 @@ use arrow_array::timezone::Tz;

lazy_static! {
// Type-inference tables consulted by `InferredDataType::update`. The index of
// the first entry that accepts a field is used there as a bit position
// (`1 << m`); bit 8 is used for Utf8 when nothing matches or the field starts
// with a double quote. Entry order therefore determines which bit is set.

/// Order should match [`InferredDataType`]
static ref REGEX_SET: RegexSet = RegexSet::new([
r"(?i)^(true)$|^(false)$(?-i)", //BOOLEAN
r"^-?(\d+)$", //INTEGER
r"^-?((\d*\.\d+|\d+\.\d*)([eE][-+]?\d+)?|\d+([eE][-+]?\d+))$", //DECIMAL
r"^\d{4}-\d\d-\d\d$", //DATE32
r"^\d{4}-\d\d-\d\d[T ]\d\d:\d\d:\d\d(?:[^\d\.].*)?$", //Timestamp(Second)
r"^\d{4}-\d\d-\d\d[T ]\d\d:\d\d:\d\d\.\d{1,3}(?:[^\d].*)?$", //Timestamp(Millisecond)
r"^\d{4}-\d\d-\d\d[T ]\d\d:\d\d:\d\d\.\d{1,6}(?:[^\d].*)?$", //Timestamp(Microsecond)
r"^\d{4}-\d\d-\d\d[T ]\d\d:\d\d:\d\d\.\d{1,9}(?:[^\d].*)?$", //Timestamp(Nanosecond)
]).unwrap();
/// One predicate per inferred type, in the same order as the patterns of
/// `REGEX_SET`. Each returns `true` when the field text looks like that type;
/// `update` takes the position of the first predicate that accepts the field.
static ref INFER_TYPE_FUNC: [fn(&str) -> bool; 8] = [
// Case-insensitive (ASCII) comparison against the two boolean literals.
|s| s.eq_ignore_ascii_case("true") || s.eq_ignore_ascii_case("false"), // Boolean
// `str::parse::<i64>` also accepts a leading `+` and rejects values outside
// the `i64` range; such out-of-range digit strings fall through to the
// Decimal predicate, whose regex has an integer-only alternative.
|s| s.parse::<i64>().is_ok(), // Integer
// TODO: replace regex with functions.
|s| DECIMAL_REGEX.is_match(s), // Decimal
|s| DATE32_REGEX.is_match(s), // Date32
// The timestamp predicates are tried in order, so a fractional part of 1-3
// digits is claimed by Millisecond, 4-6 by Microsecond and 7-9 by Nanosecond.
|s| TIMESTAMP_SECOND_REGEX.is_match(s), // Timestamp(Second)
|s| TIMESTAMP_MILLISECOND_REGEX.is_match(s), // Timestamp(Millisecond)
|s| TIMESTAMP_MICROSECOND_REGEX.is_match(s), // Timestamp(Microsecond)
|s| TIMESTAMP_NANOSECOND_REGEX.is_match(s), // Timestamp(Nanosecond)
];

/// Matches either a number with a fractional part and/or an exponent, or (second
/// alternative) a plain optionally-negative run of digits. Both allow only `-`
/// as a leading sign.
static ref DECIMAL_REGEX: Regex = Regex::new(r"(^-?((\d*\.\d+|\d+\.\d*)([eE][-+]?\d+)?|\d+([eE][-+]?\d+))$)|(^-?(\d+)$)").unwrap();
/// Matches exactly `dddd-dd-dd` (digits only; no range validation of the parts).
static ref DATE32_REGEX: Regex = Regex::new(r"^\d{4}-\d\d-\d\d$").unwrap();
/// Matches a date, a `T` or space separator, and `dd:dd:dd` with no fractional
/// seconds; any trailing text must begin with a character that is neither a
/// digit nor `.`.
static ref TIMESTAMP_SECOND_REGEX: Regex = Regex::new(r"^\d{4}-\d\d-\d\d[T ]\d\d:\d\d:\d\d(?:[^\d\.].*)?$").unwrap();
/// As the seconds pattern, but requires `.` followed by 1 to 3 fractional digits;
/// any trailing text must begin with a non-digit.
static ref TIMESTAMP_MILLISECOND_REGEX: Regex = Regex::new(r"^\d{4}-\d\d-\d\d[T ]\d\d:\d\d:\d\d\.\d{1,3}(?:[^\d].*)?$").unwrap();
/// As the milliseconds pattern, but allows 1 to 6 fractional digits.
static ref TIMESTAMP_MICROSECOND_REGEX: Regex = Regex::new(r"^\d{4}-\d\d-\d\d[T ]\d\d:\d\d:\d\d\.\d{1,6}(?:[^\d].*)?$").unwrap();
/// As the milliseconds pattern, but allows 1 to 9 fractional digits.
static ref TIMESTAMP_NANOSECOND_REGEX: Regex = Regex::new(r"^\d{4}-\d\d-\d\d[T ]\d\d:\d\d:\d\d\.\d{1,9}(?:[^\d].*)?$").unwrap();
}

/// A wrapper over `Option<Regex>` to check if the value is `NULL`.
Expand Down Expand Up @@ -214,7 +222,7 @@ impl InferredDataType {
fn update(&mut self, string: &str) {
self.packed |= if string.starts_with('"') {
1 << 8 // Utf8
} else if let Some(m) = REGEX_SET.matches(string).into_iter().next() {
} else if let Some(m) = INFER_TYPE_FUNC.iter().position(|f| f(string)) {
1 << m
} else {
1 << 8 // Utf8
Expand Down

0 comments on commit 981316f

Please sign in to comment.