From ec1f32f9b59c19bcfef84d2c09fe33a97f976e58 Mon Sep 17 00:00:00 2001 From: CookiePieWw Date: Mon, 30 Sep 2024 20:18:12 +0800 Subject: [PATCH 1/2] refactor: detect overflow for type inference --- arrow-csv/src/reader/mod.rs | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/arrow-csv/src/reader/mod.rs b/arrow-csv/src/reader/mod.rs index 36f80ec90a95..3f9371f199df 100644 --- a/arrow-csv/src/reader/mod.rs +++ b/arrow-csv/src/reader/mod.rs @@ -215,7 +215,12 @@ impl InferredDataType { self.packed |= if string.starts_with('"') { 1 << 8 // Utf8 } else if let Some(m) = REGEX_SET.matches(string).into_iter().next() { - 1 << m + if m == 1 && string.len() >= 19 && string.parse::<i64>().is_err() { + // if overflow i64, fallback to f64 + 1 << 2 + } else { + 1 << m + } } else { 1 << 8 // Utf8 } From 14e5353c9964aea14143a7cfd56e5dc07282c7b8 Mon Sep 17 00:00:00 2001 From: CookiePieWw Date: Wed, 2 Oct 2024 00:37:34 +0800 Subject: [PATCH 2/2] chore: fallback to utf8 and tests --- arrow-csv/src/reader/mod.rs | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/arrow-csv/src/reader/mod.rs b/arrow-csv/src/reader/mod.rs index 3f9371f199df..d81f1afee8ee 100644 --- a/arrow-csv/src/reader/mod.rs +++ b/arrow-csv/src/reader/mod.rs @@ -216,8 +216,8 @@ impl InferredDataType { 1 << 8 // Utf8 } else if let Some(m) = REGEX_SET.matches(string).into_iter().next() { if m == 1 && string.len() >= 19 && string.parse::<i64>().is_err() { - // if overflow i64, fallback to f64 - 1 << 2 + // if overflow i64, fallback to utf8 + 1 << 8 } else { 1 << m } @@ -1824,6 +1824,8 @@ mod tests { infer_field_schema("2021-12-19T13:12:30.123456789"), DataType::Timestamp(TimeUnit::Nanosecond, None) ); + assert_eq!(infer_field_schema("-9223372036854775809"), DataType::Utf8); + assert_eq!(infer_field_schema("9223372036854775808"), DataType::Utf8); } #[test]