feat: Allow specifying date format for parsing
Harivignesh3419 authored and nightscape committed Jul 25, 2024
1 parent e50e8ea commit 1d1a644
Showing 4 changed files with 26 additions and 7 deletions.
README.md (7 changes: 4 additions & 3 deletions)
Apart from the new `dateFormat` reader option, the README changes are whitespace-only: the old and new lines read identically in this view.

````diff
@@ -4,7 +4,7 @@ A library for querying Excel files with Apache Spark, for Spark SQL and DataFram
 
 [![Build Status](https://github.com/crealytics/spark-excel/workflows/CI/badge.svg)](https://github.com/crealytics/spark-excel/actions)
 [![Maven Central](https://maven-badges.herokuapp.com/maven-central/com.crealytics/spark-excel_2.12/badge.svg)](https://maven-badges.herokuapp.com/maven-central/com.crealytics/spark-excel_2.12)
-
+
 ## Co-maintainers wanted
 Due to personal and professional constraints, the development of this library has been rather slow.
@@ -61,7 +61,7 @@ $SPARK_HOME/bin/spark-shell --packages com.crealytics:spark-excel_2.11:<spark-ve
 * This package allows querying Excel spreadsheets as [Spark DataFrames](https://spark.apache.org/docs/latest/sql-programming-guide.html).
 * From spark-excel [0.14.0](https://github.com/crealytics/spark-excel/releases/tag/v0.14.0) (August 24, 2021), there are two implementations of spark-excel
   * Original Spark-Excel with Spark data source API 1.0
-  * Spark-Excel V2 with data source API V2.0+, which supports loading from multiple files, corrupted record handling and some improvements in handling data types.
+  * Spark-Excel V2 with data source API V2.0+, which supports loading from multiple files, corrupted record handling and some improvements in handling data types.
     See below for further details
 
 To use the V2 implementation, just change your .format from `.format("com.crealytics.spark.excel")` to `.format("excel")`.
@@ -89,6 +89,7 @@ val df = spark.read
   .option("inferSchema", "false") // Optional, default: false
   .option("addColorColumns", "true") // Optional, default: false
   .option("timestampFormat", "MM-dd-yyyy HH:mm:ss") // Optional, default: yyyy-mm-dd hh:mm:ss[.fffffffff]
+  .option("dateFormat", "yyyyMMdd") // Optional, default: yyyy-MM-dd
   .option("maxRowsInMemory", 20) // Optional, default None. If set, uses a streaming reader which can help with big files (will fail if used with xls format files)
   .option("maxByteArraySize", 2147483647) // Optional, default None. See https://poi.apache.org/apidocs/5.0/org/apache/poi/util/IOUtils.html#setByteArrayMaxOverride-int-
   .option("tempFileThreshold", 10000000) // Optional, default None. Number of bytes at which a zip entry is regarded as too large for holding in memory and the data is put in a temp file instead
@@ -224,7 +225,7 @@ spark.read
 
 
 Because folders are supported you can read/write from/to a "partitioned" folder structure, just
-the same way as csv or parquet. Note that writing partitioned structures is only
+the same way as csv or parquet. Note that writing partitioned structures is only
 available for spark >=3.0.1
 
 ````scala
````
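Taken together, the README changes document a single new reader option. As a hedged sketch of how it would be used end to end — the file path, schema, and data are hypothetical; only the `dateFormat` option name and its `yyyyMMdd` example come from the diff:

````scala
// Usage sketch of the new "dateFormat" option. Path, schema, and data are
// hypothetical; the option name and pattern come from the README diff.
import org.apache.spark.sql.SparkSession

val spark = SparkSession.builder().appName("date-format-demo").getOrCreate()

val df = spark.read
  .format("com.crealytics.spark.excel")
  .option("header", "true")
  .option("dateFormat", "yyyyMMdd")     // text cells like "20240725" parse into DateType
  .schema("order_id INT, shipped DATE") // DATE column triggers the new parsing path
  .load("/data/orders.xlsx")            // hypothetical path

df.show()
````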
src/main/scala/com/crealytics/spark/excel/DataColumn.scala (12 changes: 9 additions & 3 deletions)
````diff
@@ -19,7 +19,7 @@ import org.apache.poi.ss.usermodel.{Cell, CellType, DataFormatter, DateUtil}
 import org.apache.spark.sql.types._
 
 import java.math.BigDecimal
-import java.sql.Timestamp
+import java.sql.{Date, Timestamp}
 import scala.util.{Failure, Success, Try}
 
 trait DataColumn extends PartialFunction[Seq[Cell], Any] {
@@ -37,6 +37,7 @@ class HeaderDataColumn(
   treatEmptyValuesAsNulls: Boolean,
   usePlainNumberFormat: Boolean,
   parseTimestamp: String => Timestamp,
+  parseDate: String => Date,
   setErrorCellsToFallbackValues: Boolean
 ) extends DataColumn {
   def name: String = field.name
@@ -113,8 +114,13 @@ class HeaderDataColumn(
           case _ => stringValue.filter(_.trim.nonEmpty).map(parseTimestamp)
         }
       case _: DateType =>
-        if (cellType == CellType.ERROR) Some(new java.sql.Date(0))
-        else numericValue.map(n => new java.sql.Date(DateUtil.getJavaDate(n).getTime))
+        cellType match {
+          case CellType.ERROR => Some(new java.sql.Date(0))
+          case CellType.NUMERIC | CellType.FORMULA =>
+            numericValue.map(n => new Date(DateUtil.getJavaDate(n).getTime))
+          case _ => stringValue.filter(_.trim.nonEmpty).map(parseDate)
+        }
+
       case _: StringType =>
         if (cellType == CellType.ERROR) Some("")
         else stringValue.filterNot(_.isEmpty && treatEmptyValuesAsNulls)
````
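The rewritten `DateType` branch splits the conversion by cell type: numeric and formula cells still go through POI's `DateUtil`, since Excel stores dates as day serial numbers, while string cells now fall through to the caller-supplied `parseDate`. A self-contained sketch of the two paths, assuming the `yyyyMMdd` pattern from the README example:

````scala
// Illustrative sketch (not library code) of the two date-conversion paths.
import java.sql.Date
import java.text.SimpleDateFormat
import org.apache.poi.ss.usermodel.DateUtil

val parseDate: String => Date = {
  val fmt = new SimpleDateFormat("yyyyMMdd") // assumed pattern
  s => new Date(fmt.parse(s).getTime)
}

// NUMERIC/FORMULA path: Excel serial 45498.0 corresponds to 2024-07-25.
val fromNumeric = new Date(DateUtil.getJavaDate(45498.0).getTime)

// String path: the same date stored as text in the cell.
val fromText = parseDate("20240725")
````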
src/main/scala/com/crealytics/spark/excel/DefaultSource.scala (1 change: 1 addition & 0 deletions)
````diff
@@ -47,6 +47,7 @@ class DefaultSource extends RelationProvider with SchemaRelationProvider with Cr
       inferSheetSchema = parameters.get("inferSchema").fold(false)(_.toBoolean),
       addColorColumns = parameters.get("addColorColumns").fold(false)(_.toBoolean),
       timestampFormat = parameters.get("timestampFormat"),
+      dateFormat = parameters.get("dateFormat"),
       excerptSize = parameters.get("excerptSize").fold(10)(_.toInt),
       dataLocator = dataLocator,
       workbookReader = wbReader
````
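The `DefaultSource` change is pure plumbing: `dateFormat` stays a raw `Option[String]`, unlike the boolean and integer options that are decoded with `fold`. A minimal sketch of that decoding pattern, with a made-up parameter map:

````scala
// Sketch of the option-decoding pattern above; the parameter map is made up.
val parameters = Map("inferSchema" -> "true", "dateFormat" -> "yyyyMMdd")

val inferSheetSchema = parameters.get("inferSchema").fold(false)(_.toBoolean) // true
val dateFormat       = parameters.get("dateFormat")                           // Some("yyyyMMdd")
val excerptSize      = parameters.get("excerptSize").fold(10)(_.toInt)        // 10 (default)
````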
src/main/scala/com/crealytics/spark/excel/ExcelRelation.scala (13 changes: 12 additions & 1 deletion)
````diff
@@ -16,7 +16,7 @@
 
 package com.crealytics.spark.excel
 
-import java.sql.Timestamp
+import java.sql.{Date, Timestamp}
 import java.text.SimpleDateFormat
 
 import org.apache.poi.ss.usermodel.{Cell, CellType, DataFormatter, DateUtil, Row => _}
@@ -37,6 +37,7 @@ case class ExcelRelation(
   addColorColumns: Boolean = true,
   userSchema: Option[StructType] = None,
   timestampFormat: Option[String] = None,
+  dateFormat: Option[String] = None,
   excerptSize: Int = 10,
   workbookReader: WorkbookReader
 )(@transient val sqlContext: SQLContext)
@@ -63,6 +64,15 @@ case class ExcelRelation(
       }
       .getOrElse((stringValue: String) => Timestamp.valueOf(stringValue))
 
+  private val dateParser: String => Date = {
+    dateFormat
+      .map { fmt =>
+        val parser = new SimpleDateFormat(fmt)
+        (stringValue: String) => new java.sql.Date(parser.parse(stringValue).getTime)
+      }
+      .getOrElse((stringValue: String) => Date.valueOf(stringValue))
+  }
+
   val columnNameRegex = s"(?s)^(.*?)(_color)?$$".r.unanchored
   private def columnExtractor(column: String): SheetRow => Any = {
     val columnNameRegex(columnName, isColor) = column
@@ -179,6 +189,7 @@ case class ExcelRelation(
       treatEmptyValuesAsNulls,
       usePlainNumberFormat,
       timestampParser,
+      dateParser,
       setErrorCellsToFallbackValues
     )
 }
````
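`dateParser` mirrors the existing `timestampParser`: when `dateFormat` is set it builds one `SimpleDateFormat` and reuses it, and otherwise it falls back to `java.sql.Date.valueOf`, which accepts only the JDBC escape format `yyyy-[m]m-[d]d`. A quick illustration of the two branches (inputs are made up):

````scala
// Illustrative check of dateParser's two branches; inputs are made up.
import java.sql.Date
import java.text.SimpleDateFormat

// Branch 1: .option("dateFormat", "dd/MM/yyyy") was supplied.
val custom: String => Date = {
  val p = new SimpleDateFormat("dd/MM/yyyy")
  s => new Date(p.parse(s).getTime)
}
custom("25/07/2024")       // 2024-07-25

// Branch 2: no dateFormat given, so the JDBC escape format is required.
Date.valueOf("2024-07-25") // ok
// Date.valueOf("25/07/2024") // would throw IllegalArgumentException
````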
