Enabled CLI to read predefined and customized CSV files #480

Merged Jan 3, 2022 · 10 commits
Changes from 7 commits
2 changes: 2 additions & 0 deletions cli/build.gradle
@@ -34,6 +34,8 @@ dependencies {
implementation 'net.sf.jopt-simple:jopt-simple:[5.0,6.0)'
implementation 'org.jetbrains.kotlin:kotlin-stdlib-jdk8'

implementation 'org.apache.commons:commons-csv:1.8'

testImplementation 'org.junit.vintage:junit-vintage-engine:5.7.0'
}

35 changes: 30 additions & 5 deletions cli/src/org/partiql/cli/functions/ReadFile.kt
@@ -15,6 +15,7 @@
package org.partiql.cli.functions

import com.amazon.ion.*
import org.apache.commons.csv.CSVFormat
import org.partiql.lang.eval.*
import org.partiql.lang.eval.io.*
import org.partiql.lang.eval.io.DelimitedValues.ConversionMode
@@ -28,14 +29,30 @@ internal class ReadFile(valueFactory: ExprValueFactory) : BaseFunction(valueFact
ConversionMode.values().find { it.name.toLowerCase() == name } ?:
throw IllegalArgumentException( "Unknown conversion: $name")

private fun delimitedReadHandler(delimiter: Char): (InputStream, IonStruct) -> ExprValue = { input, options ->
private fun fileReadHandler(csvFormat: CSVFormat): (InputStream, IonStruct) -> ExprValue = { input, options ->
val encoding = options["encoding"]?.stringValue() ?: "UTF-8"
val reader = InputStreamReader(input, encoding)
val conversion = options["conversion"]?.stringValue() ?: "none"

val hasHeader = options["header"]?.booleanValue() ?: false
val ignoreEmptyLine = options["ignore_empty_line"]?.booleanValue() ?: true
val ignoreSurroundingSpace = options["ignore_surrounding_space"]?.booleanValue() ?: true
val trim = options["trim"]?.booleanValue() ?: true
val delimiter = options["delimiter"]?.stringValue()?.first() // CSVParser library only accepts a single character as delimiter
val record = options["line_breaker"]?.stringValue()
val escape = options["escape"]?.stringValue()?.first() // CSVParser library only accepts a single character as escape
val quote = options["quote"]?.stringValue()?.first() // CSVParser library only accepts a single character as quote

val reader = InputStreamReader(input, encoding)
val csvFormat = csvFormat.let{ it.withIgnoreEmptyLines(ignoreEmptyLine) }
.let{ it.withIgnoreSurroundingSpaces(ignoreSurroundingSpace) }
.let{ it.withTrim(trim) }
.let { if (hasHeader) it.withFirstRecordAsHeader() else it }
Member comment: (sorry missed this in the initial review) nit: the `.let` calls on these lines are redundant

.let { if (delimiter != null) it.withDelimiter(delimiter) else it }
.let { if (record != null) it.withRecordSeparator(record) else it }
.let { if (escape != null) it.withEscape(escape) else it }
.let { if (quote != null) it.withQuote(quote) else it }

DelimitedValues.exprValue(valueFactory, reader, delimiter, hasHeader, conversionModeFor(conversion))
DelimitedValues.exprValue(valueFactory, reader, csvFormat, conversionModeFor(conversion))
}
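Below is a minimal sketch of the simplification the reviewer is suggesting, assuming the commons-csv 1.8 `CSVFormat` builder methods already used in this diff; the helper function and its parameters are illustrative only and not part of the PR:

```kotlin
import org.apache.commons.csv.CSVFormat

// Illustrative helper (not part of this PR): chain the unconditional builder
// calls directly and keep `.let` only where the call depends on a nullable option.
fun buildCsvFormat(
    base: CSVFormat,
    ignoreEmptyLine: Boolean,
    ignoreSurroundingSpace: Boolean,
    trim: Boolean,
    hasHeader: Boolean,
    delimiter: Char?,
    lineBreaker: String?,
    escape: Char?,
    quote: Char?
): CSVFormat = base
    .withIgnoreEmptyLines(ignoreEmptyLine)
    .withIgnoreSurroundingSpaces(ignoreSurroundingSpace)
    .withTrim(trim)
    .let { if (hasHeader) it.withFirstRecordAsHeader() else it }
    .let { if (delimiter != null) it.withDelimiter(delimiter) else it }
    .let { if (lineBreaker != null) it.withRecordSeparator(lineBreaker) else it }
    .let { if (escape != null) it.withEscape(escape) else it }
    .let { if (quote != null) it.withQuote(quote) else it }
```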

private fun ionReadHandler(): (InputStream, IonStruct) -> ExprValue = { input, _ ->
@@ -44,8 +61,16 @@ internal class ReadFile(valueFactory: ExprValueFactory) : BaseFunction(valueFact

private val readHandlers = mapOf(
"ion" to ionReadHandler(),
"tsv" to delimitedReadHandler('\t'),
"csv" to delimitedReadHandler(','))
"csv" to fileReadHandler(CSVFormat.DEFAULT),
"tsv" to fileReadHandler(CSVFormat.DEFAULT.withDelimiter('\t')),
"excel_csv" to fileReadHandler(CSVFormat.EXCEL),
"mysql_csv" to fileReadHandler(CSVFormat.MYSQL),
"mongodb_csv" to fileReadHandler(CSVFormat.MONGODB_CSV),
"mongodb_tsv" to fileReadHandler(CSVFormat.MONGODB_TSV),
"postgresql_csv" to fileReadHandler(CSVFormat.POSTGRESQL_CSV),
"postgresql_text" to fileReadHandler(CSVFormat.POSTGRESQL_TEXT),
"customized" to fileReadHandler(CSVFormat.DEFAULT)
)

override fun call(env: Environment, args: List<ExprValue>): ExprValue {
val options = optionsStruct(1, args)
96 changes: 96 additions & 0 deletions cli/test/org/partiql/cli/functions/ReadFileTest.kt
@@ -151,4 +151,100 @@ class ReadFileTest {

assertEquals(ion.singleValue(expected), actual)
}

@Test
fun readExcelCsvFile() {
writeFile("simple_excel.csv", "title,category,price\nharry potter,book,7.99")

val args = listOf("\"${dirPath("simple_excel.csv")}\"", "{type:\"excel_csv\", header:true}").map { it.exprValue() }

val actual = function.call(env, args).ionValue
val expected = "[{title:\"harry potter\",category:\"book\",price:\"7.99\"}]"

assertEquals(ion.singleValue(expected), actual)
}

@Test
fun readPostgreSQLCsvFile() {
writeFile("simple_postgresql.csv", "id,name,balance\n1,Bob,10000.00")

val args = listOf("\"${dirPath("simple_postgresql.csv")}\"", "{type:\"postgresql_csv\", header:true}").map { it.exprValue() }

val actual = function.call(env, args).ionValue
val expected = "[{id:\"1\",name:\"Bob\",balance:\"10000.00\"}]"

assertEquals(ion.singleValue(expected), actual)
}

@Test
fun readCustomizedCsvFile1() { // delimiter
writeFile("customized.csv", "id name balance\n1 Bob 10000.00")

val args = listOf("\"${dirPath("customized.csv")}\"", "{type:\"customized\", header:true, delimiter:' '}").map { it.exprValue() }

val actual = function.call(env, args).ionValue
val expected = "[{id:\"1\",name:\"Bob\",balance:\"10000.00\"}]"

assertEquals(ion.singleValue(expected), actual)
}

@Test
fun readCustomizedCsvFile2() { // ignore_empty_line
writeFile("customized.csv", "id,name,balance\n\n1,Bob,10000.00")

val args = listOf("\"${dirPath("customized.csv")}\"", "{type:\"customized\", header:true, ignore_empty_line: false}").map { it.exprValue() }

val actual = function.call(env, args).ionValue
val expected = "[{id:\"\"},{id:\"1\",name:\"Bob\",balance:\"10000.00\"}]"

assertEquals(ion.singleValue(expected), actual)
}

@Test
fun readCustomizedCsvFile3() { // trim and ignore_surrounding_space
writeFile("customized.csv", "id,name,balance\n 1 , Bob , 10000.00 ")

val args = listOf("\"${dirPath("customized.csv")}\"", "{type:\"customized\", header:true, ignore_surrounding_space:false, trim:false}").map { it.exprValue() }

val actual = function.call(env, args).ionValue
val expected = "[{id:\" 1 \",name:\" Bob \",balance:\" 10000.00 \"}]"

assertEquals(ion.singleValue(expected), actual)
}

@Test
fun readCustomizedCsvFile4() { // line_breaker
writeFile("customized.csv", "id,name,balance\r\n1,Bob,10000.00")

val args = listOf("\"${dirPath("customized.csv")}\"", "{type:\"customized\", header:true, line_breaker:'\\\r\\\n'}").map { it.exprValue() }

val actual = function.call(env, args).ionValue
val expected = "[{id:\"1\",name:\"Bob\",balance:\"10000.00\"}]"

assertEquals(ion.singleValue(expected), actual)
}

@Test
fun readCustomizedCsvFile5() { // escape
writeFile("customized.csv", "id,name,balance\n\"/\"1\",Bob,10000.00")

val args = listOf("\"${dirPath("customized.csv")}\"", "{type:\"customized\", header:true, escape:'/'}").map { it.exprValue() }

val actual = function.call(env, args).ionValue
val expected = "[{id:\"\\\"1\",name:\"Bob\",balance:\"10000.00\"}]"

assertEquals(ion.singleValue(expected), actual)
}

@Test
fun readCustomizedCsvFile6() { // quote
writeFile("customized.csv", "id,name,balance\n'1,',Bob,10000.00")

val args = listOf("\"${dirPath("customized.csv")}\"", "{type:\"customized\", header:true, quote:\"'\"}").map { it.exprValue() }

val actual = function.call(env, args).ionValue
val expected = "[{id:\"1,\",name:\"Bob\",balance:\"10000.00\"}]"

assertEquals(ion.singleValue(expected), actual)
}
}
32 changes: 32 additions & 0 deletions docs/user/CLI.md
@@ -641,3 +641,35 @@ Kumo dog
Mochi dog
Lilikoi unicorn
```

## Predefined CSV Data

The `read_file` function provides options to read other predefined CSV data formats.
For example, if a CSV file is exported from PostgreSQL, we can use the following command
to read the file:
```
read_file('simple_postgresql.csv', {'type':'postgresql_csv'})
```
Other available options for the argument `type` besides `postgresql_csv` are `excel_csv`, `mysql_csv`, `mongodb_csv`, `mongodb_tsv`, and `postgresql_text`.
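A header row can be combined with a predefined format in the same way; for example, mirroring the `readExcelCsvFile` test above:
```
read_file('simple_excel.csv', {'type':'excel_csv', 'header':true})
```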

## Customized CSV Data
The `read_file` function also provides options to read customized CSV data formats.
For example, suppose we have a data file where whitespace is the separator, as shown below:
```
title category price
harry_potter book 7.99
dot electronics 49.99
echo electronics 99.99
```
We can use the following command to read the file:
```
read_file('customized.csv', {'type':'customized', 'delimiter':' ', 'header':true})
```
All the available options for customized CSV files are listed below (a combined example follows the list):
1. Ignore empty lines: `'ignore_empty_line':true`
2. Ignore spaces surrounding the delimiter: `'ignore_surrounding_space':true`
3. Trim leading and trailing blanks: `'trim':true`
4. Set the line breaker (only `'\r'`, `'\n'`, and `'\r\n'` are supported): `'line_breaker':'\n'`
5. Set the escape character (single character only): `'escape':'\'`
6. Set the quote character (single character only): `'quote':'"'`
7. Set the delimiter character (single character only): `'delimiter':','`
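
For illustration, several of these options can be combined in a single call (the file name and option values below are hypothetical):
```
read_file('customized.csv', {'type':'customized', 'header':true, 'delimiter':'|', 'quote':'"', 'escape':'/'})
```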
Member comment: Out of curiosity, is there a reason for this set of customized CSV parsing options? I saw there were some other options CSVFormat supports.

Contributor Author comment: Which other options? I think these are all the options needed to customize the format of a CSV file.

Member comment: From the CSVFormat constructor and CSVFormat.Builder (documentation here), there are about 25 total configuration options, like nullString, recordSeparator, quoteMode, skipHeaderRecord, etc.

Contributor Author comment (@lziq, Jan 3, 2022): I think the other options are not as important for configuring the CSV format. If we find we need any of them in the future, we can create a ticket and make the corresponding enhancement.
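
For reference, a hedged sketch of a few of the additional `CSVFormat` options mentioned above (commons-csv 1.8 API); none of these are exposed through `read_file` by this PR:

```kotlin
import org.apache.commons.csv.CSVFormat
import org.apache.commons.csv.QuoteMode

// Illustrative only: CSVFormat options discussed in review but not wired into read_file.
val format: CSVFormat = CSVFormat.DEFAULT
    .withNullString("NULL")           // parse/print this token as null
    .withRecordSeparator("\r\n")      // record separator used when printing
    .withQuoteMode(QuoteMode.MINIMAL) // quote fields only when necessary
    .withSkipHeaderRecord(true)       // skip the header record when parsing with a header
```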

13 changes: 3 additions & 10 deletions lang/src/org/partiql/lang/eval/io/DelimitedValues.kt
@@ -59,25 +59,18 @@
* Lazily loads a stream of values from a [Reader] into a sequence backed [ExprValue].
* This does **not** close the [Reader].
*
* @param ion The system to use.
* @param input The input source.
* @param delimiter The delimiter to use between columns.
* @param hasHeader Whether the first row of the delimited input defines the columns.
* @param csvFormat The CSV format to use when parsing the input.
* @param conversionMode How column text should be converted.
*/
@JvmStatic
fun exprValue(valueFactory: ExprValueFactory,
input: Reader,
delimiter: Char,
hasHeader: Boolean,
csvFormat: CSVFormat,
conversionMode: ConversionMode): ExprValue {
val reader = BufferedReader(input)
val csvFormat = when (hasHeader){
true -> CSVFormat.DEFAULT.withDelimiter(delimiter).withFirstRecordAsHeader()
false -> CSVFormat.DEFAULT.withDelimiter(delimiter)
}
val csvParser = CSVParser(reader, csvFormat)
val columns: List<String> = csvParser.headerNames // `columns` is an empty list when `hasHeader` is false
val columns: List<String> = csvParser.headerNames

val seq = csvParser.asSequence().map { csvRecord ->
valueFactory.newStruct(
21 changes: 8 additions & 13 deletions lang/test/org/partiql/lang/eval/io/DelimitedValuesTest.kt
@@ -14,6 +14,7 @@

package org.partiql.lang.eval.io

import org.apache.commons.csv.CSVFormat
import org.partiql.lang.*
import org.partiql.lang.eval.io.DelimitedValues.ConversionMode
import org.partiql.lang.eval.io.DelimitedValues.ConversionMode.*
@@ -32,10 +33,9 @@ class DelimitedValuesTest : TestBase() {
}

private fun read(text: String,
delimiter: Char,
hasHeader: Boolean,
csvFormat: CSVFormat,
conversionMode: ConversionMode): ExprValue =
DelimitedValues.exprValue(valueFactory, StringReader(text), delimiter, hasHeader, conversionMode)
DelimitedValues.exprValue(valueFactory, StringReader(text), csvFormat, conversionMode)

private fun assertWrite(expectedText: String,
valueText: String,
@@ -78,8 +78,7 @@
"""[]""",
read(
"",
delimiter = ',',
hasHeader = false,
CSVFormat.DEFAULT,
conversionMode = NONE
)
)
@@ -89,8 +88,7 @@
"""[]""",
read(
"",
delimiter = ',',
hasHeader = false,
CSVFormat.DEFAULT,
conversionMode = AUTO
)
)
@@ -100,8 +98,7 @@
"""[{_1: "1", _2: "2", _3: "3"}]""",
read(
"""1,2,3""",
delimiter = ',',
hasHeader = false,
CSVFormat.DEFAULT,
conversionMode = NONE
)
)
@@ -119,8 +116,7 @@
|1.0,2e0,2007-10-10T12:00:00Z
|hello,{,}
""".trimMargin(),
delimiter = ',',
hasHeader = false,
CSVFormat.DEFAULT,
conversionMode = AUTO
)
)
@@ -139,8 +135,7 @@
|1.0,2e0,2007-10-10T12:00:00Z
|hello,{,}
""".trimMargin(),
delimiter = ',',
hasHeader = true,
CSVFormat.DEFAULT.withFirstRecordAsHeader(),
conversionMode = AUTO
)
)