Skip to content

Commit

Permalink
A different take on PR #303
Browse files Browse the repository at this point in the history
Add support for trailing text after the closing quote, and EOF without a
final closing quote, for Excel compatibility. Fix a unit test and add a
RAT exclude for the sample CSV file.
  • Loading branch information
garydgregory committed Mar 12, 2024
1 parent b069c2d commit c137e80
Show file tree
Hide file tree
Showing 6 changed files with 170 additions and 63 deletions.
4 changes: 2 additions & 2 deletions pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@
<version>67</version>
</parent>
<artifactId>commons-csv</artifactId>
<version>1.10.1-SNAPSHOT</version>
<version>1.11.0-SNAPSHOT</version>
<name>Apache Commons CSV</name>
<url>https://commons.apache.org/proper/commons-csv/</url>
<inceptionYear>2005</inceptionYear>
Expand Down Expand Up @@ -161,7 +161,7 @@
</distributionManagement>

<properties>
<commons.release.version>1.10.1</commons.release.version>
<commons.release.version>1.11.0</commons.release.version>
<commons.release.desc>(Java 8 or above)</commons.release.desc>
<!-- The RC version used in the staging repository URL. -->
<commons.rc.version>RC1</commons.rc.version>
Expand Down
5 changes: 4 additions & 1 deletion src/changes/changes.xml
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,9 @@
<body>
<release version="1.11.0" date="YYYY-MM-DD" description="Feature and bug fix release (Java 8 or above)">
<!-- ADD -->
<action issue="CSV-308" type="fix" dev="ggregory" due-to="Buddhi De Silva, Gary Gregory">[Javadoc] Add example to CSVFormat#setHeaderComments() #344.</action>
<action issue="CSV-308" type="add" dev="ggregory" due-to="Buddhi De Silva, Gary Gregory">[Javadoc] Add example to CSVFormat#setHeaderComments() #344.</action>
<action type="add" dev="ggregory" due-to="DamjanJovanovic, Gary Gregory">Add CSVFormat.Builder#setTrailingData(boolean) and use it in CSVFormat.EXCEL for Excel compatibility #303.</action>
<action type="add" dev="ggregory" due-to="DamjanJovanovic, Gary Gregory">Add CSVFormat.Builder#setLenientEof(boolean) and use it in CSVFormat.EXCEL for Excel compatibility #303.</action>
<!-- FIX -->
<action type="fix" issue="CSV-306" dev="ggregory" due-to="Sam Ng, Bruno P. Kinoshita">Replace deprecated method in user guide, update external link #324, #325.</action>
<action type="fix" dev="ggregory" due-to="Seth Falco, Bruno P. Kinoshita">Document duplicate header behavior #309.</action>
Expand All @@ -53,6 +55,7 @@
<action type="fix" issue="CSV-311" dev="ggregory" due-to="Christian Feuersaenger, Gary Gregory">OutOfMemory for very long rows despite using column value of type Reader.</action>
<action type="fix" dev="ggregory" due-to="Gary Gregory">Use try-with-resources to manage JDBC Clob in CSVPrinter.printRecords(ResultSet).</action>
<action type="fix" dev="ggregory" due-to="Gary Gregory">JDBC Blob columns are now output as Base64 instead of Object#toString(), which usually is InputStream#toString().</action>
<action type="fix" dev="ggregory" due-to="DamjanJovanovic, Gary Gregory">Support unusual Excel use cases: Add support for trailing data after the closing quote, and EOF without a final closing quote #303.</action>
<!-- UPDATE -->
<action type="update" dev="ggregory" due-to="Gary Gregory">Bump commons-io:commons-io: from 2.11.0 to 2.15.1.</action>
<action type="update" dev="ggregory" due-to="Gary Gregory, Dependabot">Bump commons-parent from 57 to 67.</action>
Expand Down
162 changes: 116 additions & 46 deletions src/main/java/org/apache/commons/csv/CSVFormat.java
Original file line number Diff line number Diff line change
Expand Up @@ -248,6 +248,10 @@ public static Builder create(final CSVFormat csvFormat) {

private boolean skipHeaderRecord;

private boolean lenientEof;

private boolean trailingData;

private boolean trailingDelimiter;

private boolean trim;
Expand All @@ -267,6 +271,8 @@ private Builder(final CSVFormat csvFormat) {
this.headers = csvFormat.headers;
this.skipHeaderRecord = csvFormat.skipHeaderRecord;
this.ignoreHeaderCase = csvFormat.ignoreHeaderCase;
this.lenientEof = csvFormat.lenientEof;
this.trailingData = csvFormat.trailingData;
this.trailingDelimiter = csvFormat.trailingDelimiter;
this.trim = csvFormat.trim;
this.autoFlush = csvFormat.autoFlush;
Expand Down Expand Up @@ -689,6 +695,18 @@ public Builder setIgnoreSurroundingSpaces(final boolean ignoreSurroundingSpaces)
return this;
}

/**
* Sets whether reading end-of-file is allowed even when the input is malformed, which helps Excel compatibility.
* <p>
* The value is stored on this builder and copied into the {@link CSVFormat} created from it. {@link CSVFormat#EXCEL} sets it to {@code true};
* {@link CSVFormat#DEFAULT} leaves it {@code false}.
* </p>
*
* @param lenientEof {@code true} to allow reading end-of-file even when the input is malformed, {@code false} otherwise.
* @return This instance.
* @since 1.11.0
*/
public Builder setLenientEof(final boolean lenientEof) {
// NOTE(review): presumably "malformed" means the input ends without a final closing quote - confirm against the parser.
this.lenientEof = lenientEof;
return this;
}

/**
* Sets the String to convert to and from {@code null}. No substitution occurs if {@code null}.
*
Expand Down Expand Up @@ -785,6 +803,18 @@ public Builder setSkipHeaderRecord(final boolean skipHeaderRecord) {
return this;
}

/**
* Sets whether reading trailing data is allowed in records, which helps Excel compatibility.
* <p>
* The value is stored on this builder and copied into the {@link CSVFormat} created from it. {@link CSVFormat#EXCEL} sets it to {@code true};
* {@link CSVFormat#DEFAULT} leaves it {@code false}.
* </p>
*
* @param trailingData {@code true} to allow reading trailing data in records, {@code false} otherwise.
* @return This instance.
* @since 1.11.0
*/
public Builder setTrailingData(final boolean trailingData) {
// NOTE(review): presumably "trailing data" is text that follows the closing quote of a quoted value - confirm against the parser.
this.trailingData = trailingData;
return this;
}

/**
* Sets whether to add a trailing delimiter.
*
Expand Down Expand Up @@ -914,7 +944,7 @@ public CSVFormat getFormat() {
* @see Predefined#Default
*/
public static final CSVFormat DEFAULT = new CSVFormat(COMMA, DOUBLE_QUOTE_CHAR, null, null, null, false, true, CRLF, null, null, null, false, false, false,
false, false, false, DuplicateHeaderMode.ALLOW_ALL);
false, false, false, DuplicateHeaderMode.ALLOW_ALL, false, false);

/**
* Excel file format (using a comma as the value delimiter). Note that the actual value delimiter used by Excel is locale-dependent, it might be necessary
Expand All @@ -935,9 +965,11 @@ public CSVFormat getFormat() {
* <li>{@code setDelimiter(',')}</li>
* <li>{@code setQuote('"')}</li>
* <li>{@code setRecordSeparator("\r\n")}</li>
* <li>{@code setDuplicateHeaderMode(DuplicateHeaderMode.ALLOW_ALL)}</li>
* <li>{@code setIgnoreEmptyLines(false)}</li>
* <li>{@code setAllowMissingColumnNames(true)}</li>
* <li>{@code setDuplicateHeaderMode(DuplicateHeaderMode.ALLOW_ALL)}</li>
* <li>{@code setTrailingData(true)}</li>
* <li>{@code setLenientEof(true)}</li>
* </ul>
* <p>
* Note: This is currently like {@link #RFC4180} plus {@link Builder#setAllowMissingColumnNames(boolean) Builder#setAllowMissingColumnNames(true)} and
Expand All @@ -950,6 +982,8 @@ public CSVFormat getFormat() {
public static final CSVFormat EXCEL = DEFAULT.builder()
.setIgnoreEmptyLines(false)
.setAllowMissingColumnNames(true)
// Excel compatibility: allow trailing data in records (presumably text after a closing quote - confirm against the parser).
.setTrailingData(true)
// Excel compatibility: allow end-of-file on malformed input (presumably a missing final closing quote - confirm against the parser).
.setLenientEof(true)
.build();
// @formatter:on

Expand Down Expand Up @@ -1372,7 +1406,7 @@ private static boolean isTrimChar(final CharSequence charSequence, final int pos
*/
public static CSVFormat newFormat(final char delimiter) {
return new CSVFormat(String.valueOf(delimiter), null, null, null, null, false, false, null, null, null, null, false, false, false, false, false, false,
DuplicateHeaderMode.ALLOW_ALL);
DuplicateHeaderMode.ALLOW_ALL, false, false);
}

static String[] toStringArray(final Object[] values) {
Expand Down Expand Up @@ -1455,6 +1489,10 @@ public static CSVFormat valueOf(final String format) {

private final boolean skipHeaderRecord;

private final boolean lenientEof;

private final boolean trailingData;

private final boolean trailingDelimiter;

private final boolean trim;
Expand All @@ -1474,6 +1512,8 @@ private CSVFormat(final Builder builder) {
this.headers = builder.headers;
this.skipHeaderRecord = builder.skipHeaderRecord;
this.ignoreHeaderCase = builder.ignoreHeaderCase;
this.lenientEof = builder.lenientEof;
this.trailingData = builder.trailingData;
this.trailingDelimiter = builder.trailingDelimiter;
this.trim = builder.trim;
this.autoFlush = builder.autoFlush;
Expand All @@ -1494,22 +1534,24 @@ private CSVFormat(final Builder builder) {
* @param ignoreEmptyLines {@code true} when the parser should skip empty lines.
* @param recordSeparator the line separator to use for output.
* @param nullString the String to convert to and from {@code null}.
* @param headerComments the comments to be printed by the Printer before the actual CSV data.
* @param header the header
* @param skipHeaderRecord if {@code true} the header row will be skipped
* @param allowMissingColumnNames if {@code true} the missing column names are allowed when parsing the header line
* @param ignoreHeaderCase if {@code true} header names will be accessed ignoring case when parsing input
* @param trim if {@code true} next record value will be trimmed
* @param trailingDelimiter if {@code true} the trailing delimiter wil be added before record separator (if set)
* @param autoFlush if {@code true} the underlying stream will be flushed before closing
* @param duplicateHeaderMode the behavior when handling duplicate headers
* @param headerComments the comments to be printed by the Printer before the actual CSV data.
* @param header the header.
* @param skipHeaderRecord if {@code true} the header row will be skipped.
* @param allowMissingColumnNames if {@code true} the missing column names are allowed when parsing the header line.
* @param ignoreHeaderCase if {@code true} header names will be accessed ignoring case when parsing input.
* @param trim if {@code true} next record value will be trimmed.
* @param trailingDelimiter if {@code true} the trailing delimiter will be added before record separator (if set).
* @param autoFlush if {@code true} the underlying stream will be flushed before closing.
* @param duplicateHeaderMode the behavior when handling duplicate headers.
* @param trailingData whether reading trailing data is allowed in records, helps Excel compatibility.
* @param lenientEof whether reading end-of-file is allowed even when input is malformed, helps Excel compatibility.
* @throws IllegalArgumentException if the delimiter is a line break character.
*/
private CSVFormat(final String delimiter, final Character quoteChar, final QuoteMode quoteMode, final Character commentStart, final Character escape,
final boolean ignoreSurroundingSpaces, final boolean ignoreEmptyLines, final String recordSeparator, final String nullString,
final Object[] headerComments, final String[] header, final boolean skipHeaderRecord, final boolean allowMissingColumnNames,
final boolean ignoreHeaderCase, final boolean trim, final boolean trailingDelimiter, final boolean autoFlush,
final DuplicateHeaderMode duplicateHeaderMode) {
final DuplicateHeaderMode duplicateHeaderMode, final boolean trailingData, final boolean lenientEof) {
this.delimiter = delimiter;
this.quoteCharacter = quoteChar;
this.quoteMode = quoteMode;
Expand All @@ -1524,6 +1566,8 @@ private CSVFormat(final String delimiter, final Character quoteChar, final Quote
this.headers = clone(header);
this.skipHeaderRecord = skipHeaderRecord;
this.ignoreHeaderCase = ignoreHeaderCase;
this.lenientEof = lenientEof;
this.trailingData = trailingData;
this.trailingDelimiter = trailingDelimiter;
this.trim = trim;
this.autoFlush = autoFlush;
Expand Down Expand Up @@ -1571,18 +1615,23 @@ public boolean equals(final Object obj) {
if (this == obj) {
return true;
}
if (obj == null || getClass() != obj.getClass()) {
if (obj == null) {
return false;
}
if (getClass() != obj.getClass()) {
return false;
}
final CSVFormat other = (CSVFormat) obj;
return duplicateHeaderMode == other.duplicateHeaderMode && allowMissingColumnNames == other.allowMissingColumnNames &&
autoFlush == other.autoFlush && Objects.equals(commentMarker, other.commentMarker) && Objects.equals(delimiter, other.delimiter) &&
Objects.equals(escapeCharacter, other.escapeCharacter) && Arrays.equals(headers, other.headers) &&
Arrays.equals(headerComments, other.headerComments) && ignoreEmptyLines == other.ignoreEmptyLines &&
ignoreHeaderCase == other.ignoreHeaderCase && ignoreSurroundingSpaces == other.ignoreSurroundingSpaces &&
Objects.equals(nullString, other.nullString) && Objects.equals(quoteCharacter, other.quoteCharacter) && quoteMode == other.quoteMode &&
Objects.equals(quotedNullString, other.quotedNullString) && Objects.equals(recordSeparator, other.recordSeparator) &&
skipHeaderRecord == other.skipHeaderRecord && trailingDelimiter == other.trailingDelimiter && trim == other.trim;
return allowMissingColumnNames == other.allowMissingColumnNames && autoFlush == other.autoFlush &&
Objects.equals(commentMarker, other.commentMarker) && Objects.equals(delimiter, other.delimiter) &&
duplicateHeaderMode == other.duplicateHeaderMode && Objects.equals(escapeCharacter, other.escapeCharacter) &&
Arrays.equals(headerComments, other.headerComments) && Arrays.equals(headers, other.headers) &&
ignoreEmptyLines == other.ignoreEmptyLines && ignoreHeaderCase == other.ignoreHeaderCase &&
ignoreSurroundingSpaces == other.ignoreSurroundingSpaces && lenientEof == other.lenientEof &&
Objects.equals(nullString, other.nullString) && Objects.equals(quoteCharacter, other.quoteCharacter) &&
quoteMode == other.quoteMode && Objects.equals(quotedNullString, other.quotedNullString) &&
Objects.equals(recordSeparator, other.recordSeparator) && skipHeaderRecord == other.skipHeaderRecord &&
trailingData == other.trailingData && trailingDelimiter == other.trailingDelimiter && trim == other.trim;
}

private void escape(final char c, final Appendable appendable) throws IOException {
Expand Down Expand Up @@ -1808,6 +1857,16 @@ public boolean getIgnoreSurroundingSpaces() {
return ignoreSurroundingSpaces;
}

/**
* Gets whether reading end-of-file is allowed even when the input is malformed, which helps Excel compatibility.
* <p>
* {@link #EXCEL} enables this setting; {@link #DEFAULT} does not.
* </p>
*
* @return {@code true} if reading end-of-file is allowed even when the input is malformed, {@code false} otherwise.
* @since 1.11.0
*/
public boolean getLenientEof() {
return lenientEof;
}

/**
* Gets the String to convert to and from {@code null}.
* <ul>
Expand Down Expand Up @@ -1857,6 +1916,16 @@ public boolean getSkipHeaderRecord() {
return skipHeaderRecord;
}

/**
* Gets whether reading trailing data is allowed in records, which helps Excel compatibility.
* <p>
* {@link #EXCEL} enables this setting; {@link #DEFAULT} does not.
* </p>
*
* @return {@code true} if reading trailing data is allowed in records, {@code false} otherwise.
* @since 1.11.0
*/
public boolean getTrailingData() {
return trailingData;
}

/**
* Gets whether to add a trailing delimiter.
*
Expand All @@ -1881,11 +1950,12 @@ public boolean getTrim() {
public int hashCode() {
final int prime = 31;
int result = 1;
result = prime * result + Arrays.hashCode(headers);
result = prime * result + Arrays.hashCode(headerComments);
return prime * result + Objects.hash(duplicateHeaderMode, allowMissingColumnNames, autoFlush, commentMarker, delimiter, escapeCharacter,
ignoreEmptyLines, ignoreHeaderCase, ignoreSurroundingSpaces, nullString, quoteCharacter, quoteMode, quotedNullString, recordSeparator,
skipHeaderRecord, trailingDelimiter, trim);
result = prime * result + Arrays.hashCode(headers);
result = prime * result + Objects.hash(allowMissingColumnNames, autoFlush, commentMarker, delimiter, duplicateHeaderMode, escapeCharacter,
ignoreEmptyLines, ignoreHeaderCase, ignoreSurroundingSpaces, lenientEof, nullString, quoteCharacter, quoteMode, quotedNullString,
recordSeparator, skipHeaderRecord, trailingData, trailingDelimiter, trim);
return result;
}

/**
Expand Down Expand Up @@ -2006,6 +2076,26 @@ public CSVPrinter print(final File out, final Charset charset) throws IOExceptio
return new CSVPrinter(new OutputStreamWriter(new FileOutputStream(out), charset), this);
}

/**
 * Prints the bytes of {@code inputStream} to {@code out} as one Base64-encoded value.
 * <p>
 * The delimiter string is written first unless the value starts a new record. When a quote character is set, the encoded value is wrapped in it.
 * </p>
 *
 * @param inputStream the bytes to encode; callers never pass {@code null}.
 * @param out         the target to append the encoded value to.
 * @param newRecord   {@code true} if this value is the first of a record, in which case no leading delimiter is written.
 * @throws IOException if appending to {@code out} or copying the stream fails.
 */
private void print(final InputStream inputStream, final Appendable out, final boolean newRecord) throws IOException {
    final boolean needsDelimiter = !newRecord;
    if (needsDelimiter) {
        append(getDelimiterString(), out);
    }
    // With quoting (the default) nothing inside the value needs escaping, so the value is only wrapped in quote characters.
    final boolean quoted = isQuoteCharacterSet();
    final char quote = quoted ? getQuoteCharacter().charValue() : 0;
    if (quoted) {
        append(quote, out);
    }
    // The value is streamed, never read or held in memory as a whole.
    // AppendableOutputStream cannot "close" an Appendable.
    try (OutputStream base64Sink = new Base64OutputStream(new AppendableOutputStream<>(out))) {
        IOUtils.copy(inputStream, base64Sink);
    }
    if (quoted) {
        append(quote, out);
    }
}

/**
* Prints the {@code value} as the next value on the line to {@code out}. The value will be escaped or encapsulated as needed. Useful when one wants to
* avoid creating CSVPrinters. Trims the value if {@link #getTrim()} is true.
Expand Down Expand Up @@ -2081,26 +2171,6 @@ public CSVPrinter print(final Path out, final Charset charset) throws IOExceptio
return print(Files.newBufferedWriter(out, charset));
}

/**
 * Prints the bytes of {@code inputStream} to {@code out} as one Base64-encoded value.
 * <p>
 * The delimiter string is written first unless the value starts a new record. When a quote character is set, the encoded value is wrapped in it.
 * </p>
 *
 * @param inputStream the bytes to encode; callers never pass {@code null}.
 * @param out         the target to append the encoded value to.
 * @param newRecord   {@code true} if this value is the first of a record, in which case no leading delimiter is written.
 * @throws IOException if appending to {@code out} or copying the stream fails.
 */
private void print(final InputStream inputStream, final Appendable out, final boolean newRecord) throws IOException {
    final boolean needsDelimiter = !newRecord;
    if (needsDelimiter) {
        append(getDelimiterString(), out);
    }
    // With quoting (the default) nothing inside the value needs escaping, so the value is only wrapped in quote characters.
    final boolean quoted = isQuoteCharacterSet();
    final char quote = quoted ? getQuoteCharacter().charValue() : 0;
    if (quoted) {
        append(quote, out);
    }
    // The value is streamed, never read or held in memory as a whole.
    // AppendableOutputStream cannot "close" an Appendable.
    try (OutputStream base64Sink = new Base64OutputStream(new AppendableOutputStream<>(out))) {
        IOUtils.copy(inputStream, base64Sink);
    }
    if (quoted) {
        append(quote, out);
    }
}

private void print(final Reader reader, final Appendable out, final boolean newRecord) throws IOException {
// Reader is never null here
if (!newRecord) {
Expand Down
Loading

0 comments on commit c137e80

Please sign in to comment.