Skip to content

Commit

Permalink
[INLONG-10768][Sort] Csv utils support specified the max split field …
Browse files Browse the repository at this point in the history
…size (#10769)
  • Loading branch information
vernedeng authored Aug 11, 2024
1 parent d56f47e commit e50f45b
Show file tree
Hide file tree
Showing 2 changed files with 108 additions and 2 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -387,6 +387,19 @@ public static String[][] splitCsv(
return splitCsv(text, delimiter, escapeChar, quoteChar, lineDelimiter, false);
}

/**
 * Splits the csv text without restricting how many fields a single line may produce.
 *
 * <p>This overload simply forwards to the seven-argument variant, passing {@code null}
 * as the max field size.
 *
 * @see StringUtils#splitCsv(String, Character, Character, Character, Character, boolean, Integer)
 */
public static String[][] splitCsv(
        @Nonnull String text,
        @Nonnull Character delimiter,
        @Nullable Character escapeChar,
        @Nullable Character quoteChar,
        @Nullable Character lineDelimiter,
        boolean deleteHeadDelimiter) {
    // A null max field size disables the per-line field cap in the full overload.
    final Integer unlimitedFieldSize = null;
    return splitCsv(
            text,
            delimiter,
            escapeChar,
            quoteChar,
            lineDelimiter,
            deleteHeadDelimiter,
            unlimitedFieldSize);
}

/**
* Splits the csv text, which may contains multiple lines of data.
*
Expand All @@ -402,6 +415,7 @@ public static String[][] splitCsv(
* @param lineDelimiter The delimiter between lines, e.g. '\n'.
* @param deleteHeadDelimiter If true and the leading character of a line
* is a delimiter, it will be ignored.
* @param maxFieldSize The max field size of one single line
* @return A 2-D String array representing the parsed data, where the 1st
* dimension is row and the 2nd dimension is column.
*/
Expand All @@ -411,9 +425,16 @@ public static String[][] splitCsv(
@Nullable Character escapeChar,
@Nullable Character quoteChar,
@Nullable Character lineDelimiter,
boolean deleteHeadDelimiter) {
boolean deleteHeadDelimiter,
@Nullable Integer maxFieldSize) {
if (maxFieldSize != null && maxFieldSize <= 0) {
return new String[0][];
}

List<String[]> lines = new ArrayList<>();
List<String> fields = new ArrayList<>();
int splittedSize = 0;
int lastFieldStartIndex = 0;

StringBuilder stringBuilder = new StringBuilder();
int state = STATE_NORMAL;
Expand All @@ -431,6 +452,14 @@ public static String[][] splitCsv(
String field = stringBuilder.toString();
fields.add(field);
stringBuilder.setLength(0);

splittedSize++;
// if the next field is the last one allowed, mark the last field start index
if (maxFieldSize != null && splittedSize == maxFieldSize - 1) {
if (i + 1 < text.length()) {
lastFieldStartIndex = i + 1;
}
}
break;
case STATE_ESCAPING:
stringBuilder.append(ch);
Expand Down Expand Up @@ -471,10 +500,19 @@ public static String[][] splitCsv(
case STATE_NORMAL:
String field = stringBuilder.toString();
fields.add(field);
lines.add(fields.toArray(new String[0]));

// if the max field size < the real field size,
// remove the extra fields and copy the latest field from lastFieldStartIndex to current index
if (maxFieldSize != null && fields.size() > maxFieldSize) {
fields = replaceLastField(fields, maxFieldSize, text, lastFieldStartIndex, i);
}
// reset the lastFieldStartIndex for new line
lastFieldStartIndex = i + 1;

lines.add(fields.toArray(new String[0]));
stringBuilder.setLength(0);
fields.clear();
splittedSize = 0;
break;
case STATE_ESCAPING:
stringBuilder.append(ch);
Expand All @@ -498,6 +536,11 @@ public static String[][] splitCsv(
case STATE_QUOTING:
String field = stringBuilder.toString();
fields.add(field);

if (maxFieldSize != null && fields.size() > maxFieldSize) {
fields = replaceLastField(fields, maxFieldSize, text, lastFieldStartIndex, text.length());
}

lines.add(fields.toArray(new String[0]));

String[][] result = new String[lines.size()][];
Expand All @@ -510,6 +553,28 @@ public static String[][] splitCsv(
}
}

/**
 * Truncates a row that has more fields than allowed down to {@code maxFieldSize} fields.
 *
 * <p>The first {@code maxFieldSize - 1} entries of {@code fields} are kept as they are. The
 * final entry is the raw slice of {@code text} between {@code lastFieldStartIndex}
 * (inclusive) and {@code lastFieldEndIndex} (exclusive), so it is copied verbatim from the
 * original text rather than rebuilt from the parsed fields.
 *
 * <p>The returned list is an independent copy: {@code fields} is never modified and the
 * result does not share storage with it.
 *
 * @param fields Target field list
 * @param maxFieldSize Specified max fieldSize
 * @param text Origin text
 * @param lastFieldStartIndex Start index (inclusive) of last field
 * @param lastFieldEndIndex End index (exclusive) of last field
 * @return A new mutable list holding exactly {@code maxFieldSize} fields
 */
private static List<String> replaceLastField(
        List<String> fields,
        int maxFieldSize,
        String text,
        int lastFieldStartIndex,
        int lastFieldEndIndex) {
    // Copy the prefix instead of returning the subList view itself: adding to a view
    // inserts into the caller's backing list, and reusing the view as the working list
    // stacks a new view on top of the previous one for every over-long line.
    List<String> truncated = new ArrayList<>(fields.subList(0, maxFieldSize - 1));
    truncated.add(text.substring(lastFieldStartIndex, lastFieldEndIndex));
    return truncated;
}

/**
* Concat the given fields.
*
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -112,4 +112,45 @@ public void testSplitCsvString() {
assertEquals("home", csv1Array2[2][1]);
assertEquals("home", csv1Array2[2][2]);
}

@Test
public void testSplitCsvStringWithMaxFields() {
    final String input = "name|age=20\\||&'\n\name|age=20\\||&'\n\n|home|\\home\\";

    // A limit of zero short-circuits to an empty result.
    String[][] noRows = StringUtils.splitCsv(input, '|', '\\', '\'', '\n', false, 0);
    assertEquals(0, noRows.length);

    // A limit of one keeps every line as a single raw field.
    String[][] oneField = StringUtils.splitCsv(input, '|', '\\', '\'', '\n', false, 1);
    assertEquals("name|age=20\\||&'\n\name|age=20\\||&'", oneField[0][0]);
    assertEquals("", oneField[1][0]);
    assertEquals("|home|\\home\\", oneField[2][0]);

    // A limit of two parses the first field and leaves the remainder of the line raw.
    String[][] twoFields = StringUtils.splitCsv(input, '|', '\\', '\'', '\n', false, 2);
    assertEquals("name", twoFields[0][0]);
    assertEquals("age=20\\||&'\n\name|age=20\\||&'", twoFields[0][1]);
    assertEquals("", twoFields[1][0]);
    assertEquals("", twoFields[2][0]);
    assertEquals("home|\\home\\", twoFields[2][1]);

    // A limit of three: every asserted field is the fully parsed value.
    String[][] threeFields = StringUtils.splitCsv(input, '|', '\\', '\'', '\n', false, 3);
    assertEquals("name", threeFields[0][0]);
    assertEquals("age=20|", threeFields[0][1]);
    assertEquals("&\n\name|age=20\\||&", threeFields[0][2]);
    assertEquals("", threeFields[2][0]);
    assertEquals("home", threeFields[2][1]);
    assertEquals("home", threeFields[2][2]);

    // A limit of four: the same expectations as with a limit of three.
    String[][] fourFields = StringUtils.splitCsv(input, '|', '\\', '\'', '\n', false, 4);
    assertEquals("name", fourFields[0][0]);
    assertEquals("age=20|", fourFields[0][1]);
    assertEquals("&\n\name|age=20\\||&", fourFields[0][2]);
    assertEquals("", fourFields[2][0]);
    assertEquals("home", fourFields[2][1]);
    assertEquals("home", fourFields[2][2]);
}
}

0 comments on commit e50f45b

Please sign in to comment.