Skip to content

Commit

Permalink
Let the lexicon CSV have either (or both) of the pos-id and pos-parts columns.
Browse files Browse the repository at this point in the history
  • Loading branch information
mh-northlander committed Aug 2, 2024
1 parent 1f3f350 commit 4c5b3f0
Showing 1 changed file with 68 additions and 23 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -35,10 +35,10 @@ public class RawLexiconReader {
* reordered with respect to the header.
*/
public enum Column {
Surface(true), LeftId(true), RightId(true), Cost(true), Writing(false), Pos1(true), Pos2(true), Pos3(
true), Pos4(true), Pos5(true), Pos6(true), ReadingForm(true), NormalizedForm(true), DictionaryForm(
true), Mode(false), SplitA(true), SplitB(
true), WordStructure(true), SynonymGroups(false), SplitC(false), UserData(false);
Surface(true), LeftId(true), RightId(true), Cost(true), Writing(false), Pos1(false), Pos2(false), Pos3(
false), Pos4(false), Pos5(false), Pos6(false), ReadingForm(true), NormalizedForm(true), DictionaryForm(
true), Mode(false), SplitA(true), SplitB(true), WordStructure(
true), SynonymGroups(false), SplitC(false), UserData(false), PosId(false);

private final boolean required;

Expand All @@ -54,6 +54,8 @@ public enum Column {
private final WordRef.Parser normRefParser; // for normalized form
private final WordRef.Parser dictRefParser; // for dictionary form
private final WordRef.Parser splitParser; // for splits
private boolean posIdExists = false;
private boolean posStrExists = true;

public RawLexiconReader(CSVParser parser, POSTable pos, boolean user) throws IOException {
this.parser = parser;
Expand Down Expand Up @@ -93,7 +95,7 @@ private void resolveColumnLayout() throws IOException {

outer: for (int fieldId = 0; fieldId < record.size(); ++fieldId) {
String field = record.get(fieldId).replaceAll("_", "");
for (int colId = 0; colId < record.size(); ++colId) {
for (int colId = 0; colId < remaining.size(); ++colId) {
Column col = remaining.get(colId);
if (col.name().equalsIgnoreCase(field)) {
mapping[col.ordinal()] = fieldId;
Expand All @@ -113,6 +115,20 @@ private void resolveColumnLayout() throws IOException {
}
}

this.posIdExists = mapping[Column.PosId.ordinal()] >= 0;
long numPosColumnsFound = Arrays
.asList(Column.Pos1, Column.Pos2, Column.Pos3, Column.Pos4, Column.Pos5, Column.Pos6).stream()
.filter(c -> mapping[c.ordinal()] >= 0).count();
if (numPosColumnsFound != 0 && numPosColumnsFound != POS.DEPTH) {
throw new CsvFieldException(parser.getName(), 0, "POS",
new IllegalArgumentException("Pos1 ~ Pos6 columns must appear as a set."));
}
this.posStrExists = numPosColumnsFound == POS.DEPTH;
if (!posIdExists && !posStrExists) {
throw new CsvFieldException(parser.getName(), 0, "POS",
new IllegalArgumentException("Both or either PosId column or Pos1~Pos6 columns are required."));
}

this.mapping = mapping;
}

Expand Down Expand Up @@ -198,14 +214,54 @@ private List<WordRef> getWordRefs(List<String> data, Column column, WordRef.Pars
return result;
}

/** parse specified column as WordRef. */
private WordRef getWordRef(List<String> data, Column column, WordRef.Parser refParser) {
/**
 * Parses the given column of a row as a WordRef and checks it for self-reference.
 *
 * @param data fields of the current row
 * @param column column whose value is parsed
 * @param refParser parser that turns the column value into a WordRef
 * @param entry entry being built from this row; its headword, posId and reading are
 *        compared against the parsed reference
 * @return the parsed reference, or null when it appears to refer to the current entry
 * @throws CsvFieldException if refParser rejects the value with an IllegalArgumentException
 */
private WordRef getWordRef(List<String> data, Column column, WordRef.Parser refParser, RawWordEntry entry) {
String value = get(data, column, false);
WordRef ref;
try {
// NOTE(review): this view seems to interleave pre- and post-change diff lines; the bare
// return below looks like the removed line and would bypass the self-reference check.
// Confirm against the actual file.
return refParser.parse(value);
ref = refParser.parse(value);
} catch (IllegalArgumentException e) {
// rethrow with the csv name, row and column attached
throw new CsvFieldException(parser.getName(), parser.getRow(), column.name(), e);
}

// If the parsed ref appears to refer to the current entry, return a self-reference (null),
// because a headword/triple ref might otherwise be resolved to a different entry.
if (ref instanceof WordRef.Headword) {
WordRef.Headword headword = (WordRef.Headword) ref;
if (headword.getHeadword().equals(entry.headword)) {
return null;
}
} else if (ref instanceof WordRef.Triple) {
WordRef.Triple triple = (WordRef.Triple) ref;
// a triple is a self-reference only when headword, POS id and reading all match
if (triple.getHeadword().equals(entry.headword) && triple.getPosId() == entry.posId
&& triple.getReading().equals(entry.reading)) {
return null;
}
}
return ref;
}

/**
 * Determines the POS id of a single CSV row.
 *
 * <p>The id is read from the PosId column when {@code posIdExists} is set, and obtained
 * from {@code posTable} for the Pos1 ~ Pos6 strings when {@code posStrExists} is set.
 * When both are available the two ids must be equal, and the PosId value is returned.
 *
 * @param data fields of the current row
 * @return the POS id of the row, or -1 when neither source is available
 * @throws CsvFieldException if both sources are available and their ids differ
 */
private short getPos(List<String> data) {
    final boolean hasIdColumn = this.posIdExists;
    final boolean hasPartColumns = this.posStrExists;

    short idFromColumn = -1;
    if (hasIdColumn) {
        idFromColumn = getShort(data, Column.PosId);
    }
    if (!hasPartColumns) {
        // no Pos1 ~ Pos6 columns, so there is nothing to cross-check against
        return idFromColumn;
    }

    // read the six POS parts in column order, then ask the table for their id
    String pos1 = get(data, Column.Pos1, true);
    String pos2 = get(data, Column.Pos2, true);
    String pos3 = get(data, Column.Pos3, true);
    String pos4 = get(data, Column.Pos4, true);
    String pos5 = get(data, Column.Pos5, true);
    String pos6 = get(data, Column.Pos6, true);
    short idFromParts = posTable.getId(new POS(pos1, pos2, pos3, pos4, pos5, pos6));

    if (!hasIdColumn) {
        return idFromParts;
    }

    // both sources are present: they have to agree
    if (idFromColumn != idFromParts) {
        throw new CsvFieldException(parser.getName(), parser.getRow(), "POS", new IllegalArgumentException(
                String.format("PosId (%d) and id from Pos1-6 (%d) does not match.", idFromColumn, idFromParts)));
    }
    return idFromColumn;
}

/** convert csv row to RawWordEntry */
Expand All @@ -218,22 +274,11 @@ private RawWordEntry convertEntry(List<String> data) {
entry.cost = getShort(data, Column.Cost);

entry.reading = get(data, Column.ReadingForm, true);
WordRef normalizedForm = getWordRef(data, Column.NormalizedForm, normRefParser);
if (normalizedForm instanceof WordRef.Headword
&& ((WordRef.Headword) normalizedForm).getHeadword().equals(entry.headword)) {
// mark as self-reference (headword ref may point different entry)
entry.normalizedForm = null;
} else {
entry.normalizedForm = normalizedForm;
}
entry.dictionaryForm = getWordRef(data, Column.DictionaryForm, dictRefParser);

POS pos = new POS(
// comment for line break
get(data, Column.Pos1, true), get(data, Column.Pos2, true), get(data, Column.Pos3, true),
get(data, Column.Pos4, true), get(data, Column.Pos5, true), get(data, Column.Pos6, true));
entry.posId = getPos(data);

entry.posId = posTable.getId(pos);
// headword, pos, reading must be parsed before these.
entry.normalizedForm = getWordRef(data, Column.NormalizedForm, normRefParser, entry);
entry.dictionaryForm = getWordRef(data, Column.DictionaryForm, dictRefParser, entry);

entry.mode = get(data, Column.Mode, false);
entry.aUnitSplit = getWordRefs(data, Column.SplitA, splitParser);
Expand Down

0 comments on commit 4c5b3f0

Please sign in to comment.