Skip to content

Commit

Permalink
Merge pull request #234 from WorksApplications/feature/printdict
Browse files Browse the repository at this point in the history
fix DictionaryPrinter
  • Loading branch information
mh-northlander authored Oct 24, 2024
2 parents 3c1781f + 3bddd27 commit 297aace
Show file tree
Hide file tree
Showing 8 changed files with 421 additions and 79 deletions.
8 changes: 6 additions & 2 deletions docs/user_dict.md
Original file line number Diff line number Diff line change
Expand Up @@ -134,15 +134,19 @@ Sudachiでは、以下の文字正規化を行っています。挙動の詳細
見出し表記の正規化表記を記述します。
「見出し表記=正規化表記」の場合は、見出し表記を記述します。

### 13 辞書形ID
### 13 辞書形情報

活用のある語に対して、その語の辞書形(終止形表記)を指定するための情報です。
ユーザー辞書内の語のみ指定可能です。
語のIDまたは語情報を記述します。

ユーザー辞書ソースの行数(0始まりで何行目か)がIDです。
対象となる語の辞書形のIDを記述します。

よって、辞書形IDの情報をつけた後、ファイル内の行の並びが変わるような変更(ソートや挿入など)は加えないでください。

語情報は語の見出し (解析結果表示用)、品詞1-4、品詞 (活用型)、品詞 (活用形)、読みを "," (カンマ) で区切った文字列です。
語情報を記述するときは辞書形情報のフィールド全体を " (ダブルクォーテーション) で囲む必要があります。

なお、活用のない語については、このフィールドは、"*"(半角アスタリスク)を記入しておいてください。

### 14 分割タイプ
Expand Down
7 changes: 3 additions & 4 deletions src/main/java/com/worksap/nlp/sudachi/SudachiCommandLine.java
Original file line number Diff line number Diff line change
Expand Up @@ -43,16 +43,15 @@
public class SudachiCommandLine {
static Logger logger = Logger.getLogger(SudachiCommandLine.class.getName());

static class FileOrStdoutPrintStream extends PrintStream {

public static class FileOrStdoutPrintStream extends PrintStream {
private boolean isFile;

FileOrStdoutPrintStream() {
public FileOrStdoutPrintStream() {
super(System.out, true);
isFile = false;
}

FileOrStdoutPrintStream(String fileName) throws FileNotFoundException, UnsupportedEncodingException {
public FileOrStdoutPrintStream(String fileName) throws FileNotFoundException, UnsupportedEncodingException {
super(new FileOutputStream(fileName), false, "UTF-8");
isFile = true;
}
Expand Down
227 changes: 171 additions & 56 deletions src/main/java/com/worksap/nlp/sudachi/dictionary/DictionaryPrinter.java
Original file line number Diff line number Diff line change
Expand Up @@ -22,53 +22,149 @@
import java.util.Arrays;
import java.util.List;
import java.util.stream.Collectors;
import java.io.Console;

import com.worksap.nlp.sudachi.WordId;
import com.worksap.nlp.sudachi.SudachiCommandLine.FileOrStdoutPrintStream;

public class DictionaryPrinter {
private final PrintStream output;
private final GrammarImpl grammar;
private final LexiconSet lexicon;
private final List<String> posStrings;
private final boolean isUser;
private final int entrySize;

DictionaryPrinter(PrintStream output, BinaryDictionary dic, BinaryDictionary base) {
if (dic.getDictionaryHeader().isUserDictionary() && base == null) {
throw new IllegalArgumentException("System dictionary is required to print user dictionary");
}

private DictionaryPrinter() {
}
this.output = output;

static void printDictionary(String filename, BinaryDictionary systemDict, PrintStream output) throws IOException {
GrammarImpl grammar = null;
if (base == null) {
isUser = false;
grammar = dic.getGrammar();
lexicon = new LexiconSet(dic.getLexicon(), grammar.getSystemPartOfSpeechSize());
} else {
isUser = true;
grammar = base.getGrammar();
lexicon = new LexiconSet(base.getLexicon(), grammar.getSystemPartOfSpeechSize());

try (BinaryDictionary dictionary = new BinaryDictionary(filename)) {
if (dictionary.getDictionaryHeader().isSystemDictionary()) {
grammar = dictionary.getGrammar();
} else if (systemDict == null) {
throw new IllegalArgumentException("the system dictionary is not specified");
} else {
grammar = systemDict.getGrammar();
if (DictionaryVersion.hasGrammar(dictionary.getDictionaryHeader().getVersion())) {
grammar.addPosList(dictionary.getGrammar());
}
lexicon.add(dic.getLexicon(), (short) grammar.getPartOfSpeechSize());
if (DictionaryVersion.hasGrammar(dic.getDictionaryHeader().getVersion())) {
grammar.addPosList(dic.getGrammar());
}
}

List<String> posStrings = new ArrayList<>();
for (short pid = 0; pid < grammar.getPartOfSpeechSize(); pid++) {
posStrings.add(String.join(",", grammar.getPartOfSpeechString(pid)));
}
List<String> poss = new ArrayList<>();
for (short pid = 0; pid < grammar.getPartOfSpeechSize(); pid++) {
poss.add(String.join(",", grammar.getPartOfSpeechString(pid)));
}
this.posStrings = poss;

this.entrySize = dic.getLexicon().size();
}

/**
 * Prints the command line usage.
 *
 * The text goes to the interactive console when one is attached. When
 * {@link System#console()} returns {@code null} (no console attached to the
 * JVM), the same text is written to standard error instead of failing with a
 * NullPointerException.
 */
private static void printUsage() {
    String usage = "usage: PrintDictionary [-o file] [-s file] file\n" + "\t-o file\toutput to file\n"
            + "\t-s file\tsystem dictionary\n";
    Console console = System.console();
    if (console != null) {
        // pass the text as an argument so it is never interpreted as a format string
        console.printf("%s", usage);
    } else {
        System.err.print(usage);
    }
}

Lexicon lexicon = dictionary.getLexicon();
for (int wordId = 0; wordId < lexicon.size(); wordId++) {
short leftId = lexicon.getLeftId(wordId);
short rightId = lexicon.getRightId(wordId);
short cost = lexicon.getCost(wordId);
WordInfo wordInfo = lexicon.getWordInfo(wordId);

char unitType = getUnitType(wordInfo);

output.println(String.format("%s,%d,%d,%d,%s,%s,%s,%s,%s,%c,%s,%s,%s", wordInfo.getSurface(), leftId,
rightId, cost, wordInfo.getSurface(), posStrings.get(wordInfo.getPOSId()),
wordInfo.getReadingForm(), wordInfo.getNormalizedForm(),
wordIdToString(wordInfo.getDictionaryFormWordId()), unitType,
splitToString(wordInfo.getAunitSplit()), splitToString(wordInfo.getBunitSplit()),
splitToString(wordInfo.getWordStructure())));
/**
 * Prints all entries of the target dictionary in word-id order.
 *
 * Word ids are built with dictionary id 1 when a base (system) dictionary
 * was given to the constructor, and 0 otherwise.
 */
void printEntries() {
    final int dictionaryId = isUser ? 1 : 0;
    int index = 0;
    while (index < entrySize) {
        printEntry(WordId.make(dictionaryId, index));
        index++;
    }
}

/**
 * Writes a single lexicon entry as one comma-separated line.
 *
 * Column order: surface, left id, right id, cost, surface (again), POS,
 * reading form, normalized form, dictionary form reference, unit type,
 * A-unit split, B-unit split, word structure, synonym group ids. The line is
 * terminated with "\n".
 *
 * @param wordId
 *            id of the word to print, as accepted by the lexicon set
 */
private void printEntry(int wordId) {
    // look everything up before the first write
    final short left = lexicon.getLeftId(wordId);
    final short right = lexicon.getRightId(wordId);
    final short wordCost = lexicon.getCost(wordId);
    final WordInfo info = lexicon.getWordInfo(wordId);

    // the escaped surface fills both the first and the fifth column
    final String surface = maybeEscapeString(info.getSurface());

    field(surface);
    field(left);
    field(right);
    field(wordCost);
    field(surface);
    field(posStrings.get(info.getPOSId()));
    field(maybeEscapeString(info.getReadingForm()));
    field(maybeEscapeString(info.getNormalizedForm()));
    field(wordRefToString(info.getDictionaryFormWordId()));
    field(getUnitType(info));
    field(splitToString(info.getAunitSplit()));
    field(splitToString(info.getBunitSplit()));
    field(splitToString(info.getWordStructure()));
    lastField(synonymIdList(info.getSynonymGoupIds()));
    output.print("\n");
}

/** Writes a numeric column followed by the "," separator. */
void field(short value) {
    output.append(Short.toString(value)).append(',');
}

/** Writes a single-character column followed by the "," separator. */
void field(char value) {
    output.append(value).append(',');
}

/** Writes a string column as given (no escaping here) followed by the "," separator. */
void field(String value) {
    output.append(value).append(',');
}

/** Writes the final column of a line; no separator is appended. */
void lastField(String value) {
    output.append(value);
}

/**
 * Formats synonym group ids for the output line.
 *
 * Each id is zero-padded to at least six digits and the ids are joined with
 * "/". Formatting is pinned to {@code Locale.ROOT} so that the digits are
 * always ASCII, regardless of the JVM default locale.
 *
 * @param ints
 *            synonym group ids, must not be null
 * @return "*" when there are no ids, otherwise the "/"-joined list
 */
String synonymIdList(int[] ints) {
    if (ints.length == 0) {
        return "*";
    }
    return Arrays.stream(ints).mapToObj(i -> String.format(java.util.Locale.ROOT, "%06d", i))
            .collect(Collectors.joining("/"));
}

/** Tells whether {@code value} contains the character (code point) {@code ch}. */
private static boolean hasCh(String value, int ch) {
    return value.indexOf(ch) >= 0;
}

/**
 * Escapes a string column for the csv output.
 *
 * A value containing neither a comma nor a double quote is returned
 * unchanged; otherwise every comma and double quote is replaced by its
 * unicode escape.
 */
private String maybeEscapeString(String value) {
    if (hasCh(value, ',') || hasCh(value, '"')) {
        return unicodeEscape(value, Arrays.asList('"', ','));
    }
    return value;
}

/**
 * Replaces every occurrence of the given chars with a unicode escape of the
 * form backslash, 'u', and exactly four lowercase hex digits.
 *
 * The hex value is zero-padded to four digits, so chars below 0x10 (e.g. tab)
 * are escaped correctly as well.
 *
 * @param value
 *            text to escape
 * @param targetChars
 *            chars that must be escaped
 * @return the text with all target chars escaped; other chars are kept as-is
 */
private String unicodeEscape(String value, List<Character> targetChars) {
    StringBuilder sb = new StringBuilder(value.length() + 10);
    int len = value.length();
    for (int i = 0; i < len; ++i) {
        char c = value.charAt(i);
        if (targetChars.contains(c)) {
            String hex = Integer.toHexString(c);
            sb.append("\\u");
            // a char needs at most four hex digits; pad the shorter ones
            for (int pad = hex.length(); pad < 4; pad++) {
                sb.append('0');
            }
            sb.append(hex);
        } else {
            sb.append(c);
        }
    }
    return sb.toString();
}

static String wordIdToString(int wid) {
return (wid < 0) ? "*" : Integer.toString(wid);
/**
 * Formats a reference to another word for the output line.
 *
 * @param wid
 *            id of the referred word; a negative value means "no reference"
 * @return "*" for a negative id, otherwise the word reference wrapped in
 *         double quotes
 */
String wordRefToString(int wid) {
    return (wid < 0) ? "*" : "\"" + wordRef(wid) + "\"";
}

static char getUnitType(WordInfo info) {
Expand All @@ -81,14 +177,20 @@ static char getUnitType(WordInfo info) {
}
}

static String splitToString(int[] split) {
private String splitToString(int[] split) {
if (split.length == 0) {
return "*";
} else {
return Arrays.stream(split)
.mapToObj(i -> (i >> 28 != 0) ? "U" + Integer.toString(i & ((1 << 28) - 1)) : Integer.toString(i))
.collect(Collectors.joining("/"));
}
return "\"" + Arrays.stream(split).mapToObj(this::wordRef).collect(Collectors.joining("/")) + "\"";
}

/**
 * Builds the textual reference to a word: escaped surface, part of speech
 * and escaped reading form, joined with commas.
 *
 * The part of speech is taken from the comma-joined strings cached in
 * {@code posStrings}, the same source used for the POS column of an entry,
 * so the reference does not depend on how the POS object renders itself via
 * {@code toString()}.
 *
 * @param wordId
 *            id of the referred word, looked up in the lexicon set
 * @return the reference in the form "surface,pos,reading"
 */
private String wordRef(int wordId) {
    WordInfo info = lexicon.getWordInfo(wordId);
    String surface = maybeEscapeString(info.getSurface());
    String pos = posStrings.get(info.getPOSId());
    String reading = maybeEscapeString(info.getReadingForm());
    return surface + "," + pos + "," + reading;
}

/**
Expand All @@ -111,25 +213,38 @@ static String splitToString(int[] split) {
* if IO
*/
public static void main(String[] args) throws IOException {
BinaryDictionary systemDict = null;

try {
int i = 0;
for (i = 0; i < args.length; i++) {
if (args[i].equals("-s") && i + 1 < args.length) {
systemDict = BinaryDictionary.loadSystem(args[++i]);
} else if (args[i].equals("-h")) {
System.err.println("usage: PrintDictionary [-s file] file");
System.err.println("\t-s file\tsystem dictionary");
return;
} else {
break;
}
String systemDictPath = null;
String outputFileName = null;

int i = 0;
for (i = 0; i < args.length; i++) {
if (args[i].equals("-h")) {
printUsage();
return;
} else if (args[i].equals("-o") && i + 1 < args.length) {
outputFileName = args[++i];
} else if (args[i].equals("-s") && i + 1 < args.length) {
systemDictPath = args[++i];
} else {
break;
}
}
if (i >= args.length) {
System.console().printf("target dictionary file is missing");
return;
}

if (i < args.length) {
printDictionary(args[i], systemDict, System.out);
String dictPath = args[i];
BinaryDictionary systemDict = null;
try (BinaryDictionary dict = new BinaryDictionary(dictPath);
PrintStream output = outputFileName == null ? new FileOrStdoutPrintStream()
: new FileOrStdoutPrintStream(outputFileName);) {
if (systemDictPath != null) {
systemDict = BinaryDictionary.loadSystem(systemDictPath);
}

DictionaryPrinter printer = new DictionaryPrinter(output, dict, systemDict);
printer.printEntries();
} finally {
if (systemDict != null) {
systemDict.close();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -114,6 +114,7 @@ WordEntry parseLine(List<String> cols) {
POS pos = new POS(cols.get(5), cols.get(6), cols.get(7), cols.get(8), cols.get(9), cols.get(10));
short posId = posTable.getId(pos);

entry.dictionaryFormString = cols.get(13);
entry.aUnitSplitString = cols.get(15);
entry.bUnitSplitString = cols.get(16);
entry.wordStructureString = cols.get(17);
Expand All @@ -131,14 +132,57 @@ WordEntry parseLine(List<String> cols) {

entry.wordInfo = new WordInfo(cols.get(4), // headword
(short) cols.get(0).getBytes(StandardCharsets.UTF_8).length, posId, cols.get(12), // normalizedForm
(cols.get(13).equals("*") ? -1 : Integer.parseInt(cols.get(13))), // dictionaryFormWordId
"", // dummy
-1, "", // dictioanyForm (dummy)
cols.get(11), // readingForm
null, null, null, synonymGids);

return entry;
}

/**
 * Resolves the dictionary form column of a lexicon line to a word id.
 *
 * Accepted inputs:
 * <ul>
 * <li>"*": no dictionary form, yields -1</li>
 * <li>an integer: taken as the word id (line number) and returned as-is</li>
 * <li>anything else: looked up with {@code wordToId} and required to match
 * an entry of this lexicon, i.e. a word outside this lexicon cannot be used
 * as dictionary form</li>
 * </ul>
 *
 * @param str
 *            content of the dictionary form column
 * @return the referred word id, or -1 for "*"
 * @throws IllegalArgumentException
 *             if the word cannot be found, or is not part of this lexicon
 */
int parseDictionaryForm(String str) {
    if (str.equals("*")) {
        return -1;
    }
    try {
        return Integer.parseInt(str);
    } catch (NumberFormatException ignored) {
        // not a plain id; fall through and resolve it as a word reference
    }
    final int resolved = wordToId(str);
    if (resolved < 0) {
        throw new IllegalArgumentException("couldn't find " + str + " in the dictionaries");
    }
    if (wordRefMatches(str, resolved)) {
        return resolved;
    }
    throw new IllegalArgumentException("dictionary form must exist in the same lexicon");
}

/**
 * Checks whether a word reference describes the entry stored under
 * {@code wordId} in this lexicon.
 *
 * The reference is split on "," into 8 columns: headword, six POS columns
 * and reading. Headword and reading are unescaped before comparison.
 *
 * @param ref
 *            word reference string
 * @param wordId
 *            candidate word id
 * @return true if the id is an index into the entries of this lexicon and
 *         headword, POS id and reading all agree with that entry
 */
boolean wordRefMatches(String ref, int wordId) {
    final boolean inRange = 0 <= wordId && wordId < entries.size();
    if (!inRange) {
        return false;
    }
    final WordEntry target = entries.get(wordId);

    final String[] fields = ref.split(",", 8);
    final String refHeadword = unescape(fields[0]);
    final short refPosId = posTable.getId(new POS(Arrays.copyOfRange(fields, 1, 7)));
    final String refReading = unescape(fields[7]);

    if (!refHeadword.equals(target.headword)) {
        return false;
    }
    if (refPosId != target.wordInfo.getPOSId()) {
        return false;
    }
    return refReading.equals(target.wordInfo.getReadingForm());
}

int[] parseSynonymGids(String str) {
if (str.equals("*")) {
return new int[0];
Expand Down Expand Up @@ -246,7 +290,7 @@ public void writeTo(ModelOutput output) throws IOException {
buffer.putLength(wi.getLength());
buffer.putShort(wi.getPOSId());
buffer.putEmptyIfEqual(wi.getNormalizedForm(), wi.getSurface());
buffer.putInt(wi.getDictionaryFormWordId());
buffer.putInt(parseDictionaryForm(entry.dictionaryFormString));
buffer.putEmptyIfEqual(wi.getReadingForm(), wi.getSurface());
buffer.putInts(parseSplitInfo(entry.aUnitSplitString));
buffer.putInts(parseSplitInfo(entry.bUnitSplitString));
Expand Down Expand Up @@ -277,6 +321,7 @@ public void setLimits(int left, int right) {
public static class WordEntry {
String headword;
WordInfo wordInfo;
String dictionaryFormString;
String aUnitSplitString;
String bUnitSplitString;
String wordStructureString;
Expand Down
Loading

0 comments on commit 297aace

Please sign in to comment.