Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix DictionaryPrinter #234

Merged
merged 7 commits into from
Oct 24, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 6 additions & 2 deletions docs/user_dict.md
Original file line number Diff line number Diff line change
Expand Up @@ -134,15 +134,19 @@ Sudachiでは、以下の文字正規化を行っています。挙動の詳細
見出し表記の正規化表記を記述します。
「見出し表記=正規化表記」の場合は、見出し表記を記述します。

### 13 辞書形ID
### 13 辞書形情報

活用のある語に対して、その語の辞書形(終止形表記)を指定するための情報です。
ユーザー辞書内の語のみ指定可能です。
語のIDまたは語情報を記述します。

ユーザー辞書ソースの行数(0始まりで何行目か)がIDです。
対象となる語の辞書形のIDを記述します。

よって、辞書形IDの情報をつけた後、ファイル内の行の並びが変わるような変更(ソートや挿入など)は加えないでください。

語情報は語の見出し (解析結果表示用)、品詞1-4、品詞 (活用型)、品詞 (活用形)、読みを "," (カンマ) で区切った文字列です。
語情報を記述するときは辞書形情報のフィールド全体を " (ダブルクォーテーション) で囲む必要があります。

なお、活用のない語については、このフィールドは、"*"(半角アスタリスク)を記入しておいてください。

### 14 分割タイプ
Expand Down
7 changes: 3 additions & 4 deletions src/main/java/com/worksap/nlp/sudachi/SudachiCommandLine.java
Original file line number Diff line number Diff line change
Expand Up @@ -43,16 +43,15 @@
public class SudachiCommandLine {
static Logger logger = Logger.getLogger(SudachiCommandLine.class.getName());

static class FileOrStdoutPrintStream extends PrintStream {

public static class FileOrStdoutPrintStream extends PrintStream {
private boolean isFile;

FileOrStdoutPrintStream() {
public FileOrStdoutPrintStream() {
super(System.out, true);
isFile = false;
}

FileOrStdoutPrintStream(String fileName) throws FileNotFoundException, UnsupportedEncodingException {
public FileOrStdoutPrintStream(String fileName) throws FileNotFoundException, UnsupportedEncodingException {
super(new FileOutputStream(fileName), false, "UTF-8");
isFile = true;
}
Expand Down
227 changes: 171 additions & 56 deletions src/main/java/com/worksap/nlp/sudachi/dictionary/DictionaryPrinter.java
Original file line number Diff line number Diff line change
Expand Up @@ -22,53 +22,149 @@
import java.util.Arrays;
import java.util.List;
import java.util.stream.Collectors;
import java.io.Console;

import com.worksap.nlp.sudachi.WordId;
import com.worksap.nlp.sudachi.SudachiCommandLine.FileOrStdoutPrintStream;

public class DictionaryPrinter {
private final PrintStream output;
private final GrammarImpl grammar;
private final LexiconSet lexicon;
private final List<String> posStrings;
private final boolean isUser;
private final int entrySize;

DictionaryPrinter(PrintStream output, BinaryDictionary dic, BinaryDictionary base) {
if (dic.getDictionaryHeader().isUserDictionary() && base == null) {
throw new IllegalArgumentException("System dictionary is required to print user dictionary");
}

private DictionaryPrinter() {
}
this.output = output;

static void printDictionary(String filename, BinaryDictionary systemDict, PrintStream output) throws IOException {
GrammarImpl grammar = null;
if (base == null) {
isUser = false;
grammar = dic.getGrammar();
lexicon = new LexiconSet(dic.getLexicon(), grammar.getSystemPartOfSpeechSize());
} else {
isUser = true;
grammar = base.getGrammar();
lexicon = new LexiconSet(base.getLexicon(), grammar.getSystemPartOfSpeechSize());

try (BinaryDictionary dictionary = new BinaryDictionary(filename)) {
if (dictionary.getDictionaryHeader().isSystemDictionary()) {
grammar = dictionary.getGrammar();
} else if (systemDict == null) {
throw new IllegalArgumentException("the system dictionary is not specified");
} else {
grammar = systemDict.getGrammar();
if (DictionaryVersion.hasGrammar(dictionary.getDictionaryHeader().getVersion())) {
grammar.addPosList(dictionary.getGrammar());
}
lexicon.add(dic.getLexicon(), (short) grammar.getPartOfSpeechSize());
if (DictionaryVersion.hasGrammar(dic.getDictionaryHeader().getVersion())) {
grammar.addPosList(dic.getGrammar());
}
}

List<String> posStrings = new ArrayList<>();
for (short pid = 0; pid < grammar.getPartOfSpeechSize(); pid++) {
posStrings.add(String.join(",", grammar.getPartOfSpeechString(pid)));
}
List<String> poss = new ArrayList<>();
for (short pid = 0; pid < grammar.getPartOfSpeechSize(); pid++) {
poss.add(String.join(",", grammar.getPartOfSpeechString(pid)));
}
this.posStrings = poss;

this.entrySize = dic.getLexicon().size();
}

/**
 * Prints the command line usage of this tool.
 *
 * {@link System#console()} returns {@code null} when the JVM has no
 * interactive console (for example when its streams are redirected), so
 * the text is written to standard error in that case instead of failing
 * with a {@link NullPointerException}.
 */
private static void printUsage() {
    String usage = "usage: PrintDictionary [-o file] [-s file] file\n" + "\t-o file\toutput to file\n"
            + "\t-s file\tsystem dictionary\n";
    Console console = System.console();
    if (console != null) {
        // pass the text as an argument so it is never interpreted as a format string
        console.printf("%s", usage);
    } else {
        System.err.print(usage);
    }
}

Lexicon lexicon = dictionary.getLexicon();
for (int wordId = 0; wordId < lexicon.size(); wordId++) {
short leftId = lexicon.getLeftId(wordId);
short rightId = lexicon.getRightId(wordId);
short cost = lexicon.getCost(wordId);
WordInfo wordInfo = lexicon.getWordInfo(wordId);

char unitType = getUnitType(wordInfo);

output.println(String.format("%s,%d,%d,%d,%s,%s,%s,%s,%s,%c,%s,%s,%s", wordInfo.getSurface(), leftId,
rightId, cost, wordInfo.getSurface(), posStrings.get(wordInfo.getPOSId()),
wordInfo.getReadingForm(), wordInfo.getNormalizedForm(),
wordIdToString(wordInfo.getDictionaryFormWordId()), unitType,
splitToString(wordInfo.getAunitSplit()), splitToString(wordInfo.getBunitSplit()),
splitToString(wordInfo.getWordStructure())));
/**
 * Prints all entries of the target dictionary, one line per entry.
 *
 * Word ids are built with dictionary id 1 when the target is a user
 * dictionary and 0 otherwise, combined with the running index from 0 up to
 * (but excluding) {@code entrySize}.
 */
void printEntries() {
    final int dictionaryId;
    if (isUser) {
        dictionaryId = 1;
    } else {
        dictionaryId = 0;
    }
    int index = 0;
    while (index < entrySize) {
        printEntry(WordId.make(dictionaryId, index));
        index++;
    }
}

/**
 * Writes a single lexicon entry as one comma separated line.
 *
 * Column order: surface, left id, right id, cost, surface (again), part of
 * speech, reading form, normalized form, dictionary form reference, unit
 * type, A-unit split, B-unit split, word structure, synonym group ids. The
 * line is terminated with a line feed.
 *
 * @param wordId
 *            id of the word to print, as understood by {@code lexicon}
 */
private void printEntry(int wordId) {
    // read all connection parameters before anything is written
    final short left = lexicon.getLeftId(wordId);
    final short right = lexicon.getRightId(wordId);
    final short wordCost = lexicon.getCost(wordId);
    final WordInfo info = lexicon.getWordInfo(wordId);

    // the escaped surface is emitted twice: as the first and as the fifth column
    final String surface = maybeEscapeString(info.getSurface());
    field(surface);
    field(left);
    field(right);
    field(wordCost);
    field(surface);
    field(posStrings.get(info.getPOSId()));
    field(maybeEscapeString(info.getReadingForm()));
    field(maybeEscapeString(info.getNormalizedForm()));
    field(wordRefToString(info.getDictionaryFormWordId()));
    field(getUnitType(info));
    field(splitToString(info.getAunitSplit()));
    field(splitToString(info.getBunitSplit()));
    field(splitToString(info.getWordStructure()));
    lastField(synonymIdList(info.getSynonymGoupIds()));
    output.print("\n");
}

/** Writes a numeric column followed by the column separator. */
void field(short value) {
    output.append(String.valueOf(value)).append(',');
}

/** Writes a single character column followed by the column separator. */
void field(char value) {
    output.append(value).append(',');
}

/**
 * Writes a text column followed by the column separator. The value is
 * written as is; callers escape it beforehand where needed.
 */
void field(String value) {
    output.append(value).append(',');
}

/** Writes the final column of a line; no separator is appended. */
void lastField(String value) {
    output.append(value);
}

/**
 * Formats synonym group ids for output.
 *
 * @param ints
 *            synonym group ids
 * @return {@code "*"} for an empty array, otherwise the ids zero padded to
 *         at least six digits and joined with {@code "/"}
 */
String synonymIdList(int[] ints) {
    if (ints.length > 0) {
        return Arrays.stream(ints).mapToObj(id -> String.format("%06d", id)).collect(Collectors.joining("/"));
    }
    return "*";
}

/** Tells whether {@code value} contains the character (code point) {@code ch}. */
private static boolean hasCh(String value, int ch) {
    return value.indexOf(ch) >= 0;
}

/**
 * Escapes a text column for CSV output.
 *
 * Values without a comma or a double quote are returned unchanged;
 * otherwise both characters are replaced by their unicode escapes.
 */
private String maybeEscapeString(String value) {
    if (hasCh(value, ',') || hasCh(value, '"')) {
        return unicodeEscape(value, Arrays.asList('"', ','));
    }
    return value;
}

/**
 * Replaces every occurrence of the given characters with a unicode escape
 * (a backslash, the letter u and four lowercase hex digits).
 *
 * The escape is always padded to four digits, so characters below 0x10
 * (for example a tab) and characters above 0xFF also yield a well-formed
 * escape.
 *
 * @param value
 *            text to escape
 * @param targetChars
 *            characters to be replaced
 * @return the escaped text
 */
private String unicodeEscape(String value, List<Character> targetChars) {
    StringBuilder sb = new StringBuilder(value.length() + 10);
    for (int i = 0; i < value.length(); ++i) {
        char c = value.charAt(i);
        if (targetChars.contains(c)) {
            // fixed width of four hex digits keeps the escape unambiguous
            sb.append(String.format("\\u%04x", (int) c));
        } else {
            sb.append(c);
        }
    }
    return sb.toString();
}

static String wordIdToString(int wid) {
return (wid < 0) ? "*" : Integer.toString(wid);
/**
 * Formats the dictionary form column.
 *
 * @param wid
 *            word id of the dictionary form; a negative value means none
 * @return {@code "*"} for a negative id, otherwise the word reference
 *         enclosed in double quotes
 */
String wordRefToString(int wid) {
    return (wid < 0) ? "*" : "\"" + wordRef(wid) + "\"";
}

static char getUnitType(WordInfo info) {
Expand All @@ -81,14 +177,20 @@ static char getUnitType(WordInfo info) {
}
}

static String splitToString(int[] split) {
private String splitToString(int[] split) {
if (split.length == 0) {
return "*";
} else {
return Arrays.stream(split)
.mapToObj(i -> (i >> 28 != 0) ? "U" + Integer.toString(i & ((1 << 28) - 1)) : Integer.toString(i))
.collect(Collectors.joining("/"));
}
return "\"" + Arrays.stream(split).mapToObj(this::wordRef).collect(Collectors.joining("/")) + "\"";
}

/**
 * Builds the textual reference to a word: its escaped surface, the string
 * form of its part of speech and its escaped reading form, separated by
 * commas.
 *
 * @param wordId
 *            id of the referenced word, looked up in {@code lexicon}
 */
private String wordRef(int wordId) {
    WordInfo target = lexicon.getWordInfo(wordId);
    String headword = maybeEscapeString(target.getSurface());
    String partOfSpeech = grammar.getPartOfSpeechString(target.getPOSId()).toString();
    String readingForm = maybeEscapeString(target.getReadingForm());
    return String.format("%s,%s,%s", headword, partOfSpeech, readingForm);
}

/**
Expand All @@ -111,25 +213,38 @@ static String splitToString(int[] split) {
* if IO
*/
public static void main(String[] args) throws IOException {
BinaryDictionary systemDict = null;

try {
int i = 0;
for (i = 0; i < args.length; i++) {
if (args[i].equals("-s") && i + 1 < args.length) {
systemDict = BinaryDictionary.loadSystem(args[++i]);
} else if (args[i].equals("-h")) {
System.err.println("usage: PrintDictionary [-s file] file");
System.err.println("\t-s file\tsystem dictionary");
return;
} else {
break;
}
String systemDictPath = null;
String outputFileName = null;

int i = 0;
for (i = 0; i < args.length; i++) {
if (args[i].equals("-h")) {
printUsage();
return;
} else if (args[i].equals("-o") && i + 1 < args.length) {
outputFileName = args[++i];
} else if (args[i].equals("-s") && i + 1 < args.length) {
systemDictPath = args[++i];
} else {
break;
}
}
if (i >= args.length) {
System.console().printf("target dictionary file is missing");
return;
}

if (i < args.length) {
printDictionary(args[i], systemDict, System.out);
String dictPath = args[i];
BinaryDictionary systemDict = null;
try (BinaryDictionary dict = new BinaryDictionary(dictPath);
PrintStream output = outputFileName == null ? new FileOrStdoutPrintStream()
: new FileOrStdoutPrintStream(outputFileName);) {
if (systemDictPath != null) {
systemDict = BinaryDictionary.loadSystem(systemDictPath);
}

DictionaryPrinter printer = new DictionaryPrinter(output, dict, systemDict);
printer.printEntries();
} finally {
if (systemDict != null) {
systemDict.close();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -114,6 +114,7 @@ WordEntry parseLine(List<String> cols) {
POS pos = new POS(cols.get(5), cols.get(6), cols.get(7), cols.get(8), cols.get(9), cols.get(10));
short posId = posTable.getId(pos);

entry.dictionaryFormString = cols.get(13);
entry.aUnitSplitString = cols.get(15);
entry.bUnitSplitString = cols.get(16);
entry.wordStructureString = cols.get(17);
Expand All @@ -131,14 +132,57 @@ WordEntry parseLine(List<String> cols) {

entry.wordInfo = new WordInfo(cols.get(4), // headword
(short) cols.get(0).getBytes(StandardCharsets.UTF_8).length, posId, cols.get(12), // normalizedForm
(cols.get(13).equals("*") ? -1 : Integer.parseInt(cols.get(13))), // dictionaryFormWordId
"", // dummy
-1, "", // dictioanyForm (dummy)
cols.get(11), // readingForm
null, null, null, synonymGids);

return entry;
}

/**
 * Resolves the dictionary_form column of a lexicon row to a word id.
 *
 * The column may hold {@code "*"} (no dictionary form, yields -1), a
 * plain integer id (line number), or a word reference. A word reference
 * must resolve to a word of this lexicon; a user word cannot use a system
 * word as its dictionary form.
 *
 * @param str
 *            content of the dictionary_form column
 * @return the referenced word id, or -1 for {@code "*"}
 * @throws IllegalArgumentException
 *             if the reference cannot be resolved or does not match a word
 *             in this lexicon
 */
int parseDictionaryForm(String str) {
    if (str.equals("*")) {
        return -1;
    }
    try {
        return Integer.parseInt(str);
    } catch (NumberFormatException ignored) {
        // not a plain id: fall through and resolve it as a word reference
    }
    final int resolved = wordToId(str);
    if (resolved < 0) {
        throw new IllegalArgumentException("couldn't find " + str + " in the dictionaries");
    }
    if (wordRefMatches(str, resolved)) {
        return resolved;
    }
    throw new IllegalArgumentException("dictionary form must exist in the same lexicon");
}

/**
 * Checks whether a word reference matches the word stored in this lexicon
 * under the given id.
 *
 * The reference is split on commas into eight columns: headword, six part
 * of speech elements and reading. It matches when headword, part of
 * speech id and reading all equal those of the stored entry.
 *
 * @param ref
 *            word reference text
 * @param wordId
 *            index into {@code entries}
 * @return {@code true} on a match; {@code false} when the id is out of
 *         range, the reference has fewer than eight columns, or any
 *         compared value differs
 */
boolean wordRefMatches(String ref, int wordId) {
    if (wordId < 0 || entries.size() <= wordId) {
        return false;
    }

    String[] cols = ref.split(",", 8);
    if (cols.length < 8) {
        // a truncated reference cannot match; avoid indexing past the array
        return false;
    }
    WordEntry entry = entries.get(wordId);

    String headword = unescape(cols[0]);
    POS pos = new POS(Arrays.copyOfRange(cols, 1, 7));
    short posId = posTable.getId(pos);
    String reading = unescape(cols[7]);

    return headword.equals(entry.headword) && posId == entry.wordInfo.getPOSId()
            && reading.equals(entry.wordInfo.getReadingForm());
}

int[] parseSynonymGids(String str) {
if (str.equals("*")) {
return new int[0];
Expand Down Expand Up @@ -246,7 +290,7 @@ public void writeTo(ModelOutput output) throws IOException {
buffer.putLength(wi.getLength());
buffer.putShort(wi.getPOSId());
buffer.putEmptyIfEqual(wi.getNormalizedForm(), wi.getSurface());
buffer.putInt(wi.getDictionaryFormWordId());
buffer.putInt(parseDictionaryForm(entry.dictionaryFormString));
buffer.putEmptyIfEqual(wi.getReadingForm(), wi.getSurface());
buffer.putInts(parseSplitInfo(entry.aUnitSplitString));
buffer.putInts(parseSplitInfo(entry.bUnitSplitString));
Expand Down Expand Up @@ -277,6 +321,7 @@ public void setLimits(int left, int right) {
public static class WordEntry {
String headword;
WordInfo wordInfo;
String dictionaryFormString;
String aUnitSplitString;
String bUnitSplitString;
String wordStructureString;
Expand Down
Loading
Loading