Merge pull request #234 from WorksApplications/feature/printdict

fix DictionaryPrinter
WorksApplications · Oct 24, 2024 · 297aace · 297aace
2 parents 3c1781f + 3bddd27
commit 297aace
Show file tree

Hide file tree

Showing 8 changed files with 421 additions and 79 deletions.
diff --git a/docs/user_dict.md b/docs/user_dict.md
@@ -134,15 +134,19 @@ Sudachiでは、以下の文字正規化を行っています。挙動の詳細
 見出し表記の正規化表記を記述します。
 「見出し表記＝正規化表記」の場合は、見出し表記を記述します。
 
-### 13 辞書形ID
+### 13 辞書形情報
 
 活用のある語に対して、その語の辞書形（終止形表記）を指定するための情報です。
+ユーザー辞書内の語のみ指定可能です。
+語のIDまたは語情報を記述します。
 
 ユーザー辞書ソースの行数（0始まりで何行目か）がIDです。
 対象となる語の辞書形のIDを記述します。
-
 よって、辞書形IDの情報をつけた後、ファイル内の行の並びが変わるような変更（ソートや挿入など）は加えないでください。
 
+語情報は語の見出し (解析結果表示用)、品詞1-4、品詞 (活用型)、品詞 (活用形)、読みを "," (カンマ) で区切った文字列です。
+語情報を記述するときは辞書形情報のフィールド全体を " (ダブルクォーテーション) で囲む必要があります。
+
 なお、活用のない語については、このフィールドは、"*"（半角アスタリスク）を記入しておいてください。
 
 ### 14 分割タイプ

diff --git a/src/main/java/com/worksap/nlp/sudachi/SudachiCommandLine.java b/src/main/java/com/worksap/nlp/sudachi/SudachiCommandLine.java
@@ -43,16 +43,15 @@
 public class SudachiCommandLine {
     static Logger logger = Logger.getLogger(SudachiCommandLine.class.getName());
 
-    static class FileOrStdoutPrintStream extends PrintStream {
-
+    public static class FileOrStdoutPrintStream extends PrintStream {
         private boolean isFile;
 
-        FileOrStdoutPrintStream() {
+        public FileOrStdoutPrintStream() {
             super(System.out, true);
             isFile = false;
         }
 
-        FileOrStdoutPrintStream(String fileName) throws FileNotFoundException, UnsupportedEncodingException {
+        public FileOrStdoutPrintStream(String fileName) throws FileNotFoundException, UnsupportedEncodingException {
             super(new FileOutputStream(fileName), false, "UTF-8");
             isFile = true;
         }

diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/DictionaryPrinter.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/DictionaryPrinter.java
@@ -22,53 +22,149 @@
 import java.util.Arrays;
 import java.util.List;
 import java.util.stream.Collectors;
+import java.io.Console;
+
+import com.worksap.nlp.sudachi.WordId;
+import com.worksap.nlp.sudachi.SudachiCommandLine.FileOrStdoutPrintStream;
 
 public class DictionaryPrinter {
+    private final PrintStream output;
+    private final GrammarImpl grammar;
+    private final LexiconSet lexicon;
+    private final List<String> posStrings;
+    private final boolean isUser;
+    private final int entrySize;
+
+    /**
+     * Creates a printer for the entries of {@code dic}.
+     *
+     * When {@code base} is {@code null}, grammar and lexicon are taken from
+     * {@code dic} itself. Otherwise the grammar and lexicon of {@code base} are
+     * used and the lexicon of {@code dic} is added to that lexicon set.
+     *
+     * @param output
+     *            stream the entries are written to
+     * @param dic
+     *            dictionary whose entries are printed
+     * @param base
+     *            system dictionary, or {@code null}; must not be {@code null}
+     *            when {@code dic} is a user dictionary
+     * @throws IllegalArgumentException
+     *             if {@code dic} is a user dictionary and {@code base} is
+     *             {@code null}
+     */
+    DictionaryPrinter(PrintStream output, BinaryDictionary dic, BinaryDictionary base) {
+        if (dic.getDictionaryHeader().isUserDictionary() && base == null) {
+            throw new IllegalArgumentException("System dictionary is required to print user dictionary");
+        }
 
-    private DictionaryPrinter() {
-    }
+        this.output = output;
 
-    static void printDictionary(String filename, BinaryDictionary systemDict, PrintStream output) throws IOException {
-        GrammarImpl grammar = null;
+        // NOTE(review): isUser follows "base != null", not the header of dic;
+        // confirm that a base is never passed together with a system dictionary.
+        if (base == null) {
+            isUser = false;
+            grammar = dic.getGrammar();
+            lexicon = new LexiconSet(dic.getLexicon(), grammar.getSystemPartOfSpeechSize());
+        } else {
+            isUser = true;
+            grammar = base.getGrammar();
+            lexicon = new LexiconSet(base.getLexicon(), grammar.getSystemPartOfSpeechSize());
 
-        try (BinaryDictionary dictionary = new BinaryDictionary(filename)) {
-            if (dictionary.getDictionaryHeader().isSystemDictionary()) {
-                grammar = dictionary.getGrammar();
-            } else if (systemDict == null) {
-                throw new IllegalArgumentException("the system dictionary is not specified");
-            } else {
-                grammar = systemDict.getGrammar();
-                if (DictionaryVersion.hasGrammar(dictionary.getDictionaryHeader().getVersion())) {
-                    grammar.addPosList(dictionary.getGrammar());
-                }
+            lexicon.add(dic.getLexicon(), (short) grammar.getPartOfSpeechSize());
+            // NOTE(review): grammar is the instance obtained from base, so this
+            // call presumably extends that object in place - confirm.
+            if (DictionaryVersion.hasGrammar(dic.getDictionaryHeader().getVersion())) {
+                grammar.addPosList(dic.getGrammar());
             }
+        }
 
-            List<String> posStrings = new ArrayList<>();
-            for (short pid = 0; pid < grammar.getPartOfSpeechSize(); pid++) {
-                posStrings.add(String.join(",", grammar.getPartOfSpeechString(pid)));
-            }
+        // cache the comma-joined part-of-speech string of every POS id
+        List<String> poss = new ArrayList<>();
+        for (short pid = 0; pid < grammar.getPartOfSpeechSize(); pid++) {
+            poss.add(String.join(",", grammar.getPartOfSpeechString(pid)));
+        }
+        this.posStrings = poss;
+
+        // only the entries of dic itself are printed, not those of base
+        this.entrySize = dic.getLexicon().size();
+    }
+
+    /**
+     * Prints the command line usage to standard error.
+     *
+     * The message is not written through {@code System.console()} because that
+     * method returns {@code null} when the JVM has no interactive console (for
+     * example when the output is redirected), which would make {@code -h} fail
+     * with a {@code NullPointerException}.
+     */
+    private static void printUsage() {
+        System.err.print("usage: PrintDictionary [-o file] [-s file] file\n");
+        System.err.print("\t-o file\toutput to file\n");
+        System.err.print("\t-s file\tsystem dictionary\n");
+    }
 
-            Lexicon lexicon = dictionary.getLexicon();
-            for (int wordId = 0; wordId < lexicon.size(); wordId++) {
-                short leftId = lexicon.getLeftId(wordId);
-                short rightId = lexicon.getRightId(wordId);
-                short cost = lexicon.getCost(wordId);
-                WordInfo wordInfo = lexicon.getWordInfo(wordId);
-
-                char unitType = getUnitType(wordInfo);
-
-                output.println(String.format("%s,%d,%d,%d,%s,%s,%s,%s,%s,%c,%s,%s,%s", wordInfo.getSurface(), leftId,
-                        rightId, cost, wordInfo.getSurface(), posStrings.get(wordInfo.getPOSId()),
-                        wordInfo.getReadingForm(), wordInfo.getNormalizedForm(),
-                        wordIdToString(wordInfo.getDictionaryFormWordId()), unitType,
-                        splitToString(wordInfo.getAunitSplit()), splitToString(wordInfo.getBunitSplit()),
-                        splitToString(wordInfo.getWordStructure())));
+    /** Prints every entry of the target dictionary, one line per word. */
+    void printEntries() {
+        final int dictionaryIndex = isUser ? 1 : 0;
+        int wordIndex = 0;
+        while (wordIndex < entrySize) {
+            printEntry(WordId.make(dictionaryIndex, wordIndex));
+            wordIndex++;
+        }
+    }
+
+    /**
+     * Writes a single lexicon entry as one comma-separated line.
+     *
+     * @param wordId
+     *            id of the word, looked up in the lexicon set
+     */
+    private void printEntry(int wordId) {
+        final short left = lexicon.getLeftId(wordId);
+        final short right = lexicon.getRightId(wordId);
+        final short wordCost = lexicon.getCost(wordId);
+        final WordInfo info = lexicon.getWordInfo(wordId);
+
+        // the escaped surface fills both the first and the fifth column
+        final String surface = maybeEscapeString(info.getSurface());
+
+        field(surface);
+        field(left);
+        field(right);
+        field(wordCost);
+        field(surface);
+        field(posStrings.get(info.getPOSId()));
+        field(maybeEscapeString(info.getReadingForm()));
+        field(maybeEscapeString(info.getNormalizedForm()));
+        field(wordRefToString(info.getDictionaryFormWordId()));
+        field(getUnitType(info));
+        field(splitToString(info.getAunitSplit()));
+        field(splitToString(info.getBunitSplit()));
+        field(splitToString(info.getWordStructure()));
+        lastField(synonymIdList(info.getSynonymGoupIds()));
+        output.print("\n");
+    }
+
+    /** Prints a numeric field followed by the "," separator. */
+    void field(short value) {
+        output.append(Short.toString(value)).append(',');
+    }
+
+    /** Prints a single-character field followed by the "," separator. */
+    void field(char value) {
+        output.append(value).append(',');
+    }
+
+    /** Prints a text field followed by the "," separator. */
+    void field(String value) {
+        output.append(value).append(',');
+    }
+
+    /** Prints the final field of a line; no separator is appended. */
+    void lastField(String value) {
+        output.append(value);
+    }
+
+    /**
+     * Formats synonym group ids as zero-padded six digit numbers joined by "/".
+     *
+     * @param ints
+     *            synonym group ids
+     * @return "*" when there are no ids, otherwise the joined id list
+     */
+    String synonymIdList(int[] ints) {
+        if (ints.length == 0) {
+            return "*";
+        }
+        StringBuilder joined = new StringBuilder();
+        for (int gid : ints) {
+            if (joined.length() > 0) {
+                joined.append('/');
+            }
+            joined.append(String.format("%06d", gid));
+        }
+        return joined.toString();
+    }
+
+    /** Tells whether the character {@code ch} occurs anywhere in {@code value}. */
+    private static boolean hasCh(String value, int ch) {
+        return value.indexOf(ch) >= 0;
+    }
+
+    /**
+     * Escapes a string field of the csv output.
+     *
+     * @param value
+     *            raw field text
+     * @return {@code value} itself when it contains neither a comma nor a double
+     *         quote, otherwise a copy with those characters unicode-escaped
+     */
+    private String maybeEscapeString(String value) {
+        if (hasCh(value, ',') || hasCh(value, '"')) {
+            return unicodeEscape(value, Arrays.asList('"', ','));
+        }
+        return value;
+    }
+
+    /**
+     * Replaces every occurrence of the given characters with its unicode escape
+     * (a backslash, the letter u and four hexadecimal digits).
+     *
+     * @param value
+     *            text to escape
+     * @param targetChars
+     *            characters that must be escaped
+     * @return the escaped text
+     */
+    private String unicodeEscape(String value, List<Character> targetChars) {
+        StringBuilder sb = new StringBuilder(value.length() + 10);
+        int len = value.length();
+        for (int i = 0; i < len; ++i) {
+            char c = value.charAt(i);
+            if (targetChars.contains(c)) {
+                // always emit exactly four hex digits: an unpadded hex string
+                // would break the escape for chars below 0x10 or above 0xff
+                sb.append(String.format("\\u%04x", (int) c));
+            } else {
+                sb.append(c);
             }
         }
+        return sb.toString();
     }
 
-    static String wordIdToString(int wid) {
-        return (wid < 0) ? "*" : Integer.toString(wid);
+    /**
+     * Formats a reference to another word for the csv output.
+     *
+     * @param wid
+     *            id of the referred word; a negative value means "no reference"
+     * @return "*" for a negative id, otherwise the word reference wrapped in
+     *         double quotes
+     */
+    String wordRefToString(int wid) {
+        return (wid < 0) ? "*" : "\"" + wordRef(wid) + "\"";
     }
 
     static char getUnitType(WordInfo info) {
@@ -81,14 +177,20 @@ static char getUnitType(WordInfo info) {
         }
     }
 
-    static String splitToString(int[] split) {
+    /**
+     * Formats split information as a double-quoted, "/"-separated list of word
+     * references.
+     *
+     * @param split
+     *            word ids of the split units
+     * @return "*" when {@code split} is empty, otherwise the quoted list
+     */
+    private String splitToString(int[] split) {
         if (split.length == 0) {
             return "*";
-        } else {
-            return Arrays.stream(split)
-                    .mapToObj(i -> (i >> 28 != 0) ? "U" + Integer.toString(i & ((1 << 28) - 1)) : Integer.toString(i))
-                    .collect(Collectors.joining("/"));
         }
+        return "\"" + Arrays.stream(split).mapToObj(this::wordRef).collect(Collectors.joining("/")) + "\"";
+    }
+
+    /**
+     * Builds the textual reference to a word: escaped surface, part of speech
+     * and escaped reading, separated by commas.
+     *
+     * @param wordId
+     *            id of the referred word, looked up in the lexicon set
+     * @return the word reference (without surrounding quotes)
+     */
+    private String wordRef(int wordId) {
+        WordInfo info = lexicon.getWordInfo(wordId);
+        String surface = maybeEscapeString(info.getSurface());
+        // use the explicitly comma-joined POS string cached by the constructor,
+        // as printEntry does, instead of depending on the toString() format
+        String pos = posStrings.get(info.getPOSId());
+        String reading = maybeEscapeString(info.getReadingForm());
+        return String.format("%s,%s,%s", surface, pos, reading);
     }
 
     /**
@@ -111,25 +213,38 @@ static String splitToString(int[] split) {
      *             if IO
      */
     public static void main(String[] args) throws IOException {
-        BinaryDictionary systemDict = null;
-
-        try {
-            int i = 0;
-            for (i = 0; i < args.length; i++) {
-                if (args[i].equals("-s") && i + 1 < args.length) {
-                    systemDict = BinaryDictionary.loadSystem(args[++i]);
-                } else if (args[i].equals("-h")) {
-                    System.err.println("usage: PrintDictionary [-s file] file");
-                    System.err.println("\t-s file\tsystem dictionary");
-                    return;
-                } else {
-                    break;
-                }
+        String systemDictPath = null;
+        String outputFileName = null;
+
+        int i = 0;
+        for (i = 0; i < args.length; i++) {
+            if (args[i].equals("-h")) {
+                printUsage();
+                return;
+            } else if (args[i].equals("-o") && i + 1 < args.length) {
+                outputFileName = args[++i];
+            } else if (args[i].equals("-s") && i + 1 < args.length) {
+                systemDictPath = args[++i];
+            } else {
+                break;
             }
+        }
+        if (i >= args.length) {
+            System.console().printf("target dictionary file is missing");
+            return;
+        }
 
-            if (i < args.length) {
-                printDictionary(args[i], systemDict, System.out);
+        String dictPath = args[i];
+        BinaryDictionary systemDict = null;
+        try (BinaryDictionary dict = new BinaryDictionary(dictPath);
+                PrintStream output = outputFileName == null ? new FileOrStdoutPrintStream()
+                        : new FileOrStdoutPrintStream(outputFileName);) {
+            if (systemDictPath != null) {
+                systemDict = BinaryDictionary.loadSystem(systemDictPath);
             }
+
+            DictionaryPrinter printer = new DictionaryPrinter(output, dict, systemDict);
+            printer.printEntries();
         } finally {
             if (systemDict != null) {
                 systemDict.close();

diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/CsvLexicon.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/CsvLexicon.java
@@ -114,6 +114,7 @@ WordEntry parseLine(List<String> cols) {
         POS pos = new POS(cols.get(5), cols.get(6), cols.get(7), cols.get(8), cols.get(9), cols.get(10));
         short posId = posTable.getId(pos);
 
+        entry.dictionaryFormString = cols.get(13);
         entry.aUnitSplitString = cols.get(15);
         entry.bUnitSplitString = cols.get(16);
         entry.wordStructureString = cols.get(17);
@@ -131,14 +132,57 @@ WordEntry parseLine(List<String> cols) {
 
         entry.wordInfo = new WordInfo(cols.get(4), // headword
                 (short) cols.get(0).getBytes(StandardCharsets.UTF_8).length, posId, cols.get(12), // normalizedForm
-                (cols.get(13).equals("*") ? -1 : Integer.parseInt(cols.get(13))), // dictionaryFormWordId
-                "", // dummy
+                -1, "", // dictionaryForm (dummy)
                 cols.get(11), // readingForm
                 null, null, null, synonymGids);
 
         return entry;
     }
 
+    /**
+     * Resolves the dictionary_form column of a lexicon line to a word id.
+     *
+     * The column may hold "*" (no dictionary form), a plain id (line number) or
+     * a word reference. A word reference must point to a word of this lexicon:
+     * a user word cannot use a system word as its dictionary form.
+     *
+     * @param str
+     *            content of the dictionary_form column
+     * @return -1 for "*", otherwise the id of the referred word
+     * @throws IllegalArgumentException
+     *             if the word reference cannot be resolved or refers to a word
+     *             outside this lexicon
+     */
+    int parseDictionaryForm(String str) {
+        if (str.equals("*")) {
+            return -1;
+        }
+        int resolved;
+        try {
+            resolved = Integer.parseInt(str);
+        } catch (NumberFormatException ignored) {
+            // not a plain id: resolve the column as a word reference instead
+            resolved = wordToId(str);
+            if (resolved < 0) {
+                throw new IllegalArgumentException("couldn't find " + str + " in the dictionaries");
+            }
+            if (!wordRefMatches(str, resolved)) {
+                throw new IllegalArgumentException("dictionary form must exist in the same lexicon");
+            }
+        }
+        return resolved;
+    }
+
+    /**
+     * Checks whether a word reference describes the entry stored under the
+     * given id in this lexicon.
+     *
+     * @param ref
+     *            word reference: headword, six part-of-speech fields and reading,
+     *            separated by commas
+     * @param wordId
+     *            candidate id within this lexicon
+     * @return {@code true} if the id is in range and headword, part of speech
+     *         and reading all match the entry
+     */
+    boolean wordRefMatches(String ref, int wordId) {
+        boolean inRange = 0 <= wordId && wordId < entries.size();
+        if (!inRange) {
+            return false;
+        }
+        WordEntry target = entries.get(wordId);
+
+        String[] parts = ref.split(",", 8);
+        String refHeadword = unescape(parts[0]);
+        short refPosId = posTable.getId(new POS(Arrays.copyOfRange(parts, 1, 7)));
+        String refReading = unescape(parts[7]);
+
+        if (!refHeadword.equals(target.headword)) {
+            return false;
+        }
+        if (refPosId != target.wordInfo.getPOSId()) {
+            return false;
+        }
+        return refReading.equals(target.wordInfo.getReadingForm());
+    }
+
     int[] parseSynonymGids(String str) {
         if (str.equals("*")) {
             return new int[0];
@@ -246,7 +290,7 @@ public void writeTo(ModelOutput output) throws IOException {
                 buffer.putLength(wi.getLength());
                 buffer.putShort(wi.getPOSId());
                 buffer.putEmptyIfEqual(wi.getNormalizedForm(), wi.getSurface());
-                buffer.putInt(wi.getDictionaryFormWordId());
+                buffer.putInt(parseDictionaryForm(entry.dictionaryFormString));
                 buffer.putEmptyIfEqual(wi.getReadingForm(), wi.getSurface());
                 buffer.putInts(parseSplitInfo(entry.aUnitSplitString));
                 buffer.putInts(parseSplitInfo(entry.bUnitSplitString));
@@ -277,6 +321,7 @@ public void setLimits(int left, int right) {
     public static class WordEntry {
         String headword;
         WordInfo wordInfo;
+        String dictionaryFormString;
         String aUnitSplitString;
         String bUnitSplitString;
         String wordStructureString;