diff --git a/src/main/java/com/worksap/nlp/sudachi/TextNormalizer.java b/src/main/java/com/worksap/nlp/sudachi/TextNormalizer.java new file mode 100644 index 00000000..749fa45a --- /dev/null +++ b/src/main/java/com/worksap/nlp/sudachi/TextNormalizer.java @@ -0,0 +1,105 @@ +/* + * Copyright (c) 2024 Works Applications Co., Ltd. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.worksap.nlp.sudachi; + +import java.util.List; +import java.io.IOException; +import java.util.ArrayList; + +import com.worksap.nlp.sudachi.dictionary.CharacterCategory; +import com.worksap.nlp.sudachi.dictionary.Grammar; +import com.worksap.nlp.sudachi.dictionary.GrammarImpl; + +/** + * A text normalizer. + */ +public class TextNormalizer { + private final Grammar grammar; + private final List inputTextPlugins; + + /** + * Create a TextNormalizer from a grammar and input text plugins. + * + * Grammar must have + * {@link com.worksap.nlp.sudachi.dictionary.CharacterCategory}. + */ + public TextNormalizer(Grammar grammar, List inputTextPlugins) { + if (grammar.getCharacterCategory() == null) { + throw new IllegalArgumentException("grammar for TextNormalizer must have CharacterCategory."); + } + this.grammar = grammar; + this.inputTextPlugins = inputTextPlugins; + } + + /** + * Create a TextNormalizer from a grammar. + * + * Grammar must have a + * {@link com.worksap.nlp.sudachi.dictionary.CharacterCategory}. + * {@link DefaultInputTextPlugin} will be used. 
+ */ + public TextNormalizer(Grammar grammar) throws IOException { + this(grammar, setupDefaultInputTextPlugins(grammar)); + } + + /** + * Create a default TextNormalizer that uses default + * {@link com.worksap.nlp.sudachi.dictionary.CharacterCategory} and + * {@link DefaultInputTextPlugin}. + */ + public static TextNormalizer defaultTextNormalizer() throws IOException { + Grammar grammar = new GrammarImpl(); + grammar.setCharacterCategory(CharacterCategory.loadDefault()); + return new TextNormalizer(grammar); + } + + /** + * Create TextNormalizer based on the {@link JapaneseDictionary}. + */ + public static TextNormalizer fromDictionary(JapaneseDictionary dictionary) { + return new TextNormalizer(dictionary.getGrammar(), dictionary.inputTextPlugins); + } + + /** + * Setup {@link DefaultInputTextPlugin} using a grammar. + */ + private static List setupDefaultInputTextPlugins(Grammar grammar) throws IOException { + PathAnchor anchor = PathAnchor.classpath(); + List> pconfs = Config.fromJsonString( + "{\"inputTextPlugin\":[{\"class\":\"com.worksap.nlp.sudachi.DefaultInputTextPlugin\"}]}", anchor) + .getInputTextPlugins(); + + List plugins = new ArrayList<>(); + for (Config.PluginConf pconf : pconfs) { + InputTextPlugin p = pconf.instantiate(anchor); + p.setUp(grammar); + plugins.add(p); + } + + return plugins; + } + + /** Normalize given text */ + public String normalize(CharSequence text) { + UTF8InputTextBuilder builder = new UTF8InputTextBuilder(text, grammar); + for (InputTextPlugin plugin : inputTextPlugins) { + plugin.rewrite(builder); + } + UTF8InputText input = builder.build(); + return input.getText(); + } +} diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/CharacterCategory.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/CharacterCategory.java index 92f8a5bd..46ccd7fd 100644 --- a/src/main/java/com/worksap/nlp/sudachi/dictionary/CharacterCategory.java +++ b/src/main/java/com/worksap/nlp/sudachi/dictionary/CharacterCategory.java @@ 
-17,6 +17,7 @@ package com.worksap.nlp.sudachi.dictionary; import com.worksap.nlp.sudachi.Config; +import com.worksap.nlp.sudachi.PathAnchor; import java.io.*; import java.nio.charset.StandardCharsets; @@ -157,4 +158,9 @@ public static CharacterCategory load(Config.Resource resource return result; }); } + + public static CharacterCategory loadDefault() throws IOException { + Config.Resource defaultResource = PathAnchor.classpath().resource("char.def"); + return load(defaultResource); + } } diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/DictionaryPrinter.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/DictionaryPrinter.java index b79de04b..06db1d59 100644 --- a/src/main/java/com/worksap/nlp/sudachi/dictionary/DictionaryPrinter.java +++ b/src/main/java/com/worksap/nlp/sudachi/dictionary/DictionaryPrinter.java @@ -25,17 +25,19 @@ import java.io.Console; import com.worksap.nlp.sudachi.WordId; +import com.worksap.nlp.sudachi.TextNormalizer; import com.worksap.nlp.sudachi.SudachiCommandLine.FileOrStdoutPrintStream; public class DictionaryPrinter { private final PrintStream output; + private final TextNormalizer textNormalizer; private final GrammarImpl grammar; private final LexiconSet lexicon; private final List posStrings; private final boolean isUser; private final int entrySize; - DictionaryPrinter(PrintStream output, BinaryDictionary dic, BinaryDictionary base) { + DictionaryPrinter(PrintStream output, BinaryDictionary dic, BinaryDictionary base) throws IOException { if (dic.getDictionaryHeader().isUserDictionary() && base == null) { throw new IllegalArgumentException("System dictionary is required to print user dictionary"); } @@ -57,6 +59,10 @@ public class DictionaryPrinter { } } + // set default char category for text normalizer + grammar.setCharacterCategory(CharacterCategory.loadDefault()); + textNormalizer = new TextNormalizer(grammar); + List poss = new ArrayList<>(); for (short pid = 0; pid < grammar.getPartOfSpeechSize(); pid++) { 
poss.add(String.join(",", grammar.getPartOfSpeechString(pid))); @@ -86,7 +92,7 @@ private void printEntry(int wordId) { short cost = lexicon.getCost(wordId); WordInfo wordInfo = lexicon.getWordInfo(wordId); - field(maybeEscapeString(wordInfo.getSurface())); + field(maybeEscapeString(textNormalizer.normalize(wordInfo.getSurface()))); field(leftId); field(rightId); field(cost); diff --git a/src/test/dict/lex.csv b/src/test/dict/lex.csv index 6b8fb50d..8ec1f979 100644 --- a/src/test/dict/lex.csv +++ b/src/test/dict/lex.csv @@ -35,5 +35,5 @@ いく,4,4,5105,いく,動詞,非自立可能,*,*,五段-カ行,終止形-一般,イク,行く,*,A,*,*,*,* いっ,5,5,5122,いっ,動詞,非自立可能,*,*,五段-カ行,連用形-促音便,イッ,行く,34,A,*,*,*,* 012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789,9,9,2478,012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789,名詞,数詞,*,*,*,*,ゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウ,01234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789
/** Tests for constructing a [TextNormalizer] and for the text it produces. */
class TextNormalizerTest {

  // Dictionary built from the user2 test config with the default character category attached.
  private val dic =
      DictionaryFactory()
          .create(TestDictionary.user2Cfg().characterDefinition(CharacterCategory.loadDefault()))
          as JapaneseDictionary

  /** Every public way of creating a normalizer must succeed without throwing. */
  @Test
  fun instantiation() {
    TextNormalizer.fromDictionary(dic)
    TextNormalizer(dic.getGrammar())
    TextNormalizer(dic.getGrammar(), dic.inputTextPlugins)
    TextNormalizer.defaultTextNormalizer()
  }

  /** A grammar with no character category set must be rejected. */
  @Test
  fun failToInstantiateWithoutCharCategory() {
    val grammar = GrammarImpl()
    assertFails { TextNormalizer(grammar) }
  }

  /** The default normalizer applies the DefaultInputTextPlugin rewrite. */
  @Test
  fun normalizeText() {
    val tn = TextNormalizer.defaultTextNormalizer()

    // from DefaultInputTextPlugin test
    assertEquals("âbγд(株)ガヴ⼼ⅲ", tn.normalize("ÂBΓД㈱ガウ゛⼼Ⅲ"))
  }

  /** A normalizer built from the dictionary applies all of the dictionary's plugins. */
  @Test
  fun normalizeTextWithDefaultConfig() {
    // will use default config, which has InputTextPlugins of
    // [Default, ProlongedSoundMark, IgnoreYomigana]
    val tn = TextNormalizer.fromDictionary(dic)

    assertEquals("âbγд(株)ガヴ⼼ⅲ", tn.normalize("ÂBΓД㈱ガウ゛⼼Ⅲ")) // default
    assertEquals("うわーい", tn.normalize("うわーーーい")) // prolonged sound mark
    assertEquals("小鳥遊", tn.normalize("小鳥遊(タカナシ)")) // ignore yomigana
  }
}
@Test diff --git a/src/test/resources/dict/lex.csv b/src/test/resources/dict/lex.csv index d8214f6b..f4f97cc9 100644 --- a/src/test/resources/dict/lex.csv +++ b/src/test/resources/dict/lex.csv @@ -35,6 +35,6 @@ いく,4,4,5105,いく,動詞,非自立可能,*,*,五段-カ行,終止形-一般,イク,行く,*,A,*,*,*,* いっ,5,5,5122,いっ,動詞,非自立可能,*,*,五段-カ行,連用形-促音便,イッ,行く,34,A,*,*,*,* 012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789,9,9,-9000,012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789,名詞,数詞,*,*,*,*,ゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウ,012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789,*,A,*,*,*,* -特a,8,8,2914,特a,名詞,普通名詞,一般,*,*,*,トクエー,特a,*,A,*,*,*,* +特a,8,8,2914,特A,名詞,普通名詞,一般,*,*,*,トクエー,特A,*,A,*,*,*,* 隠し,-1,-1,0,隠し,名詞,普通名詞,一般,*,*,*,カクシ,隠し,*,A,*,*,*,* 
な。な,8,8,2914,な。な,名詞,普通名詞,一般,*,*,*,ナナ,な。な,*,C,11,11,*,*