From 62183a001a8076f2f31bc23df64257ce22fbd4a9 Mon Sep 17 00:00:00 2001 From: Arseny Tolmachev Date: Fri, 11 Feb 2022 18:11:09 +0900 Subject: [PATCH 01/94] non-boxing variation of dictionary lookup --- .../worksap/nlp/sudachi/dictionary/WordIdTable.java | 2 +- .../com/worksap/nlp/sudachi/dictionary/WordLookup.java | 10 +++++----- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/WordIdTable.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/WordIdTable.java index 97e1f581..b79eb17b 100644 --- a/src/main/java/com/worksap/nlp/sudachi/dictionary/WordIdTable.java +++ b/src/main/java/com/worksap/nlp/sudachi/dictionary/WordIdTable.java @@ -48,7 +48,7 @@ Integer[] get(int index) { /** * Reads the word IDs to the passed WordLookup object - * + * * @param index * index in the word array * @param lookup diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/WordLookup.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/WordLookup.java index a02dfbec..705f5f8a 100644 --- a/src/main/java/com/worksap/nlp/sudachi/dictionary/WordLookup.java +++ b/src/main/java/com/worksap/nlp/sudachi/dictionary/WordLookup.java @@ -48,7 +48,7 @@ private void rebind(DoubleArrayLexicon lexicon) { /** * Start the search for new key - * + * * @param key * utf-8 bytes corresponding to the trie key * @param offset @@ -65,7 +65,7 @@ public void reset(byte[] key, int offset, int limit) { /** * This is not public API. 
Returns the array for wordIds with the length at * least equal to the passed parameter - * + * * @param length * minimum requested length * @return WordId array @@ -79,7 +79,7 @@ public int[] outputBuffer(int length) { /** * Sets the wordIds, numWords, endOffset to the - * + * * @return true if there was an entry in any of binary dictionaries */ public boolean next() { @@ -98,7 +98,7 @@ public boolean next() { /** * Returns trie key end offset - * + * * @return number of utf-8 bytes corresponding to the end of key */ public int getEndOffset() { @@ -116,7 +116,7 @@ public int getNumWords() { /** * Returns array of word ids. Number of correct entries is specified by * {@link #getNumWords()}. WordIds have their dictionary part set. - * + * * @return array consisting word ids for the current index entry * @see WordId */ From 854410f5511eb4636aa424d0bcb653549e9ec48a Mon Sep 17 00:00:00 2001 From: Arseny Tolmachev Date: Tue, 22 Mar 2022 12:08:11 +0900 Subject: [PATCH 02/94] prototype of dictionary redo super WIP --- .gitignore | 4 +- .../worksap/nlp/sudachi/dictionary/Ints.java | 47 ++++++ .../sudachi/dictionary/build/CsvLexicon.java | 36 +++- .../sudachi/dictionary/build/DicBuffer.java | 4 - .../sudachi/dictionary/build/DicStrings.java | 23 +++ .../sudachi/dictionary/build/IOConsumer.java | 12 ++ .../dictionary/build/InMemoryChannel.java | 79 +++++++++ .../sudachi/dictionary/build/StringPtr.java | 59 +++++++ .../dictionary/build/UnicodeBuffer.java | 47 ++++++ .../dictionary/build/UniqueStrings.java | 157 ++++++++++++++++++ .../dictionary/build/WordIdResolver.java | 8 +- .../build/WordInfoLayoutFixedWidth.java | 74 +++++++++ .../sudachi/dictionary/build/WordLayout.java | 39 +++++ .../sudachi/dictionary/build/StringPtrTest.kt | 34 ++++ .../dictionary/build/UniqueStringsTest.kt | 33 ++++ 15 files changed, 647 insertions(+), 9 deletions(-) create mode 100644 src/main/java/com/worksap/nlp/sudachi/dictionary/Ints.java create mode 100644 
/**
 * A minimal growable array of primitive {@code int} values which avoids
 * boxing.
 *
 * <p>
 * Instances are not thread-safe.
 */
public class Ints {
    /** Largest array size which is safe to request on common JVMs. */
    private static final int MAX_ARRAY_SIZE = Integer.MAX_VALUE - 8;

    private int[] data;
    private int length;

    /**
     * Creates an empty list.
     *
     * @param capacity
     *            initial size of the backing array, must not be negative
     */
    public Ints(int capacity) {
        data = new int[capacity];
        length = 0;
    }

    /**
     * Returns the element at the given position.
     *
     * @param index
     *            position of the element, {@code 0 <= index < length()}
     * @return stored value
     * @throws IndexOutOfBoundsException
     *             if the index does not point to an appended element
     */
    public int get(int index) {
        // an assert is not enough here: assertions are disabled by default and
        // slots past length may still contain values from before clear()
        if (index < 0 || index >= length) {
            throw new IndexOutOfBoundsException("index=" + index + ", length=" + length);
        }
        return data[index];
    }

    /**
     * Overwrites the slot at the given position of the backing array. The
     * logical length is not changed by this call.
     *
     * @param index
     *            position inside the backing array
     * @param value
     *            new value
     * @return value which was stored in the slot before
     * @throws ArrayIndexOutOfBoundsException
     *             if the index is outside of the backing array
     */
    public int set(int index, int value) {
        int old = data[index];
        data[index] = value;
        return old;
    }

    /**
     * @return number of appended elements
     */
    public int length() {
        return length;
    }

    /**
     * Appends the value to the end of the list, growing the backing array if
     * needed.
     *
     * @param value
     *            value to append
     */
    public void append(int value) {
        maybeResize(1);
        int idx = this.length;
        data[idx] = value;
        length = idx + 1;
    }

    /**
     * Removes all elements. The backing array is kept for reuse.
     */
    public void clear() {
        length = 0;
    }

    /**
     * Makes sure that the backing array can hold {@code additional} more
     * elements after the current length.
     *
     * @param additional
     *            number of elements which are going to be appended
     * @throws ArithmeticException
     *             if the required size does not fit into an int
     */
    public void maybeResize(int additional) {
        int required = Math.addExact(length, additional);
        if (required > data.length) {
            // double the used size, but never request less than what is required
            long doubled = Math.min(2L * length, MAX_ARRAY_SIZE);
            data = Arrays.copyOf(data, (int) Math.max(required, doubled));
        }
    }

}
a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/DicBuffer.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/DicBuffer.java index 04476697..fe5c3c35 100644 --- a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/DicBuffer.java +++ b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/DicBuffer.java @@ -122,8 +122,4 @@ public void putInts(int[] data) { } } - @FunctionalInterface - public interface IOConsumer { - T accept(ByteBuffer arg) throws IOException; - } } diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/DicStrings.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/DicStrings.java new file mode 100644 index 00000000..d6b87de2 --- /dev/null +++ b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/DicStrings.java @@ -0,0 +1,23 @@ +package com.worksap.nlp.sudachi.dictionary.build; + +import java.util.HashMap; + +public class DicStrings { + private HashMap counts; + + static int length(int pointer) { + int b0 = pointer >>> 24; + int additional = Math.max(0, b0 - (255 - 8)); + int b1 = (pointer & 0x00ffffff) >>> (24 - additional); + return b0 + b1 * 8 - additional; + } + + static int offset(int pointer) { + int b0 = pointer >>> 24; + int additional = Math.max(0, b0 - (255 - 8)); + int mask = 0x00ffffff >>> additional; + int rawOffset = pointer & mask; + return rawOffset << (additional + 1); + } + +} diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/IOConsumer.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/IOConsumer.java new file mode 100644 index 00000000..46edc5d9 --- /dev/null +++ b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/IOConsumer.java @@ -0,0 +1,12 @@ +package com.worksap.nlp.sudachi.dictionary.build; + +import java.io.IOException; +import java.nio.ByteBuffer; + +/** + * A version of {@link java.util.function.Consumer} which allows throwing IOException + */ +@FunctionalInterface +public interface IOConsumer { + T accept(ByteBuffer arg) throws 
/**
 * A {@link SeekableByteChannel} which keeps all its content in a growable
 * little-endian heap buffer. The channel is never closed.
 *
 * <p>
 * Instances are not thread-safe.
 */
public final class InMemoryChannel implements SeekableByteChannel {
    private static final int DEFAULT_CAPACITY = 1024 * 1024;

    /** Backing storage; its position is the channel position, its limit is always the capacity. */
    private ByteBuffer buffer;
    /** Number of bytes which belong to the channel content. */
    private int size;

    public InMemoryChannel() {
        this(DEFAULT_CAPACITY);
    }

    /**
     * @param size
     *            initial capacity of the backing buffer in bytes
     */
    public InMemoryChannel(int size) {
        buffer = allocate(size);
    }

    private static ByteBuffer allocate(int capacity) {
        ByteBuffer result = ByteBuffer.allocate(capacity);
        result.order(ByteOrder.LITTLE_ENDIAN);
        return result;
    }

    /**
     * Makes sure that at least {@code needed} bytes can be written starting
     * from the current position. Already written content is preserved.
     *
     * @param needed
     *            number of bytes which are going to be written
     * @throws IllegalArgumentException
     *             if the resulting content would not fit into a single buffer
     */
    public void reserve(int needed) {
        if (buffer.remaining() >= needed) {
            return;
        }
        long required = (long) buffer.position() + needed;
        if (required > Integer.MAX_VALUE) {
            throw new IllegalArgumentException("in-memory channel can not hold " + required + " bytes");
        }
        // doubling alone is not enough when needed > capacity or capacity == 0
        long doubled = Math.min(2L * buffer.capacity(), Integer.MAX_VALUE);
        ByteBuffer grown = allocate((int) Math.max(required, doubled));
        ByteBuffer written = buffer.duplicate();
        written.clear();
        written.limit(size);
        grown.put(written);
        grown.position(buffer.position());
        buffer = grown;
    }

    /**
     * Copies bytes from the current position of the channel into {@code dst}.
     *
     * @return number of bytes read, or -1 if the position is at or after the
     *         end of the content
     */
    @Override
    public int read(ByteBuffer dst) throws IOException {
        int position = buffer.position();
        int available = size - position;
        if (available <= 0) {
            return -1;
        }
        int count = Math.min(available, dst.remaining());
        ByteBuffer view = buffer.duplicate();
        view.limit(position + count);
        dst.put(view);
        buffer.position(position + count);
        return count;
    }

    /**
     * Copies all remaining bytes of {@code src} to the current position of the
     * channel, growing the storage if required.
     *
     * @return number of bytes written
     */
    @Override
    public int write(ByteBuffer src) throws IOException {
        int count = src.remaining();
        reserve(count);
        buffer.put(src);
        size = Math.max(size, buffer.position());
        return count;
    }

    @Override
    public long position() throws IOException {
        return buffer.position();
    }

    /**
     * Moves the channel position. Positions past the current size are allowed;
     * the size changes only when something is written there.
     *
     * @throws IllegalArgumentException
     *             if the position is negative or does not fit into an int
     */
    @Override
    public SeekableByteChannel position(long newPosition) throws IOException {
        if (newPosition < 0 || newPosition > Integer.MAX_VALUE) {
            throw new IllegalArgumentException("invalid position: " + newPosition);
        }
        int target = (int) newPosition;
        if (target > buffer.capacity()) {
            reserve(target - buffer.position());
        }
        buffer.position(target);
        return this;
    }

    /**
     * @return number of bytes written to the channel, not the capacity of the
     *         backing buffer
     */
    @Override
    public long size() throws IOException {
        return size;
    }

    /**
     * Drops the content after {@code size} bytes. If the current position is
     * larger than the new size it is moved to the new end.
     *
     * @throws IllegalArgumentException
     *             if the size is negative
     */
    @Override
    public SeekableByteChannel truncate(long size) {
        if (size < 0) {
            throw new IllegalArgumentException("invalid size: " + size);
        }
        if (size < this.size) {
            this.size = (int) size;
        }
        if (buffer.position() > size) {
            buffer.position((int) size);
        }
        return this;
    }

    @Override
    public boolean isOpen() {
        return true;
    }

    @Override
    public void close() throws IOException {
        // always open
    }
}
/**
 * Buffers strings as UTF-16 little-endian code units and writes them to the
 * underlying channel when the buffer becomes full or on {@link #flush()}.
 */
public class UnicodeBuffer {
    private final ByteBuffer buffer;
    private final WritableByteChannel channel;

    /**
     * @param channel
     *            destination of the encoded strings
     * @param size
     *            size of the internal buffer in bytes
     */
    public UnicodeBuffer(WritableByteChannel channel, int size) {
        this.channel = channel;
        this.buffer = ByteBuffer.allocate(size);
        buffer.order(ByteOrder.LITTLE_ENDIAN);
    }

    public UnicodeBuffer(WritableByteChannel channel) {
        this(channel, 64 * 1024);
    }

    /**
     * Appends the string to the buffer, writing previously buffered data to
     * the channel if the string does not fit into the remaining space.
     *
     * @param data
     *            string to append
     * @throws IOException
     *             if the write to the channel fails
     * @throws IllegalArgumentException
     *             if the string is larger than the whole buffer
     */
    public void put(String data) throws IOException {
        CharBuffer chars = prepare(data.length());
        chars.put(data);
    }

    /**
     * Reserves space for the given number of UTF-16 code units and returns a
     * view which starts at the reserved region.
     */
    private CharBuffer prepare(int numChars) throws IOException {
        // long arithmetic: numChars * 2 overflows int for very long strings
        long byteLength = 2L * numChars;
        if (byteLength > buffer.capacity()) {
            throw new IllegalArgumentException("string length is too long: " + numChars);
        }
        if (buffer.remaining() < byteLength) {
            flush();
        }
        CharBuffer chars = buffer.asCharBuffer();
        buffer.position(buffer.position() + (int) byteLength);
        return chars;
    }

    /**
     * Writes all buffered data to the channel and empties the buffer.
     *
     * @throws IOException
     *             if the write to the channel fails
     */
    public void flush() throws IOException {
        // without flip() the channel would receive the unused tail of the
        // buffer instead of the data written so far
        buffer.flip();
        while (buffer.hasRemaining()) {
            // a channel is allowed to accept only a part of the buffer per call
            channel.write(buffer);
        }
        buffer.clear();
    }
}
b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/UniqueStrings.java new file mode 100644 index 00000000..46ad88b2 --- /dev/null +++ b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/UniqueStrings.java @@ -0,0 +1,157 @@ +package com.worksap.nlp.sudachi.dictionary.build; + +import com.worksap.nlp.sudachi.dictionary.CSVParser; + +import java.io.BufferedReader; +import java.io.IOException; +import java.nio.channels.SeekableByteChannel; +import java.nio.file.Files; +import java.nio.file.Path; +import java.nio.file.Paths; +import java.nio.file.StandardOpenOption; +import java.util.*; + +public class UniqueStrings { + private final HashMap strings = new HashMap<>(); + private final HashMap candidates = new HashMap<>(); + + void add(String data) { + strings.put(data, null); + } + + void compile() { + candidates.clear(); + candidates.put("", new Item("", 0, 0)); + List collect = new ArrayList<>(strings.keySet()); + collect.sort(Comparator.comparingInt(String::length).reversed().thenComparing(String::compareTo)); + for (String str: collect) { + strings.put(str, process(str)); + } + candidates.clear(); + } + + private Item process(String str) { + Item present = candidates.get(str); + if (present != null) { + return present; + } + + int length = str.length(); + int[] offsets = new int[length + 1]; + int numOffsets = computeOffsets(str, offsets); + + for (int i = 0; i < numOffsets; ++i) { + int start = offsets[i]; + for (int j = i + 1; j <= numOffsets; ++j) { + int end = offsets[j]; + String sub = str.substring(start, end); + if (!candidates.containsKey(sub)) { + Item item = new Item(str, start, end); + candidates.put(sub, item); + } + } + } + + return candidates.get(str); + } + + private int computeOffsets(String str, int[] offsets) { + int count = 0; + int len = str.length(); + for (int i = 0; i < len; ++i) { + char ch = str.charAt(i); + if (Character.isLowSurrogate(ch)) { + if (i + 1 < len && Character.isHighSurrogate(str.charAt(i + 1))) { + i += 1; + } + } + 
offsets[count] = i; + count += 1; + } + offsets[count] = len; + return count; + } + + public HashMap getStrings() { + return strings; + } + + public void writeCompact(SeekableByteChannel channel) throws IOException { + UnicodeBuffer buffer = new UnicodeBuffer(channel); + for (Map.Entry item: strings.entrySet()) { + Item value = item.getValue(); + if (value.start == 0 && value.end == value.data.length()) { + buffer.put(value.data); + } + } + buffer.flush(); + } + + public void writeLengthPrefixedCompact(SeekableByteChannel channel) throws IOException { + DicBuffer buf = new DicBuffer(64 * 1024); + for (Map.Entry item: strings.entrySet()) { + Item value = item.getValue(); + String sub = value.data.substring(value.start, value.end); + if (buf.wontFit(sub.length() * 2)) { + buf.consume(channel::write); + } + buf.put(sub); + } + buf.consume(channel::write); + } + + public static class Item { + private final String data; + private final int start; + private final int end; + private Item root; + + public Item(String data, int start, int end) { + this.data = data; + this.start = start; + this.end = end; + } + + public String getData() { + return data; + } + + public int getStart() { + return start; + } + + public int getEnd() { + return end; + } + + public int getLength() { + return end - start; + } + } + + + public static void main(String[] args) throws IOException { + UniqueStrings strings = new UniqueStrings(); + try (BufferedReader reader = Files.newBufferedReader(Paths.get(args[0]))) { + CSVParser parser = new CSVParser(reader); + List record; + while ((record = parser.getNextRecord()) != null) { + strings.add(record.get(0)); + strings.add(record.get(4)); + strings.add(record.get(11)); + strings.add(record.get(12)); + } + } + strings.compile(); + + Path fullName = Paths.get(args[1] + ".lpf"); + try (SeekableByteChannel chan = Files.newByteChannel(fullName, StandardOpenOption.CREATE, StandardOpenOption.WRITE, StandardOpenOption.TRUNCATE_EXISTING)) { + 
strings.writeLengthPrefixedCompact(chan); + } + + Path compactName = Paths.get(args[1] + ".cmp"); + try (SeekableByteChannel chan = Files.newByteChannel(compactName, StandardOpenOption.CREATE, StandardOpenOption.WRITE, StandardOpenOption.TRUNCATE_EXISTING)) { + strings.writeCompact(chan); + } + } +} diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/WordIdResolver.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/WordIdResolver.java index 84a5cef3..54a836cf 100644 --- a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/WordIdResolver.java +++ b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/WordIdResolver.java @@ -16,10 +16,14 @@ package com.worksap.nlp.sudachi.dictionary.build; +import com.worksap.nlp.sudachi.dictionary.Ints; + public interface WordIdResolver { int lookup(String headword, short posId, String reading); - void validate(int wordId); - boolean isUser(); + + default byte parseList(String data, Ints result) { + return 0; + } } diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/WordInfoLayoutFixedWidth.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/WordInfoLayoutFixedWidth.java new file mode 100644 index 00000000..055489a3 --- /dev/null +++ b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/WordInfoLayoutFixedWidth.java @@ -0,0 +1,74 @@ +package com.worksap.nlp.sudachi.dictionary.build; + +import com.worksap.nlp.sudachi.dictionary.Ints; + +import java.io.IOException; +import java.nio.ByteBuffer; +import java.nio.ByteOrder; + +public class WordInfoLayoutFixedWidth { + private final WordIdResolver resolver; + private final ByteBuffer buffer = ByteBuffer.allocate(4 * 1024 * 1024); + private int position; + private Ints aSplits = new Ints(16); + private Ints bSplits = new Ints(16); + private Ints cSplits = new Ints(16); + private Ints wordStructure = new Ints(16); + private Ints wordOffsets = new Ints(0); + + + public WordInfoLayoutFixedWidth(WordIdResolver resolver) { + 
this.resolver = resolver; + buffer.order(ByteOrder.LITTLE_ENDIAN); + } + + public void process(CsvLexicon.WordEntry entry) { + + } + + public int put(CsvLexicon.WordEntry entry) { + int position = this.position + buffer.position(); + int entryPtr = position >>> 3; + buffer.putShort(entry.leftId); + buffer.putShort(entry.rightId); + buffer.putShort(entry.cost); + buffer.putShort(entry.wordInfo.getPOSId()); + // 8 bytes + buffer.putInt(0); // surfacePtr + buffer.putInt(0); // readingPtr + buffer.putInt(entryPtr); // write normalized entry pointer in second pass + buffer.putInt(entryPtr); // write dictionary form entry pointer in second pass + // 8 + 16 = 24 bytes + + byte aSplitLen = resolver.parseList(entry.aUnitSplitString, aSplits); + byte bSplitLen = resolver.parseList(entry.bUnitSplitString, bSplits); + byte cSplitLen = resolver.parseList(entry.cUnitSplitString, cSplits); + byte wordStructureLen = resolver.parseList(entry.wordStructureString, wordStructure); + byte synonymLen = (byte) entry.wordInfo.getSynonymGoupIds().length; + + buffer.putShort(entry.surfaceUtf8Length); + buffer.put(entry.userData.length() != 0 ? 
(byte)0 : (byte)1); + buffer.put(synonymLen); + buffer.put(cSplitLen); + buffer.put(bSplitLen); + buffer.put(aSplitLen); + buffer.put(wordStructureLen); + // 24 + 8 = 32 bytes + + // align to 8 boundary + int currentPosition = buffer.position(); + if ((currentPosition & 0x7) != 0) { + buffer.position((currentPosition & 0xffff_fff8) + 8); + } + + return entryPtr; + } + + public T consume(IOConsumer consumer) throws IOException { + position += buffer.position(); + buffer.flip(); + T result = consumer.accept(buffer); + buffer.clear(); + return result; + } +} diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/WordLayout.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/WordLayout.java new file mode 100644 index 00000000..9b606e3d --- /dev/null +++ b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/WordLayout.java @@ -0,0 +1,39 @@ +package com.worksap.nlp.sudachi.dictionary.build; + +import java.nio.channels.SeekableByteChannel; +import java.util.ArrayList; +import java.util.TreeMap; + +public class WordLayout { + private final ArrayList> items = new ArrayList<>(); + private final TreeMap free = new TreeMap<>(); + + public void add(UniqueStrings.Item item) { + + } + + public void write(SeekableByteChannel channel) { + + } + + public static int requiredAlignment(UniqueStrings.Item item) { + int length = item.getLength(); + return length; + } + + public static boolean isPowerOfTwo(int val) { + return (val & (val - 1)) == 0; + } + + public static int nextPowerOfTwo(int val) { + if (isPowerOfTwo(val)) { + return val; + } + int nlz = Integer.numberOfLeadingZeros(val); + return 1 << (32 - nlz); + } + + public static class FreeSpace { + + } +} diff --git a/src/test/java/com/worksap/nlp/sudachi/dictionary/build/StringPtrTest.kt b/src/test/java/com/worksap/nlp/sudachi/dictionary/build/StringPtrTest.kt new file mode 100644 index 00000000..cef4f51a --- /dev/null +++ b/src/test/java/com/worksap/nlp/sudachi/dictionary/build/StringPtrTest.kt @@ -0,0 
+1,34 @@ +package com.worksap.nlp.sudachi.dictionary.build + +import org.junit.Test +import kotlin.test.assertEquals + + +class StringPtrTest { + + @Test + fun additionalBits() { + assertEquals(0, StringPtr.unsafe(0, 0).additionalBits()) + assertEquals(1, StringPtr.unsafe(22, 0).additionalBits()) + } + + @Test + fun lengthEncode() { + assertEquals(0, StringPtr.unsafe(0, 0).encode()) + assertEquals(0b00001000_00000000_00000000_00000000, StringPtr.unsafe(1, 0).encode()) + } + + @Test + fun decodeMaxLength() { + val encoded = 0b11111111_11111111_00000000_00000000 + val decoded = StringPtr.decode(encoded.toInt()) + assertEquals(StringPtr.MAX_LENGTH, decoded.length) + } + + @Test + fun encodeMaxLength() { + val decoded = StringPtr.unsafe(StringPtr.MAX_LENGTH, 0) + val encoded = 0b11111111_11111111_00000000_00000000 + assertEquals(encoded.toInt(), decoded.encode()) + } +} \ No newline at end of file diff --git a/src/test/java/com/worksap/nlp/sudachi/dictionary/build/UniqueStringsTest.kt b/src/test/java/com/worksap/nlp/sudachi/dictionary/build/UniqueStringsTest.kt new file mode 100644 index 00000000..75714f77 --- /dev/null +++ b/src/test/java/com/worksap/nlp/sudachi/dictionary/build/UniqueStringsTest.kt @@ -0,0 +1,33 @@ +package com.worksap.nlp.sudachi.dictionary.build + +import kotlin.test.Test +import kotlin.test.assertEquals + +class UniqueStringsTest { + + @Test + fun simple() { + val strs = UniqueStrings() + strs.add("test") + strs.add("es") + strs.compile() + val data = strs.strings; + assertEquals(2, data.size) + assertEquals(1, data["es"]?.start) + assertEquals(3, data["es"]?.end) + } + + @Test + fun oneChar() { + val strs = UniqueStrings() + strs.add("x") + strs.add("y") + strs.compile() + val data = strs.strings + assertEquals(2, data.size) + assertEquals(0, data["x"]?.start) + assertEquals(1, data["x"]?.end) + assertEquals(0, data["y"]?.start) + assertEquals(1, data["y"]?.end) + } +} \ No newline at end of file From ef9f82148f0e410431e91229df0c0a2ffbbdbaad Mon 
Sep 17 00:00:00 2001 From: Arseny Tolmachev Date: Mon, 28 Mar 2022 18:25:05 +0900 Subject: [PATCH 03/94] working encode-decode for string pointers --- .../sudachi/dictionary/build/StringPtr.java | 48 +++++++++++++++---- .../sudachi/dictionary/build/StringPtrTest.kt | 38 ++++++++++++++- 2 files changed, 77 insertions(+), 9 deletions(-) diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/StringPtr.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/StringPtr.java index d28dda8a..4621fa75 100644 --- a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/StringPtr.java +++ b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/StringPtr.java @@ -1,5 +1,8 @@ package com.worksap.nlp.sudachi.dictionary.build; +import java.util.Objects; +import java.util.StringJoiner; + public class StringPtr { public static final int MAX_LENGTH_BITS = 11; public static final int BASE_OFFSET = 32 - 5; @@ -32,12 +35,12 @@ public static StringPtr decode(int pointer) { int shift = Math.max(0, base - MAX_SIMPLE_LENGTH); // max value = 12 // 16 - lower ignored bits, followed by max 11 additional bits of length int nonFixedLength = (pointer & 0x07ff_0000) >>> (16 + 12 - shift); - // compute implicit bit, because first additional bit is not stored - int implicitBit = 0x8000_0000 >>> 32 - shift; + // compute the non-stored first bit which is implicitly one + int implicitBit = (1 << 12) >>> 13 - shift; int finalLength = (base - shift) + (nonFixedLength | implicitBit); - int fixedShift = shift - 1; + int fixedShift = Math.max(shift - 1, 0); int offset = (pointer & (0x07ff_ffff >>> fixedShift)) << fixedShift; - return new StringPtr(finalLength, offset); + return unsafe(finalLength, offset); } public int additionalBits() { @@ -45,15 +48,44 @@ public int additionalBits() { return 0; } int remaining = length - MAX_SIMPLE_LENGTH; - int firstOne = 32 - Integer.numberOfLeadingZeros(remaining); - return firstOne - 1; + return 32 - Integer.numberOfLeadingZeros(remaining); } 
public int encode() { int addBits = additionalBits(); int baseLength = Math.min(length, MAX_SIMPLE_LENGTH); - int remainingLength = length - baseLength; int basePart = (addBits + baseLength) << BASE_OFFSET; - return basePart; + + int remainingLength = length - baseLength; + int implicitBit = (1 << 12) >>> (13 - addBits); + int nonFixedLength = remainingLength ^ implicitBit; + int lengthPart = nonFixedLength << 16 + 12 - addBits; + + int offsetPart = offset >>> Math.max(addBits - 1, 0); + assert (basePart & lengthPart) == 0; + assert (basePart & offsetPart) == 0; + assert (lengthPart & offsetPart) == 0; + return basePart | lengthPart | offsetPart; + } + + @Override + public boolean equals(Object o) { + if (this == o) return true; + if (o == null || getClass() != o.getClass()) return false; + StringPtr stringPtr = (StringPtr) o; + return length == stringPtr.length && offset == stringPtr.offset; + } + + @Override + public int hashCode() { + return Objects.hash(length, offset); + } + + @Override + public String toString() { + return new StringJoiner(", ", StringPtr.class.getSimpleName() + "[", "]") + .add("length=" + length) + .add("offset=" + offset) + .toString(); } } diff --git a/src/test/java/com/worksap/nlp/sudachi/dictionary/build/StringPtrTest.kt b/src/test/java/com/worksap/nlp/sudachi/dictionary/build/StringPtrTest.kt index cef4f51a..dad27754 100644 --- a/src/test/java/com/worksap/nlp/sudachi/dictionary/build/StringPtrTest.kt +++ b/src/test/java/com/worksap/nlp/sudachi/dictionary/build/StringPtrTest.kt @@ -9,7 +9,7 @@ class StringPtrTest { @Test fun additionalBits() { assertEquals(0, StringPtr.unsafe(0, 0).additionalBits()) - assertEquals(1, StringPtr.unsafe(22, 0).additionalBits()) + assertEquals(2, StringPtr.unsafe(22, 0).additionalBits()) } @Test @@ -31,4 +31,40 @@ class StringPtrTest { val encoded = 0b11111111_11111111_00000000_00000000 assertEquals(encoded.toInt(), decoded.encode()) } + + private fun checkConversion(length: Int, offset: Int) { + val 
original = StringPtr.unsafe(length, offset) + val encoded = original.encode() + val decoded = StringPtr.decode(encoded) + assertEquals(original, decoded, "conversion failed, encoded value = %08x".format(encoded)) + } + + @Test + fun decodeEncodeMaxSimple() { + checkConversion(19, 0x07ff_ffff) + } + + @Test + fun decodeEncodeSimple() { + checkConversion(5, 10) + checkConversion(1, 10) + checkConversion(19, 10) + } + + @Test + fun decodeEncodeAddLength() { + // low offset bits must be aligned for large lengths + checkConversion(19 + 0b00000000_000000001, 0x07ff_ffff xor ((1 shl 0) - 1)) + checkConversion(19 + 0b00000000_000000011, 0x07ff_ffff xor ((1 shl 1) - 1)) + checkConversion(19 + 0b00000000_000000111, 0x07ff_ffff xor ((1 shl 2) - 1)) + checkConversion(19 + 0b00000000_000001111, 0x07ff_ffff xor ((1 shl 3) - 1)) + checkConversion(19 + 0b00000000_000011111, 0x07ff_ffff xor ((1 shl 4) - 1)) + checkConversion(19 + 0b00000000_000111111, 0x07ff_ffff xor ((1 shl 5) - 1)) + checkConversion(19 + 0b00000000_001111111, 0x07ff_ffff xor ((1 shl 6) - 1)) + checkConversion(19 + 0b00000000_011111111, 0x07ff_ffff xor ((1 shl 7) - 1)) + checkConversion(19 + 0b00000000_111111111, 0x07ff_ffff xor ((1 shl 8) - 1)) + checkConversion(19 + 0b00000001_111111111, 0x07ff_ffff xor ((1 shl 9) - 1)) + checkConversion(19 + 0b00000011_111111111, 0x07ff_ffff xor ((1 shl 10) - 1)) + checkConversion(19 + 0b00000111_111111111, 0x07ff_ffff xor ((1 shl 11) - 1)) + } } \ No newline at end of file From 1d2574778b791b521de47e8b33ef923d2e9461b2 Mon Sep 17 00:00:00 2001 From: Arseny Tolmachev Date: Wed, 6 Apr 2022 19:13:05 +0900 Subject: [PATCH 04/94] layout strings with alignment awareness --- .../dictionary/{build => }/StringPtr.java | 64 +++++--- .../sudachi/dictionary/build/DicStrings.java | 23 --- .../dictionary/build/UnicodeBuffer.java | 8 +- .../build/UnicodeBufferResizeable.java | 49 ++++++ .../dictionary/build/UniqueStrings.java | 27 ++-- .../sudachi/dictionary/build/WordLayout.java | 140 
++++++++++++++++-- .../dictionary/{build => }/StringPtrTest.kt | 23 ++- 7 files changed, 264 insertions(+), 70 deletions(-) rename src/main/java/com/worksap/nlp/sudachi/dictionary/{build => }/StringPtr.java (61%) delete mode 100644 src/main/java/com/worksap/nlp/sudachi/dictionary/build/DicStrings.java create mode 100644 src/main/java/com/worksap/nlp/sudachi/dictionary/build/UnicodeBufferResizeable.java rename src/test/java/com/worksap/nlp/sudachi/dictionary/{build => }/StringPtrTest.kt (74%) diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/StringPtr.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/StringPtr.java similarity index 61% rename from src/main/java/com/worksap/nlp/sudachi/dictionary/build/StringPtr.java rename to src/main/java/com/worksap/nlp/sudachi/dictionary/StringPtr.java index 4621fa75..c56123d6 100644 --- a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/StringPtr.java +++ b/src/main/java/com/worksap/nlp/sudachi/dictionary/StringPtr.java @@ -1,12 +1,12 @@ -package com.worksap.nlp.sudachi.dictionary.build; +package com.worksap.nlp.sudachi.dictionary; import java.util.Objects; import java.util.StringJoiner; public class StringPtr { - public static final int MAX_LENGTH_BITS = 11; + public static final int MAX_LENGTH_BITS = 12; public static final int BASE_OFFSET = 32 - 5; - public static final int MAX_SIMPLE_LENGTH = 31 - MAX_LENGTH_BITS - 1; + public static final int MAX_SIMPLE_LENGTH = 31 - MAX_LENGTH_BITS; public static final int MAX_LENGTH = 4095 + MAX_SIMPLE_LENGTH; private final int length; @@ -21,29 +21,29 @@ public static StringPtr unsafe(int length, int offset) { return new StringPtr(length, offset); } - public int getLength() { - return length; - } - - public int getOffset() { - return offset; + public static StringPtr checked(int length, int offset) { + if (!isValid(offset, length)) { + throw new IllegalArgumentException( + String.format("StringPtr is invalid offset=%08x length=%d alignment=%d", offset, 
length, requiredAlignment(length))); + } + return unsafe(length, offset); } public static StringPtr decode(int pointer) { - // first 5 bits are length and marker values for additional + // first 5 bits are length and marker values for additional length bits int base = pointer >>> BASE_OFFSET; // max value = 31 - int shift = Math.max(0, base - MAX_SIMPLE_LENGTH); // max value = 12 + int addBits = Math.max(0, base - MAX_SIMPLE_LENGTH); // max value = 12 // 16 - lower ignored bits, followed by max 11 additional bits of length - int nonFixedLength = (pointer & 0x07ff_0000) >>> (16 + 12 - shift); + int nonFixedLength = (pointer & 0x07ff_0000) >>> (16 + MAX_LENGTH_BITS - addBits); // compute the non-stored first bit which is implicitly one - int implicitBit = (1 << 12) >>> 13 - shift; - int finalLength = (base - shift) + (nonFixedLength | implicitBit); - int fixedShift = Math.max(shift - 1, 0); + int implicitBit = (1 << MAX_LENGTH_BITS) >>> 13 - addBits; + int finalLength = (base - addBits) + (nonFixedLength | implicitBit); + int fixedShift = Math.max(addBits - 1, 0); int offset = (pointer & (0x07ff_ffff >>> fixedShift)) << fixedShift; return unsafe(finalLength, offset); } - public int additionalBits() { + public static int requiredAlignment(int length) { if (length <= MAX_SIMPLE_LENGTH) { return 0; } @@ -51,15 +51,37 @@ public int additionalBits() { return 32 - Integer.numberOfLeadingZeros(remaining); } + static boolean isValid(int offset, int length) { + int alignment = requiredAlignment(length); + if (alignment == 0) { + return true; + } + int alignmentStep = 1 << alignment - 1; + int alignmentMask = alignmentStep - 1; + return (offset & alignmentMask) == 0; + } + + public int getLength() { + return length; + } + + public int getOffset() { + return offset; + } + + public int additionalBits() { + return requiredAlignment(length); + } + public int encode() { int addBits = additionalBits(); int baseLength = Math.min(length, MAX_SIMPLE_LENGTH); int basePart = (addBits + 
baseLength) << BASE_OFFSET; int remainingLength = length - baseLength; - int implicitBit = (1 << 12) >>> (13 - addBits); + int implicitBit = (1 << MAX_LENGTH_BITS) >>> (13 - addBits); int nonFixedLength = remainingLength ^ implicitBit; - int lengthPart = nonFixedLength << 16 + 12 - addBits; + int lengthPart = nonFixedLength << 16 + MAX_LENGTH_BITS - addBits; int offsetPart = offset >>> Math.max(addBits - 1, 0); assert (basePart & lengthPart) == 0; @@ -88,4 +110,10 @@ public String toString() { .add("offset=" + offset) .toString(); } + + public boolean isSubseqValid(int start, int end) { + int realStart = offset + start; + int length = end - start; + return isValid(realStart, length); + } } diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/DicStrings.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/DicStrings.java deleted file mode 100644 index d6b87de2..00000000 --- a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/DicStrings.java +++ /dev/null @@ -1,23 +0,0 @@ -package com.worksap.nlp.sudachi.dictionary.build; - -import java.util.HashMap; - -public class DicStrings { - private HashMap counts; - - static int length(int pointer) { - int b0 = pointer >>> 24; - int additional = Math.max(0, b0 - (255 - 8)); - int b1 = (pointer & 0x00ffffff) >>> (24 - additional); - return b0 + b1 * 8 - additional; - } - - static int offset(int pointer) { - int b0 = pointer >>> 24; - int additional = Math.max(0, b0 - (255 - 8)); - int mask = 0x00ffffff >>> additional; - int rawOffset = pointer & mask; - return rawOffset << (additional + 1); - } - -} diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/UnicodeBuffer.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/UnicodeBuffer.java index f501b582..088c2132 100644 --- a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/UnicodeBuffer.java +++ b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/UnicodeBuffer.java @@ -21,8 +21,12 @@ public 
UnicodeBuffer(WritableByteChannel channel) { } public void put(String data) throws IOException { - CharBuffer chars = prepare(data.length()); - chars.put(data); + put(data, 0, data.length()); + } + + public void put(String data, int start, int end) throws IOException { + CharBuffer chars = prepare(end - start); + chars.put(data, start, end); } private CharBuffer prepare(int numChars) throws IOException { diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/UnicodeBufferResizeable.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/UnicodeBufferResizeable.java new file mode 100644 index 00000000..545d889c --- /dev/null +++ b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/UnicodeBufferResizeable.java @@ -0,0 +1,49 @@ +package com.worksap.nlp.sudachi.dictionary.build; + +import java.io.IOException; +import java.nio.ByteBuffer; +import java.nio.ByteOrder; +import java.nio.CharBuffer; +import java.nio.channels.SeekableByteChannel; +import java.nio.channels.WritableByteChannel; + +public class UnicodeBufferResizeable { + private ByteBuffer buffer; + + public UnicodeBufferResizeable(int size) { + this.buffer = ByteBuffer.allocate(size); + buffer.order(ByteOrder.LITTLE_ENDIAN); + } + + public UnicodeBufferResizeable() { + this(64 * 1024); + } + + public void put(int offset, String data, int start, int end) { + CharBuffer chars = prepare(offset, end - start); + chars.put(data, start, end); + } + + private CharBuffer prepare(int offset, int numChars) { + buffer.position(offset); + int remaining = buffer.remaining(); + int byteLength = numChars * 2; + while (remaining < byteLength) { + ByteBuffer newBuffer = ByteBuffer.allocate(buffer.capacity() * 2); + newBuffer.order(ByteOrder.LITTLE_ENDIAN); + buffer.flip(); + newBuffer.put(buffer); + buffer = newBuffer; + remaining = newBuffer.remaining(); + } + CharBuffer chars = buffer.asCharBuffer(); + buffer.position(buffer.position() + byteLength); + return chars; + } + + public void 
write(SeekableByteChannel channel) throws IOException { + buffer.flip(); + channel.write(buffer); + buffer.clear(); + } +} diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/UniqueStrings.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/UniqueStrings.java index 46ad88b2..cb09a22c 100644 --- a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/UniqueStrings.java +++ b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/UniqueStrings.java @@ -1,6 +1,7 @@ package com.worksap.nlp.sudachi.dictionary.build; import com.worksap.nlp.sudachi.dictionary.CSVParser; +import com.worksap.nlp.sudachi.dictionary.StringPtr; import java.io.BufferedReader; import java.io.IOException; @@ -14,6 +15,7 @@ public class UniqueStrings { private final HashMap strings = new HashMap<>(); private final HashMap candidates = new HashMap<>(); + private final WordLayout layout = new WordLayout(); void add(String data) { strings.put(data, null); @@ -40,19 +42,29 @@ private Item process(String str) { int[] offsets = new int[length + 1]; int numOffsets = computeOffsets(str, offsets); + Item full = new Item(str, 0, length); + StringPtr ptr = layout.add(full); + full.root = full; + full.ptr = ptr; + candidates.put(str, full); + for (int i = 0; i < numOffsets; ++i) { int start = offsets[i]; for (int j = i + 1; j <= numOffsets; ++j) { int end = offsets[j]; String sub = str.substring(start, end); - if (!candidates.containsKey(sub)) { + // Create a possible substring only if + // 1. It does not exist yet + // 2. 
Can form a valid pointer to it + if (!candidates.containsKey(sub) && ptr.isSubseqValid(start, end)) { Item item = new Item(str, start, end); + item.root = full; candidates.put(sub, item); } } } - return candidates.get(str); + return full; } private int computeOffsets(String str, int[] offsets) { @@ -77,14 +89,7 @@ public HashMap getStrings() { } public void writeCompact(SeekableByteChannel channel) throws IOException { - UnicodeBuffer buffer = new UnicodeBuffer(channel); - for (Map.Entry item: strings.entrySet()) { - Item value = item.getValue(); - if (value.start == 0 && value.end == value.data.length()) { - buffer.put(value.data); - } - } - buffer.flush(); + layout.write(channel); } public void writeLengthPrefixedCompact(SeekableByteChannel channel) throws IOException { @@ -105,6 +110,7 @@ public static class Item { private final int start; private final int end; private Item root; + private StringPtr ptr; public Item(String data, int start, int end) { this.data = data; @@ -153,5 +159,6 @@ public static void main(String[] args) throws IOException { try (SeekableByteChannel chan = Files.newByteChannel(compactName, StandardOpenOption.CREATE, StandardOpenOption.WRITE, StandardOpenOption.TRUNCATE_EXISTING)) { strings.writeCompact(chan); } + System.out.printf("wasted bytes=%d, slots=%d%n", strings.layout.wastedBytes(), strings.layout.numSlots()); } } diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/WordLayout.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/WordLayout.java index 9b606e3d..2a8d0986 100644 --- a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/WordLayout.java +++ b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/WordLayout.java @@ -1,39 +1,147 @@ package com.worksap.nlp.sudachi.dictionary.build; +import com.worksap.nlp.sudachi.dictionary.StringPtr; + +import java.io.IOException; import java.nio.channels.SeekableByteChannel; import java.util.ArrayList; -import java.util.TreeMap; +import java.util.Collections; 
public class WordLayout { - private final ArrayList> items = new ArrayList<>(); - private final TreeMap free = new TreeMap<>(); + private final UnicodeBufferResizeable buffer = new UnicodeBufferResizeable(); + private final ArrayList free = new ArrayList<>(); + private boolean freeDirty = false; + private int pointer; + private int maxLength = -1; - public void add(UniqueStrings.Item item) { + public StringPtr add(UniqueStrings.Item item) { + int length = item.getLength(); + int alignment = StringPtr.requiredAlignment(length); + int offset = allocate(length, alignment); + buffer.put(offset, item.getData(), item.getStart(), item.getEnd()); + return StringPtr.checked(length, offset); + } + private int allocateAligned(int length, int alignment, int start, int end) { + int requiredAlignment = Math.max(0, alignment - 1); + int alignmentStep = 1 << requiredAlignment; + int alignmentMask = alignmentStep - 1; + int alignedStart = start & ~alignmentMask; + boolean isAligned = alignedStart == start; + if (!isAligned) { + alignedStart += alignmentStep; + } + int available = end - alignedStart; + if (available < length) { + return -1; + } + if (!isAligned) { + int padding = alignedStart - start; + assert padding > 0; + free.add(new FreeSpace(start, padding)); + freeDirty = true; + maxLength = Math.max(maxLength, padding - (start & alignmentMask)); + } + return alignedStart; } - public void write(SeekableByteChannel channel) { + private int allocate(int length, int alignment) { + if (length <= maxLength) { + if (freeDirty) { + freeDirty = false; + Collections.sort(free); + } + int startIdx = Collections.binarySearch(free, new FreeSpace(0, length)); + if (startIdx < 0) { + startIdx = -startIdx - 1; + } + + int numFree = free.size(); + for (int i = startIdx; i < numFree; ++i) { + FreeSpace fs = free.get(i); + if (fs.length < length) { + continue; + } + int end = fs.start + fs.length; + int start = allocateAligned(length, alignment, fs.start, end); + if (start != -1) { + int 
remaining = end - start - length; + if (remaining > 0) { + fs.start = start + length; + fs.length = remaining; + freeDirty = true; + maxLength = computeNewMaxLength(i); + } else { + free.remove(i); + maxLength = computeNewMaxLength(numFree - 2); + } + return start; + } + } + maxLength = Math.max(0, maxLength - 1); + } + + int alignedStart = allocateAligned(length, alignment, pointer, Integer.MAX_VALUE); + assert alignedStart != -1; + pointer = alignedStart + length; + return alignedStart; } - public static int requiredAlignment(UniqueStrings.Item item) { - int length = item.getLength(); - return length; + private int estimatedLength(int length) { + if (length <= 20) { + return length; + } + int clz = Integer.numberOfLeadingZeros(length); + return 1 << (31 - clz); + } + + private int computeNewMaxLength(int i) { + // assumption: free is sorted ascending my length (except i-th item) + int freeLength = free.size(); + // if new size of free array is 0, then it's -1 + if (freeLength == 0) { + return -1; + } + + int newLength = estimatedLength(free.get(i).length); + // more than 1 element: maximum of newly computed length and the previous value + if (freeLength > 1) { + FreeSpace prevSpace = free.get(i - 1); + int fixedLength = estimatedLength(prevSpace.length); + return Math.max(fixedLength, newLength); + } else { + // otherwise, simply new value + return newLength; + } } - public static boolean isPowerOfTwo(int val) { - return (val & (val - 1)) == 0; + public void write(SeekableByteChannel channel) throws IOException { + buffer.write(channel); } - public static int nextPowerOfTwo(int val) { - if (isPowerOfTwo(val)) { - return val; + public static class FreeSpace implements Comparable { + int start; + int length; + + public FreeSpace(int start, int length) { + this.start = start; + this.length = length; + } + + @Override + public int compareTo(FreeSpace o) { + int cval = Integer.compare(length, o.length); + if (cval != 0) return cval; + return Integer.compare(start, 
o.start); } - int nlz = Integer.numberOfLeadingZeros(val); - return 1 << (32 - nlz); } - public static class FreeSpace { + int wastedBytes() { + return free.stream().mapToInt(f -> f.length).sum(); + } + int numSlots() { + return free.size(); } } diff --git a/src/test/java/com/worksap/nlp/sudachi/dictionary/build/StringPtrTest.kt b/src/test/java/com/worksap/nlp/sudachi/dictionary/StringPtrTest.kt similarity index 74% rename from src/test/java/com/worksap/nlp/sudachi/dictionary/build/StringPtrTest.kt rename to src/test/java/com/worksap/nlp/sudachi/dictionary/StringPtrTest.kt index dad27754..6e34f1d4 100644 --- a/src/test/java/com/worksap/nlp/sudachi/dictionary/build/StringPtrTest.kt +++ b/src/test/java/com/worksap/nlp/sudachi/dictionary/StringPtrTest.kt @@ -1,7 +1,9 @@ -package com.worksap.nlp.sudachi.dictionary.build +package com.worksap.nlp.sudachi.dictionary import org.junit.Test import kotlin.test.assertEquals +import kotlin.test.assertFalse +import kotlin.test.assertTrue class StringPtrTest { @@ -67,4 +69,23 @@ class StringPtrTest { checkConversion(19 + 0b00000011_111111111, 0x07ff_ffff xor ((1 shl 10) - 1)) checkConversion(19 + 0b00000111_111111111, 0x07ff_ffff xor ((1 shl 11) - 1)) } + + @Test + fun isValid() { + assertTrue { StringPtr.isValid(0, 0) } + assertTrue { StringPtr.isValid(1, 0) } + assertTrue { StringPtr.isValid(0, 1) } + assertTrue { StringPtr.isValid(1, 1) } + assertTrue { StringPtr.isValid(0, 19) } + assertTrue { StringPtr.isValid(1, 19) } + assertTrue { StringPtr.isValid(0, 20) } + assertTrue { StringPtr.isValid(1, 20) } + assertTrue { StringPtr.isValid(0, 21) } + assertFalse { StringPtr.isValid(1, 21) } + assertTrue { StringPtr.isValid(2, 21) } + assertTrue { StringPtr.isValid(0, 23) } + assertFalse { StringPtr.isValid(1, 23) } + assertFalse { StringPtr.isValid(2, 23) } + assertTrue { StringPtr.isValid(4, 23) } + } } \ No newline at end of file From 0cab6243085b1b87b2399be62ccd80392bef5fdb Mon Sep 17 00:00:00 2001 From: Arseny Tolmachev Date: 
Thu, 7 Apr 2022 17:50:34 +0900 Subject: [PATCH 05/94] string layout now really works --- .gitignore | 1 + .../dictionary/build/InMemoryChannel.java | 8 ++++ .../build/UnicodeBufferResizeable.java | 7 ++-- .../dictionary/build/UniqueStrings.java | 5 ++- .../sudachi/dictionary/build/WordLayout.java | 16 +++++--- .../dictionary/build/WordLayoutTest.kt | 37 +++++++++++++++++++ 6 files changed, 63 insertions(+), 11 deletions(-) create mode 100644 src/test/java/com/worksap/nlp/sudachi/dictionary/build/WordLayoutTest.kt diff --git a/.gitignore b/.gitignore index f0c3a2e2..8bc131aa 100644 --- a/.gitignore +++ b/.gitignore @@ -122,6 +122,7 @@ settings.xml /build/ /.gradle/ +/out/ !gradle/wrapper/gradle-wrapper.jar out/ diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/InMemoryChannel.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/InMemoryChannel.java index 52e3dd84..a936b0eb 100644 --- a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/InMemoryChannel.java +++ b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/InMemoryChannel.java @@ -76,4 +76,12 @@ public boolean isOpen() { public void close() throws IOException { // always open } + + public ByteBuffer buffer() { + ByteBuffer copy = buffer.duplicate(); + copy.position(0); + copy.limit(buffer.position()); + copy.order(ByteOrder.LITTLE_ENDIAN); + return copy; + } } diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/UnicodeBufferResizeable.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/UnicodeBufferResizeable.java index 545d889c..292fc8bb 100644 --- a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/UnicodeBufferResizeable.java +++ b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/UnicodeBufferResizeable.java @@ -25,7 +25,7 @@ public void put(int offset, String data, int start, int end) { } private CharBuffer prepare(int offset, int numChars) { - buffer.position(offset); + buffer.position(offset * 2); int remaining = buffer.remaining(); 
int byteLength = numChars * 2; while (remaining < byteLength) { @@ -41,8 +41,9 @@ private CharBuffer prepare(int offset, int numChars) { return chars; } - public void write(SeekableByteChannel channel) throws IOException { - buffer.flip(); + public void write(WritableByteChannel channel, int limit) throws IOException { + buffer.position(0); + buffer.limit(limit); channel.write(buffer); buffer.clear(); } diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/UniqueStrings.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/UniqueStrings.java index cb09a22c..06187d89 100644 --- a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/UniqueStrings.java +++ b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/UniqueStrings.java @@ -6,6 +6,7 @@ import java.io.BufferedReader; import java.io.IOException; import java.nio.channels.SeekableByteChannel; +import java.nio.channels.WritableByteChannel; import java.nio.file.Files; import java.nio.file.Path; import java.nio.file.Paths; @@ -42,8 +43,8 @@ private Item process(String str) { int[] offsets = new int[length + 1]; int numOffsets = computeOffsets(str, offsets); + StringPtr ptr = layout.add(str, 0, length); Item full = new Item(str, 0, length); - StringPtr ptr = layout.add(full); full.root = full; full.ptr = ptr; candidates.put(str, full); @@ -88,7 +89,7 @@ public HashMap getStrings() { return strings; } - public void writeCompact(SeekableByteChannel channel) throws IOException { + public void writeCompact(WritableByteChannel channel) throws IOException { layout.write(channel); } diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/WordLayout.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/WordLayout.java index 2a8d0986..e4a4e08c 100644 --- a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/WordLayout.java +++ b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/WordLayout.java @@ -3,7 +3,7 @@ import com.worksap.nlp.sudachi.dictionary.StringPtr; import 
java.io.IOException; -import java.nio.channels.SeekableByteChannel; +import java.nio.channels.WritableByteChannel; import java.util.ArrayList; import java.util.Collections; @@ -14,11 +14,15 @@ public class WordLayout { private int pointer; private int maxLength = -1; - public StringPtr add(UniqueStrings.Item item) { - int length = item.getLength(); + public StringPtr add(String string) { + return add(string, 0, string.length()); + } + + public StringPtr add(String string, int start, int end) { + int length = string.length(); int alignment = StringPtr.requiredAlignment(length); int offset = allocate(length, alignment); - buffer.put(offset, item.getData(), item.getStart(), item.getEnd()); + buffer.put(offset, string, start, end); return StringPtr.checked(length, offset); } @@ -116,8 +120,8 @@ private int computeNewMaxLength(int i) { } } - public void write(SeekableByteChannel channel) throws IOException { - buffer.write(channel); + public void write(WritableByteChannel channel) throws IOException { + buffer.write(channel, pointer * 2); } public static class FreeSpace implements Comparable { diff --git a/src/test/java/com/worksap/nlp/sudachi/dictionary/build/WordLayoutTest.kt b/src/test/java/com/worksap/nlp/sudachi/dictionary/build/WordLayoutTest.kt new file mode 100644 index 00000000..d6c3278e --- /dev/null +++ b/src/test/java/com/worksap/nlp/sudachi/dictionary/build/WordLayoutTest.kt @@ -0,0 +1,37 @@ +package com.worksap.nlp.sudachi.dictionary.build + +import com.worksap.nlp.sudachi.dictionary.StringPtr +import java.nio.CharBuffer +import kotlin.test.Test +import kotlin.test.assertEquals + +class WordLayoutTest { + companion object { + fun CharBuffer.read(ptr: StringPtr): String { + return substring(ptr.offset, ptr.offset + ptr.length) + } + } + + + @Test + fun alignmentBasedPlacement() { + val layout = WordLayout() + val p1 = layout.add("0".repeat(25)) + val p2 = layout.add("1".repeat(23)) + val p3 = layout.add("2".repeat(15)) + val p4 = layout.add("3".repeat(3)) + 
val p5 = layout.add("4".repeat(1)) + val p6 = layout.add("5".repeat(2)) + val chan = InMemoryChannel() + layout.write(chan) + val chars = chan.buffer().asCharBuffer() + assertEquals("0".repeat(25), chars.read(p1)) + assertEquals("1".repeat(23), chars.read(p2)) + assertEquals("2".repeat(15), chars.read(p3)) + assertEquals("3".repeat(3), chars.read(p4)) + assertEquals("4".repeat(1), chars.read(p5)) + assertEquals("5".repeat(2), chars.read(p6)) + assert(p5.offset < p2.offset) + assert(p6.offset < p2.offset) + } +} \ No newline at end of file From 3e991d875ea415c161dbd7ec5c9e9c99c21dcc0f Mon Sep 17 00:00:00 2001 From: Arseny Tolmachev Date: Fri, 8 Apr 2022 16:54:42 +0900 Subject: [PATCH 06/94] add documentation for WordLayout --- .../sudachi/dictionary/build/WordLayout.java | 101 +++++++++++++++--- .../dictionary/build/WordLayoutTest.kt | 62 +++++++++++ 2 files changed, 147 insertions(+), 16 deletions(-) diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/WordLayout.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/WordLayout.java index e4a4e08c..23a8e3d0 100644 --- a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/WordLayout.java +++ b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/WordLayout.java @@ -6,7 +6,25 @@ import java.nio.channels.WritableByteChannel; import java.util.ArrayList; import java.util.Collections; - +import java.util.StringJoiner; + +/** + *

+ * Lays out dictionary words so that they will form correct {@link StringPtr} instances. + * That means taking into account the required alignment for strings with larger sizes. + * Aligning strings produces wasted space in the form of padding, which is tracked using a free-list approach. + * + *

+ * The main API is the {@link #add(String)} method, which should be called for all strings. + * The method should be called for strings sorted in descending order by length; otherwise, padding between aligned strings + * would not be utilized correctly. + * The returned {@link StringPtr}s will be correct in any case. + * + *

+ * The current implementation is relatively fast, but can be made even faster by using sorted multiset collection. + * JVM standard library does not have one, so the current implementation sorts free list while keeping track if the sort + * is needed and guarding against relatively expensive checking free lists with additional conditions. + */ public class WordLayout { private final UnicodeBufferResizeable buffer = new UnicodeBufferResizeable(); private final ArrayList free = new ArrayList<>(); @@ -26,7 +44,20 @@ public StringPtr add(String string, int start, int end) { return StringPtr.checked(length, offset); } - private int allocateAligned(int length, int alignment, int start, int end) { + /** + * Allocates a segment of utf-16 code units in a specified block, taking in account requested alignment. + * + * Alignment can skip some space in the beginning of the block because of padding. + * That space will be placed into free lists. + * Nothing will be placed in the free lists if the allocation is not possible. + * + * @param length requested length of segment + * @param alignment requested alignment of segment + * @param start start of the block of memory to use + * @param end end of the block of memory to use + * @return offset of the aligned data or -1 if allocation is impossible + */ + private int allocateInBlock(int length, int alignment, int start, int end) { int requiredAlignment = Math.max(0, alignment - 1); int alignmentStep = 1 << requiredAlignment; int alignmentMask = alignmentStep - 1; @@ -44,11 +75,25 @@ private int allocateAligned(int length, int alignment, int start, int end) { assert padding > 0; free.add(new FreeSpace(start, padding)); freeDirty = true; - maxLength = Math.max(maxLength, padding - (start & alignmentMask)); + int estimated = availableMaxLength(padding); + maxLength = Math.max(maxLength, estimated); } return alignedStart; } + /** + * Allocates a slot of {@code length} bytes, alignment with {@code alignment}. 
+ * It first considers free slots created by previous allocations, if none is valid. + * + * Current implementation is prone to creating "holes" of 1-length, which are almost impossible to fill + * from the usual dictionaries. + * Most emoji take 2 code units and words which are not substrings of another word are usually longer. + * The current implementation wastes ~32k holes in ~42M dictionary, which is ~0.1% of total space. + * + * @param length number of byte + * @param alignment requested alignment + * @return offset in utf-16 code units to the location of the requested block + */ private int allocate(int length, int alignment) { if (length <= maxLength) { if (freeDirty) { @@ -67,14 +112,17 @@ private int allocate(int length, int alignment) { continue; } int end = fs.start + fs.length; - int start = allocateAligned(length, alignment, fs.start, end); + int start = allocateInBlock(length, alignment, fs.start, end); if (start != -1) { int remaining = end - start - length; if (remaining > 0) { fs.start = start + length; fs.length = remaining; freeDirty = true; - maxLength = computeNewMaxLength(i); + // we need to recompute maxLength only if modifying the last (maximum) element in free lists + if (i == numFree - 1) { + maxLength = computeNewMaxLength(i); + } } else { free.remove(i); maxLength = computeNewMaxLength(numFree - 2); @@ -86,21 +134,34 @@ private int allocate(int length, int alignment) { } - int alignedStart = allocateAligned(length, alignment, pointer, Integer.MAX_VALUE); + int alignedStart = allocateInBlock(length, alignment, pointer, Integer.MAX_VALUE); assert alignedStart != -1; pointer = alignedStart + length; return alignedStart; } - private int estimatedLength(int length) { - if (length <= 20) { + /** + * Returns available max length for a hole + * @param length hole length + * @return length of an element which can be allocated using any alignment + */ + private static int availableMaxLength(int length) { + int simple = StringPtr.MAX_SIMPLE_LENGTH 
+ 1; + if (length <= simple) { return length; } - int clz = Integer.numberOfLeadingZeros(length); - return 1 << (31 - clz); + int clz = Integer.numberOfLeadingZeros(length - simple); + int candidateLength = 1 << (31 - clz); + return Math.max(simple, candidateLength + simple); } - private int computeNewMaxLength(int i) { + /** + * Compute new maximum length which can be handled by free lists. + * Should be called if the last element of free lists was updated. + * @param index index in free lists of the element which needs to be considered + * @return new maximum length that can be handled by free lists + */ + private int computeNewMaxLength(int index) { // assumption: free is sorted ascending my length (except i-th item) int freeLength = free.size(); // if new size of free array is 0, then it's -1 @@ -108,11 +169,11 @@ private int computeNewMaxLength(int i) { return -1; } - int newLength = estimatedLength(free.get(i).length); + int newLength = availableMaxLength(free.get(index).length); // more than 1 element: maximum of newly computed length and the previous value if (freeLength > 1) { - FreeSpace prevSpace = free.get(i - 1); - int fixedLength = estimatedLength(prevSpace.length); + FreeSpace prevSpace = free.get(index - 1); + int fixedLength = availableMaxLength(prevSpace.length); return Math.max(fixedLength, newLength); } else { // otherwise, simply new value @@ -135,10 +196,18 @@ public FreeSpace(int start, int length) { @Override public int compareTo(FreeSpace o) { - int cval = Integer.compare(length, o.length); - if (cval != 0) return cval; + int comparison = Integer.compare(length, o.length); + if (comparison != 0) return comparison; return Integer.compare(start, o.start); } + + @Override + public String toString() { + return new StringJoiner(", ", FreeSpace.class.getSimpleName() + "[", "]") + .add("start=" + start) + .add("length=" + length) + .toString(); + } } int wastedBytes() { diff --git 
a/src/test/java/com/worksap/nlp/sudachi/dictionary/build/WordLayoutTest.kt b/src/test/java/com/worksap/nlp/sudachi/dictionary/build/WordLayoutTest.kt index d6c3278e..1cf108fa 100644 --- a/src/test/java/com/worksap/nlp/sudachi/dictionary/build/WordLayoutTest.kt +++ b/src/test/java/com/worksap/nlp/sudachi/dictionary/build/WordLayoutTest.kt @@ -4,6 +4,7 @@ import com.worksap.nlp.sudachi.dictionary.StringPtr import java.nio.CharBuffer import kotlin.test.Test import kotlin.test.assertEquals +import kotlin.test.assertNotNull class WordLayoutTest { companion object { @@ -34,4 +35,65 @@ class WordLayoutTest { assert(p5.offset < p2.offset) assert(p6.offset < p2.offset) } + + @Test + fun alignmentPlacedPlacementLarge() { + val layout = WordLayout() + val ptrs = ArrayList() + for (i in 0..499) { + val char = 500 - i + val str = char.toChar().toString().repeat(char) + ptrs.add(layout.add(str)) + } + val chan = InMemoryChannel() + layout.write(chan) + val chars = chan.buffer().asCharBuffer() + for (i in 0..499) { + val char = 500 - i + val expected = char.toChar().toString().repeat(char) + val actual = chars.read(ptrs[i]) + assertEquals(expected, actual) + } + } + + @Test + fun alignmentPlacedPlacementHoles() { + val layout = WordLayout() + val ptrs = ArrayList() + for (i in 0..3) { + val count = 200 - 5 * i + val str = i.toChar().toString().repeat(count) + ptrs.add(layout.add(str)) + } + for (i in 0..20) { + val count = 21 - i + val str = (20 + i).toChar().toString().repeat(count) + ptrs.add(layout.add(str)) + } + val chan = InMemoryChannel() + layout.write(chan) + val chars = chan.buffer().asCharBuffer() + for (i in 0..3) { + val count = 200 - 5 * i + val char = i.toChar() + val expected = char.toString().repeat(count) + val actual = chars.read(ptrs[i]) + assertEquals(expected, actual) + } + for (i in 0..20) { + val count = 21 - i + val char = (20 + i).toChar() + val expected = char.toString().repeat(count) + val actual = chars.read(ptrs[4 + i]) + assertEquals(expected, 
actual) + } + } + + @Test + fun coverage() { + val layout = WordLayout() + assertEquals(0, layout.wastedBytes()) + assertEquals(0, layout.numSlots()) + assertNotNull(layout.toString()) + } } \ No newline at end of file From 8e9b33ad382efa3069d29cef451f5640e5340125 Mon Sep 17 00:00:00 2001 From: Arseny Tolmachev Date: Fri, 6 May 2022 07:18:48 +0900 Subject: [PATCH 07/94] binary dictionary wip --- .../com/worksap/nlp/sudachi/StringUtil.java | 35 ++++++ .../worksap/nlp/sudachi/dictionary/Ints.java | 22 ++++ .../nlp/sudachi/dictionary/StringPtr.java | 9 ++ .../nlp/sudachi/dictionary/WordInfo.java | 47 +++++++ .../nlp/sudachi/dictionary/build/Align.java | 30 +++++ .../sudachi/dictionary/build/CsvLexicon.java | 31 ++++- .../nlp/sudachi/dictionary/build/Lookup2.java | 35 ++++++ .../sudachi/dictionary/build/StringIndex.java | 13 ++ ...{UniqueStrings.java => StringStorage.java} | 9 +- .../build/WordInfoLayoutFixedWidth.java | 97 +++++++++++---- .../nlp/sudachi/dictionary/build/WordRef.java | 116 ++++++++++++++++++ ...queStringsTest.kt => StringStorageTest.kt} | 6 +- 12 files changed, 419 insertions(+), 31 deletions(-) create mode 100644 src/main/java/com/worksap/nlp/sudachi/dictionary/build/Align.java create mode 100644 src/main/java/com/worksap/nlp/sudachi/dictionary/build/Lookup2.java create mode 100644 src/main/java/com/worksap/nlp/sudachi/dictionary/build/StringIndex.java rename src/main/java/com/worksap/nlp/sudachi/dictionary/build/{UniqueStrings.java => StringStorage.java} (95%) create mode 100644 src/main/java/com/worksap/nlp/sudachi/dictionary/build/WordRef.java rename src/test/java/com/worksap/nlp/sudachi/dictionary/build/{UniqueStringsTest.kt => StringStorageTest.kt} (87%) diff --git a/src/main/java/com/worksap/nlp/sudachi/StringUtil.java b/src/main/java/com/worksap/nlp/sudachi/StringUtil.java index c5108c79..afef3b88 100644 --- a/src/main/java/com/worksap/nlp/sudachi/StringUtil.java +++ b/src/main/java/com/worksap/nlp/sudachi/StringUtil.java @@ -80,4 +80,39 @@ 
public static ByteBuffer readAllBytes(InputStream inputStream) throws IOExceptio bbuf.limit(offset); return bbuf; } + + public static int count(CharSequence sequence, char toFind) { + return count(sequence, 0, sequence.length(), toFind); + } + + public static int count(CharSequence sequence, int start, int end, char toFind) { + int count = 0; + for (int i = start; i < end; i++) { + char c = sequence.charAt(i); + if (c == toFind) { + count += 1; + } + } + return count; + } + + public static String readLengthPrefixed(ByteBuffer buffer) { + // implementation: use the fact that CharBuffers are CharSequences + // and the fact that ByteBuffer can be used as CharBuffer + // remember buffer state + int limit = buffer.limit(); + int position = buffer.position(); + // read length + short length = buffer.getShort(position); + // compute new buffer state + int newPosition = position + 2; + buffer.position(newPosition); + buffer.limit(newPosition + length * 2); + // use CharBuffer API + String result = buffer.asCharBuffer().toString(); + // restore previous state + buffer.position(position); + buffer.limit(limit); + return result; + } } diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/Ints.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/Ints.java index fb3751e9..ff73f3f4 100644 --- a/src/main/java/com/worksap/nlp/sudachi/dictionary/Ints.java +++ b/src/main/java/com/worksap/nlp/sudachi/dictionary/Ints.java @@ -1,11 +1,17 @@ package com.worksap.nlp.sudachi.dictionary; +import java.nio.ByteBuffer; import java.util.Arrays; public class Ints { private int[] data; private int length; + private Ints(int[] data, int size) { + this.data = data; + this.length = size; + } + public Ints(int capacity) { data = new int[capacity]; length = 0; @@ -44,4 +50,20 @@ public void maybeResize(int additional) { } } + public static Ints wrap(int[] array, int size) { + return new Ints(array, size); + } + + public static Ints wrap(int[] array) { + return new Ints(array, 
array.length); + } + + public static int[] readArray(ByteBuffer buffer, int len) { + int[] result = new int[len]; + for (int i = 0; i < len; ++i) { + result[i] = buffer.getInt(); + } + return result; + } + } diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/StringPtr.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/StringPtr.java index c56123d6..b7adb94a 100644 --- a/src/main/java/com/worksap/nlp/sudachi/dictionary/StringPtr.java +++ b/src/main/java/com/worksap/nlp/sudachi/dictionary/StringPtr.java @@ -22,6 +22,11 @@ public static StringPtr unsafe(int length, int offset) { } public static StringPtr checked(int length, int offset) { + if (length > MAX_LENGTH) { + throw new IllegalArgumentException( + String.format("Maximum possible length is %d, was requested %d", MAX_LENGTH, length) + ); + } if (!isValid(offset, length)) { throw new IllegalArgumentException( String.format("StringPtr is invalid offset=%08x length=%d alignment=%d", offset, length, requiredAlignment(length))); @@ -116,4 +121,8 @@ public boolean isSubseqValid(int start, int end) { int length = end - start; return isValid(realStart, length); } + + public StringPtr subPtr(int start, int end) { + return StringPtr.checked(end - start, offset + start); + } } diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/WordInfo.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/WordInfo.java index 61df1fb5..f69a30d0 100644 --- a/src/main/java/com/worksap/nlp/sudachi/dictionary/WordInfo.java +++ b/src/main/java/com/worksap/nlp/sudachi/dictionary/WordInfo.java @@ -16,6 +16,10 @@ package com.worksap.nlp.sudachi.dictionary; +import com.worksap.nlp.sudachi.StringUtil; + +import java.nio.ByteBuffer; + /** * Informations of the morpheme. 
* @@ -27,6 +31,7 @@ public class WordInfo { private final String surface; private final short headwordLength; private short posId; + private final int normalizedFormWordId; private final String normalizedForm; private final int dictionaryFormWordId; private final String dictionaryForm; @@ -42,6 +47,7 @@ public WordInfo(String surface, short headwordLength, short posId, String normal this.surface = surface; this.headwordLength = headwordLength; this.posId = posId; + this.normalizedFormWordId = 0; this.normalizedForm = normalizedForm; this.dictionaryFormWordId = dictionaryFormWordId; this.dictionaryForm = dictionaryForm; @@ -73,6 +79,7 @@ public WordInfo(String surface, short headwordLength, short posId, String normal this.surface = surface; this.headwordLength = headwordLength; this.posId = posId; + this.normalizedFormWordId = 0; this.normalizedForm = normalizedForm; this.dictionaryFormWordId = -1; this.dictionaryForm = dictionaryForm; @@ -198,8 +205,48 @@ public int[] getWordStructure() { * Returns the array of the synonym groups. * * @return the synonym group IDs of the morpheme + * @deprecated use {@link #getSynonymGroupIds()}, this method has a typo in its name */ + @Deprecated public int[] getSynonymGoupIds() { return synonymGids; } + + /** + * Returns the array of the synonym groups. 
+ * + * @return the synonym group IDs of the morpheme + */ + public int[] getSynonymGroupIds() { + return synonymGids; + } + + public static WordInfo read(ByteBuffer buffer) { + short leftId = buffer.getShort(); + short rightId = buffer.getShort(); + short cost = buffer.getShort(); + short posId = buffer.getShort(); + int surfacePtr = buffer.getInt(); + int readingPtr = buffer.getInt(); + int normFormPtr = buffer.getInt(); + int dicFormPtr = buffer.getInt(); + short utf8Length = buffer.getShort(); + byte cSplitLen = buffer.get(); + byte bSplitLen = buffer.get(); + byte aSplitLen = buffer.get(); + byte wordStructureLen = buffer.get(); + byte synonymLen = buffer.get(); + byte userDataFlag = buffer.get(); + int[] cSplit = Ints.readArray(buffer, cSplitLen); + int[] bSplit = Ints.readArray(buffer, bSplitLen); + int[] aSplit = Ints.readArray(buffer, aSplitLen); + int[] wordStructure = Ints.readArray(buffer, wordStructureLen); + int[] synonyms = Ints.readArray(buffer, synonymLen); + + String userData = ""; + if (userDataFlag != 0) { + userData = StringUtil.readLengthPrefixed(buffer); + } + throw new IllegalArgumentException(); + } } diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/Align.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/Align.java new file mode 100644 index 00000000..b20d5fe1 --- /dev/null +++ b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/Align.java @@ -0,0 +1,30 @@ +package com.worksap.nlp.sudachi.dictionary.build; + +public class Align { + + private Align() {} + + public static boolean isPowerOf2(int value) { + return (value & value - 1) == 0; + } + + /** + * Aligns value to the alignment + * @param value value to be aligned + * @param alignment required alignment as a power of two + * @return aligned value, it should be greater or equal than the passed value + */ + public static int align(int value, int alignment) { + assert isPowerOf2(alignment); + // Compute alignment mask, it is the inverse of the mask for 
the bits that must be 0 for alignment to be correct + // Checking mask is computed as alignment - 1. E.g. 7 for alignment of 8, or 15 for alignment of 16. + // The second one is its inverse. + int mask = -alignment; // same as ~(alignment - 1) + int masked = value & mask; + if (masked == value) { + return value; + } else { + return masked + alignment; + } + } +} diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/CsvLexicon.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/CsvLexicon.java index ae7a0ed3..a2d82f3e 100644 --- a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/CsvLexicon.java +++ b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/CsvLexicon.java @@ -16,6 +16,7 @@ package com.worksap.nlp.sudachi.dictionary.build; +import com.worksap.nlp.sudachi.StringUtil; import com.worksap.nlp.sudachi.WordId; import com.worksap.nlp.sudachi.dictionary.POS; import com.worksap.nlp.sudachi.dictionary.WordInfo; @@ -27,6 +28,7 @@ import java.util.ArrayList; import java.util.Arrays; import java.util.List; +import java.util.Objects; import java.util.regex.Matcher; import java.util.regex.Pattern; @@ -173,7 +175,7 @@ int wordToId(String text) { } void checkSplitInfoFormat(String info) { - if (info.chars().filter(i -> i == '/').count() + 1 > ARRAY_MAX_LENGTH) { + if (StringUtil.count(info, '/') + 1 > ARRAY_MAX_LENGTH) { throw new IllegalArgumentException("too many units"); } } @@ -280,7 +282,8 @@ public void setLimits(int left, int right) { parameters.setLimits(left, right); } - public static class WordEntry { + public static class WordEntry implements Lookup2.Entry { + int pointer; String headword; WordInfo wordInfo; String aUnitSplitString; @@ -295,7 +298,7 @@ public static class WordEntry { int expectedSize = 0; private int countSplits(String data) { - return (int)data.chars().filter(c -> c == '/').count(); + return StringUtil.count(data, '/'); } public int computeExpectedSize() { @@ -310,9 +313,31 @@ public int computeExpectedSize() { 
size += countSplits(cUnitSplitString) * 4; size += countSplits(wordStructureString) * 4; size += wordInfo.getSynonymGoupIds().length * 4; + if (userData.length() != 0) { + size += 2 + userData.length() * 2; + } + + size = Align.align(size, 8); expectedSize = size; return size; } + + @Override + public int pointer() { + return pointer; + } + + @Override + public boolean matches(short posId, String reading) { + return wordInfo.getPOSId() == posId && Objects.equals(wordInfo.getReadingForm(), reading); + } + + @Override + public String headword() { + return wordInfo.getSurface(); + } } + + } diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/Lookup2.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/Lookup2.java new file mode 100644 index 00000000..5d3e6a51 --- /dev/null +++ b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/Lookup2.java @@ -0,0 +1,35 @@ +package com.worksap.nlp.sudachi.dictionary.build; + +import java.util.ArrayList; +import java.util.HashMap; +import java.util.List; +import java.util.Map; + +public class Lookup2 { + public interface Entry { + int pointer(); + boolean matches(short posId, String reading); + String headword(); + } + + public Lookup2(List entries) { + this.entries = entries; + HashMap> result = new HashMap<>(entries.size() * 4 / 3); + for (Entry e : entries) { + List sublist = result.computeIfAbsent(e.headword(), x -> new ArrayList<>()); + sublist.add(e); + } + bySurface = result; + } + + private final List entries; + private final Map> bySurface; + + public Entry byIndex(int index) { + return entries.get(index); + } + + public List byHeadword(String headword) { + return bySurface.get(headword); + } +} diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/StringIndex.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/StringIndex.java new file mode 100644 index 00000000..53c5dff6 --- /dev/null +++ b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/StringIndex.java @@ -0,0 
+1,13 @@ +package com.worksap.nlp.sudachi.dictionary.build; + +import com.worksap.nlp.sudachi.dictionary.StringPtr; + +@FunctionalInterface +public interface StringIndex { + /** + * Produces a StringPtr for a String. + * @param data given String + * @return StringPtr + */ + StringPtr resolve(String data); +} diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/UniqueStrings.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/StringStorage.java similarity index 95% rename from src/main/java/com/worksap/nlp/sudachi/dictionary/build/UniqueStrings.java rename to src/main/java/com/worksap/nlp/sudachi/dictionary/build/StringStorage.java index 06187d89..811b510f 100644 --- a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/UniqueStrings.java +++ b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/StringStorage.java @@ -13,7 +13,7 @@ import java.nio.file.StandardOpenOption; import java.util.*; -public class UniqueStrings { +public class StringStorage { private final HashMap strings = new HashMap<>(); private final HashMap candidates = new HashMap<>(); private final WordLayout layout = new WordLayout(); @@ -85,6 +85,11 @@ private int computeOffsets(String str, int[] offsets) { return count; } + public StringPtr resolve(String data) { + Item item = strings.get(data); + return item.root.ptr.subPtr(item.start, item.end); + } + public HashMap getStrings() { return strings; } @@ -138,7 +143,7 @@ public int getLength() { public static void main(String[] args) throws IOException { - UniqueStrings strings = new UniqueStrings(); + StringStorage strings = new StringStorage(); try (BufferedReader reader = Files.newBufferedReader(Paths.get(args[0]))) { CSVParser parser = new CSVParser(reader); List record; diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/WordInfoLayoutFixedWidth.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/WordInfoLayoutFixedWidth.java index 055489a3..20d3a609 100644 --- 
a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/WordInfoLayoutFixedWidth.java +++ b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/WordInfoLayoutFixedWidth.java @@ -5,21 +5,25 @@ import java.io.IOException; import java.nio.ByteBuffer; import java.nio.ByteOrder; +import java.util.List; public class WordInfoLayoutFixedWidth { - private final WordIdResolver resolver; - private final ByteBuffer buffer = ByteBuffer.allocate(4 * 1024 * 1024); + private final StringIndex index; + private final WordRef.Parser wordRefParser; + private final Lookup2 lookup; + private final ByteBuffer buffer = ByteBuffer.allocate(512 * 1024); private int position; - private Ints aSplits = new Ints(16); - private Ints bSplits = new Ints(16); - private Ints cSplits = new Ints(16); - private Ints wordStructure = new Ints(16); - private Ints wordOffsets = new Ints(0); + private final Ints aSplits = new Ints(16); + private final Ints bSplits = new Ints(16); + private final Ints cSplits = new Ints(16); + private final Ints wordStructure = new Ints(16); - public WordInfoLayoutFixedWidth(WordIdResolver resolver) { - this.resolver = resolver; + public WordInfoLayoutFixedWidth(Lookup2 resolver, StringIndex index, WordRef.Parser parser) { + this.lookup = resolver; + this.index = index; buffer.order(ByteOrder.LITTLE_ENDIAN); + wordRefParser = parser; } public void process(CsvLexicon.WordEntry entry) { @@ -34,36 +38,70 @@ public int put(CsvLexicon.WordEntry entry) { buffer.putShort(entry.cost); buffer.putShort(entry.wordInfo.getPOSId()); // 8 bytes - buffer.putInt(0); // surfacePtr - buffer.putInt(0); // readingPtr - buffer.putInt(entryPtr); // write normalized entry pointer in second pass - buffer.putInt(entryPtr); // write dictionary form entry pointer in second pass + buffer.putInt(index.resolve(entry.wordInfo.getSurface()).encode()); // surfacePtr + buffer.putInt(index.resolve(entry.wordInfo.getReadingForm()).encode()); // readingPtr + int normFormPtr = 
wordRefParser.parse(entry.wordInfo.getNormalizedForm()).resolve(lookup); + int dicFormPtr = wordRefParser.parse(entry.wordInfo.getDictionaryForm()).resolve(lookup); + buffer.putInt(normFormPtr); // normalized entry + buffer.putInt(dicFormPtr); // dictionary form // 8 + 16 = 24 bytes - byte aSplitLen = resolver.parseList(entry.aUnitSplitString, aSplits); - byte bSplitLen = resolver.parseList(entry.bUnitSplitString, bSplits); - byte cSplitLen = resolver.parseList(entry.cUnitSplitString, cSplits); - byte wordStructureLen = resolver.parseList(entry.wordStructureString, wordStructure); - byte synonymLen = (byte) entry.wordInfo.getSynonymGoupIds().length; + byte aSplitLen = parseList(entry.aUnitSplitString, aSplits); + byte bSplitLen = parseList(entry.bUnitSplitString, bSplits); + byte cSplitLen = parseList(entry.cUnitSplitString, cSplits); + byte wordStructureLen = parseList(entry.wordStructureString, wordStructure); + byte synonymLen = (byte) entry.wordInfo.getSynonymGroupIds().length; buffer.putShort(entry.surfaceUtf8Length); - buffer.put(entry.userData.length() != 0 ? (byte)0 : (byte)1); - buffer.put(synonymLen); buffer.put(cSplitLen); buffer.put(bSplitLen); buffer.put(aSplitLen); buffer.put(wordStructureLen); + buffer.put(synonymLen); + int userDataLength = entry.userData.length(); + buffer.put(userDataLength != 0 ? 
(byte)0 : (byte)1); // 24 + 8 = 32 bytes + putInts(cSplits, cSplitLen); + putInts(bSplits, bSplitLen); + putInts(aSplits, aSplitLen); + putInts(wordStructure, wordStructureLen); + putInts(Ints.wrap(entry.wordInfo.getSynonymGroupIds()), synonymLen); + + if (userDataLength != 0) { + buffer.putShort((short) userDataLength); + String userData = entry.userData; + for (int i = 0; i < userDataLength; ++i) { + buffer.putShort((short)userData.charAt(i)); + } + } + // align to 8 boundary int currentPosition = buffer.position(); - if ((currentPosition & 0x7) != 0) { - buffer.position((currentPosition & 0xffff_fff8) + 8); - } + buffer.position(Align.align(currentPosition, 8)); return entryPtr; } + private void putInts(Ints ints, int len) { + for (int i = 0; i < len; ++i) { + buffer.putInt(ints.get(i)); + } + } + + public void fillPointers(ByteBuffer data, List entries, Lookup2 lookup) { + for (int i = 0; i < entries.size(); i++) { + CsvLexicon.WordEntry entry = entries.get(i); + int offset = entry.pointer << 3; + data.position(offset + 8); + + data.putInt(index.resolve(entry.wordInfo.getSurface()).encode()); + data.putInt(index.resolve(entry.wordInfo.getReadingForm()).encode()); + data.putInt(entry.wordInfo.getDictionaryFormWordId()); + //data.putInt(entry.) 
+ } + } + public T consume(IOConsumer consumer) throws IOException { position += buffer.position(); buffer.flip(); @@ -71,4 +109,17 @@ public T consume(IOConsumer consumer) throws IOException { buffer.clear(); return result; } + + byte parseList(String data, Ints result) { + String[] parts = data.split("/"); + if (parts.length > Byte.MAX_VALUE) { + throw new IllegalArgumentException("reference list contained more than 127 entries: " + data); + } + result.clear(); + for (String part : parts) { + WordRef ref = wordRefParser.parse(part); + result.append(ref.resolve(lookup)); + } + return (byte) parts.length; + } } diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/WordRef.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/WordRef.java new file mode 100644 index 00000000..6d5304a3 --- /dev/null +++ b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/WordRef.java @@ -0,0 +1,116 @@ +package com.worksap.nlp.sudachi.dictionary.build; + +import com.worksap.nlp.sudachi.StringUtil; +import com.worksap.nlp.sudachi.dictionary.POS; + +import java.util.Arrays; +import java.util.List; +import java.util.regex.Pattern; + +import static com.worksap.nlp.sudachi.dictionary.build.CsvLexicon.unescape; + +/** + * Reference to a word in the CSV dictionary. 
+ */ +public abstract class WordRef { + public abstract int resolve(Lookup2 resolver); + + public static final class LineNo extends WordRef { + private final int line; + + public LineNo(int line) { + this.line = line; + } + + public int getLine() { + return line; + } + + @Override + public int resolve(Lookup2 resolver) { + return resolver.byIndex(line).pointer(); + } + } + public static final class Headword extends WordRef { + private final String headword; + + public Headword(String headword) { + this.headword = headword; + } + + public String getHeadword() { + return headword; + } + + @Override + public int resolve(Lookup2 resolver) { + List entries = resolver.byHeadword(headword); + return entries.get(0).pointer(); + } + } + + public static final class Triple extends WordRef { + private final String headword; + private final short posId; + private final String reading; + + public Triple(String headword, short posId, String reading) { + this.headword = headword; + this.posId = posId; + this.reading = reading; + } + + public String getHeadword() { + return headword; + } + + public short getPosId() { + return posId; + } + + public String getReading() { + return reading; + } + + @Override + public int resolve(Lookup2 resolver) { + List entries = resolver.byHeadword(headword); + for (Lookup2.Entry entry: entries) { + if (entry.matches(posId, reading)) { + return entry.pointer(); + } + } + return -1; + } + } + + private static final Pattern NUMERIC_RE = Pattern.compile("^U?\\d+$"); + + public static class Parser { + private final POSTable posTable; + + public Parser(POSTable posTable) { + this.posTable = posTable; + } + + public WordRef parse(String text) { + if (NUMERIC_RE.matcher(text).matches()) { + int offset = text.charAt(0) == 'U' ? 
1: 0; + int lineNum = Integer.parseInt(text.substring(offset)); + return new LineNo(lineNum); + } + + if (StringUtil.count(text, ',') == 7) { + String[] cols = text.split(",", 8); + String headword = unescape(cols[0]); + POS pos = new POS(Arrays.copyOfRange(cols, 1, 7)); + short posId = posTable.getId(pos); + String reading = unescape(cols[7]); + return new Triple(headword, posId, reading); + } + + return new Headword(text); + } + + } +} diff --git a/src/test/java/com/worksap/nlp/sudachi/dictionary/build/UniqueStringsTest.kt b/src/test/java/com/worksap/nlp/sudachi/dictionary/build/StringStorageTest.kt similarity index 87% rename from src/test/java/com/worksap/nlp/sudachi/dictionary/build/UniqueStringsTest.kt rename to src/test/java/com/worksap/nlp/sudachi/dictionary/build/StringStorageTest.kt index 75714f77..e7ef37b2 100644 --- a/src/test/java/com/worksap/nlp/sudachi/dictionary/build/UniqueStringsTest.kt +++ b/src/test/java/com/worksap/nlp/sudachi/dictionary/build/StringStorageTest.kt @@ -3,11 +3,11 @@ package com.worksap.nlp.sudachi.dictionary.build import kotlin.test.Test import kotlin.test.assertEquals -class UniqueStringsTest { +class StringStorageTest { @Test fun simple() { - val strs = UniqueStrings() + val strs = StringStorage() strs.add("test") strs.add("es") strs.compile() @@ -19,7 +19,7 @@ class UniqueStringsTest { @Test fun oneChar() { - val strs = UniqueStrings() + val strs = StringStorage() strs.add("x") strs.add("y") strs.compile() From 7750eb8a82f7f350675be858197dbca23c07d21c Mon Sep 17 00:00:00 2001 From: Arseny Tolmachev Date: Fri, 2 Sep 2022 20:30:16 +0900 Subject: [PATCH 08/94] reading raw csv dictionary --- .../nlp/sudachi/dictionary/CSVParser.java | 6 + .../nlp/sudachi/dictionary/CategoryType.java | 8 +- .../worksap/nlp/sudachi/dictionary/Ints.java | 21 ++ .../nlp/sudachi/dictionary/StringPtr.java | 51 ++++- .../nlp/sudachi/dictionary/WordInfo.java | 3 +- .../nlp/sudachi/dictionary/build/Align.java | 42 ++-- 
.../dictionary/build/CsvFieldException.java | 23 +++ .../sudachi/dictionary/build/CsvLexicon.java | 110 +---------- .../sudachi/dictionary/build/DicBuilder.java | 2 +- .../sudachi/dictionary/build/DicBuilder2.java | 46 +++++ .../sudachi/dictionary/build/IOConsumer.java | 19 +- .../dictionary/build/InMemoryChannel.java | 20 +- .../nlp/sudachi/dictionary/build/Lookup2.java | 18 ++ .../sudachi/dictionary/build/Progress.java | 3 + .../sudachi/dictionary/build/RawLexicon.java | 69 +++++++ .../dictionary/build/RawLexiconReader.java | 167 ++++++++++++++++ .../dictionary/build/RawWordEntry.java | 95 +++++++++ .../dictionary/build/ResizableBuffer.java | 83 ++++++++ .../sudachi/dictionary/build/StringIndex.java | 20 +- .../dictionary/build/StringStorage.java | 27 ++- .../sudachi/dictionary/build/Unescape.java | 63 ++++++ .../dictionary/build/UnicodeBuffer.java | 16 ++ .../build/UnicodeBufferResizeable.java | 44 ++--- .../dictionary/build/WordIdResolver.java | 2 + .../build/WordInfoLayoutFixedWidth.java | 70 ++++--- .../sudachi/dictionary/build/WordLayout.java | 105 ++++++---- .../sudachi/dictionary/build/WordLookup.java | 6 +- .../nlp/sudachi/dictionary/build/WordRef.java | 47 ++++- .../com/worksap/nlp/sudachi/TestDictionary.kt | 1 - .../nlp/sudachi/dictionary/StringPtrTest.kt | 165 ++++++++-------- .../nlp/sudachi/dictionary/build/AlignTest.kt | 52 +++++ .../dictionary/build/CsvLexiconTest.kt | 18 +- .../dictionary/build/RawLexiconReaderTest.kt | 76 ++++++++ .../dictionary/build/StringStorageTest.kt | 66 ++++--- .../sudachi/dictionary/build/SystemDicTest.kt | 1 + .../sudachi/dictionary/build/UserDicTest.kt | 9 +- .../dictionary/build/WordLayoutTest.kt | 181 ++++++++++-------- .../java/com/worksap/nlp/sudachi/resources.kt | 28 +++ .../sudachi/dictionary/build/headers-all.csv | 2 + .../sudachi/dictionary/build/legacy-full.csv | 1 + .../dictionary/build/legacy-minimum.csv | 1 + 41 files changed, 1353 insertions(+), 434 deletions(-) create mode 100644 
src/main/java/com/worksap/nlp/sudachi/dictionary/build/CsvFieldException.java create mode 100644 src/main/java/com/worksap/nlp/sudachi/dictionary/build/DicBuilder2.java create mode 100644 src/main/java/com/worksap/nlp/sudachi/dictionary/build/RawLexicon.java create mode 100644 src/main/java/com/worksap/nlp/sudachi/dictionary/build/RawLexiconReader.java create mode 100644 src/main/java/com/worksap/nlp/sudachi/dictionary/build/RawWordEntry.java create mode 100644 src/main/java/com/worksap/nlp/sudachi/dictionary/build/ResizableBuffer.java create mode 100644 src/main/java/com/worksap/nlp/sudachi/dictionary/build/Unescape.java create mode 100644 src/test/java/com/worksap/nlp/sudachi/dictionary/build/AlignTest.kt create mode 100644 src/test/java/com/worksap/nlp/sudachi/dictionary/build/RawLexiconReaderTest.kt create mode 100644 src/test/java/com/worksap/nlp/sudachi/resources.kt create mode 100644 src/test/resources/com/worksap/nlp/sudachi/dictionary/build/headers-all.csv create mode 100644 src/test/resources/com/worksap/nlp/sudachi/dictionary/build/legacy-full.csv create mode 100644 src/test/resources/com/worksap/nlp/sudachi/dictionary/build/legacy-minimum.csv diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/CSVParser.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/CSVParser.java index 440d694e..bdc75acb 100644 --- a/src/main/java/com/worksap/nlp/sudachi/dictionary/CSVParser.java +++ b/src/main/java/com/worksap/nlp/sudachi/dictionary/CSVParser.java @@ -57,6 +57,7 @@ enum Type { private BufferedReader reader; private Deque tokenBuffer = new ArrayDeque<>(); private boolean hasNextField = false; + private int row = -1; public CSVParser(Reader reader) { this.reader = new BufferedReader(reader); @@ -73,6 +74,7 @@ public List getNextRecord() throws IOException { String field; while ((field = getField()) != null) { if (field.equals("\n")) { + row += 1; return record; } record.add(field); @@ -80,6 +82,10 @@ public List getNextRecord() throws IOException { 
return null; } + public int getRow() { + return row; + } + private String getField() throws IOException { if (hasNextField) { return getNextField(); diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/CategoryType.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/CategoryType.java index 9cc75eb1..85b64999 100644 --- a/src/main/java/com/worksap/nlp/sudachi/dictionary/CategoryType.java +++ b/src/main/java/com/worksap/nlp/sudachi/dictionary/CategoryType.java @@ -17,18 +17,16 @@ package com.worksap.nlp.sudachi.dictionary; /** - * Categories of characters. - * - * These categories are used in the + * Categories of characters. These categories are used in the * {@link com.worksap.nlp.sudachi.OovProviderPlugin} and * {@link com.worksap.nlp.sudachi.PathRewritePlugin}. * *

- * You can defined the range of each category in the file which specified + * You can define the range of each category in the file which specified * "characterDefinitionFile" of the settings. */ public enum CategoryType { - /** The fall back category. */ + /** The fallback category. */ DEFAULT(1), /** White spaces. */ SPACE(1 << 1), diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/Ints.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/Ints.java index ff73f3f4..05df687a 100644 --- a/src/main/java/com/worksap/nlp/sudachi/dictionary/Ints.java +++ b/src/main/java/com/worksap/nlp/sudachi/dictionary/Ints.java @@ -1,3 +1,19 @@ +/* + * Copyright (c) 2022 Works Applications Co., Ltd. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + package com.worksap.nlp.sudachi.dictionary; import java.nio.ByteBuffer; @@ -58,7 +74,12 @@ public static Ints wrap(int[] array) { return new Ints(array, array.length); } + private static final int[] EMPTY_ARRAY = new int[0]; + public static int[] readArray(ByteBuffer buffer, int len) { + if (len == 0) { + return EMPTY_ARRAY; + } int[] result = new int[len]; for (int i = 0; i < len; ++i) { result[i] = buffer.getInt(); diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/StringPtr.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/StringPtr.java index b7adb94a..7c30053e 100644 --- a/src/main/java/com/worksap/nlp/sudachi/dictionary/StringPtr.java +++ b/src/main/java/com/worksap/nlp/sudachi/dictionary/StringPtr.java @@ -1,8 +1,30 @@ +/* + * Copyright (c) 2022 Works Applications Co., Ltd. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + package com.worksap.nlp.sudachi.dictionary; import java.util.Objects; import java.util.StringJoiner; +/** + * Pointer to a string in the string storage. Consists of offset and length + * compressed in a single int value.
+ * Length can be stored from 5 (max value 19) bits to 19 (max value 4095 + 19). + * Remaining bits are offset which are aligned with a difficult. + */ public class StringPtr { public static final int MAX_LENGTH_BITS = 12; public static final int BASE_OFFSET = 32 - 5; @@ -17,6 +39,16 @@ private StringPtr(int length, int offset) { this.offset = offset; } + /** + * Create a new {@link StringPtr} without any runtime checks. Use + * {@link #isValid(int, int)} to check validity. + * + * @param length + * length of string + * @param offset + * offset of string + * @return StringPtr object, possibly invalid + */ public static StringPtr unsafe(int length, int offset) { return new StringPtr(length, offset); } @@ -24,12 +56,11 @@ public static StringPtr unsafe(int length, int offset) { public static StringPtr checked(int length, int offset) { if (length > MAX_LENGTH) { throw new IllegalArgumentException( - String.format("Maximum possible length is %d, was requested %d", MAX_LENGTH, length) - ); + String.format("Maximum possible length is %d, was requested %d", MAX_LENGTH, length)); } if (!isValid(offset, length)) { - throw new IllegalArgumentException( - String.format("StringPtr is invalid offset=%08x length=%d alignment=%d", offset, length, requiredAlignment(length))); + throw new IllegalArgumentException(String.format("StringPtr is invalid offset=%08x length=%d alignment=%d", + offset, length, requiredAlignment(length))); } return unsafe(length, offset); } @@ -97,8 +128,10 @@ public int encode() { @Override public boolean equals(Object o) { - if (this == o) return true; - if (o == null || getClass() != o.getClass()) return false; + if (this == o) + return true; + if (o == null || getClass() != o.getClass()) + return false; StringPtr stringPtr = (StringPtr) o; return length == stringPtr.length && offset == stringPtr.offset; } @@ -110,10 +143,8 @@ public int hashCode() { @Override public String toString() { - return new StringJoiner(", ", StringPtr.class.getSimpleName() + 
"[", "]") - .add("length=" + length) - .add("offset=" + offset) - .toString(); + return new StringJoiner(", ", StringPtr.class.getSimpleName() + "[", "]").add("length=" + length) + .add("offset=" + offset).add(String.format("encoded=%08x", encode())).toString(); } public boolean isSubseqValid(int start, int end) { diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/WordInfo.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/WordInfo.java index f69a30d0..f945d64d 100644 --- a/src/main/java/com/worksap/nlp/sudachi/dictionary/WordInfo.java +++ b/src/main/java/com/worksap/nlp/sudachi/dictionary/WordInfo.java @@ -205,7 +205,8 @@ public int[] getWordStructure() { * Returns the array of the synonym groups. * * @return the synonym group IDs of the morpheme - * @deprecated use {@link #getSynonymGroupIds()}, this method has a typo in its name + * @deprecated use {@link #getSynonymGroupIds()}, this method has a typo in its + * name */ @Deprecated public int[] getSynonymGoupIds() { diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/Align.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/Align.java index b20d5fe1..2368ac35 100644 --- a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/Align.java +++ b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/Align.java @@ -1,8 +1,25 @@ +/* + * Copyright (c) 2022 Works Applications Co., Ltd. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
/**
 * Static helpers for rounding integer offsets up to power-of-two boundaries.
 */
public class Align {

    private Align() {
        // static helpers only, no instances
    }

    /**
     * Checks whether the value is a positive power of two.
     *
     * Zero and negative values (including {@link Integer#MIN_VALUE}, whose bit
     * pattern has a single bit set) are rejected, so the check can be used to
     * validate an alignment before building a bit mask from it.
     *
     * @param value
     *            value to check
     * @return true if value is one of 1, 2, 4, 8, ...
     */
    public static boolean isPowerOf2(int value) {
        return value > 0 && (value & (value - 1)) == 0;
    }

    /**
     * Aligns value to the alignment, rounding up.
     *
     * @param value
     *            value to be aligned
     * @param alignment
     *            required alignment as a power of two
     * @return the smallest multiple of alignment which is greater than or equal
     *         to the passed value
     */
    public static int align(int value, int alignment) {
        assert isPowerOf2(alignment);
        // For a power of two, (alignment - 1) has exactly the low bits set
        // which must be zero in an aligned value: 7 for 8, 15 for 16.
        int bits = alignment - 1;
        // Adding the low bits carries into the next multiple unless the value
        // is already aligned; clearing them afterwards rounds down to it.
        return (value + bits) & ~bits;
    }
}
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.worksap.nlp.sudachi.dictionary.build; + +public class CsvFieldException extends IllegalArgumentException { + public CsvFieldException(String s) { + super(s); + } +} diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/CsvLexicon.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/CsvLexicon.java index a2d82f3e..ed9834ea 100644 --- a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/CsvLexicon.java +++ b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/CsvLexicon.java @@ -28,8 +28,6 @@ import java.util.ArrayList; import java.util.Arrays; import java.util.List; -import java.util.Objects; -import java.util.regex.Matcher; import java.util.regex.Pattern; public class CsvLexicon implements WriteDictionary { @@ -39,7 +37,7 @@ public class CsvLexicon implements WriteDictionary { private static final Pattern PATTERN_ID = Pattern.compile("U?\\d+"); private final Parameters parameters = new Parameters(); private final POSTable posTable; - private final List entries = new ArrayList<>(); + private final List entries = new ArrayList<>(); private WordIdResolver widResolver = new WordLookup.Noop(); public CsvLexicon(POSTable pos) { @@ -50,46 +48,16 @@ public void setResolver(WordIdResolver widResolver) { this.widResolver = widResolver; } - /** - * Resolve unicode escape sequences in the string - *

- * Sequences are defined to be \\u0000-\\uFFFF: exactly four hexadecimal - * characters preceded by \\u \\u{...}: a correct unicode character inside - * brackets - * - * @param text - * to to resolve sequences - * @return string with unicode escapes resolved - */ - public static String unescape(String text) { - Matcher m = unicodeLiteral.matcher(text); - if (!m.find()) { - return text; - } - - StringBuffer sb = new StringBuffer(); - m.reset(); - while (m.find()) { - String u = m.group(1); - if (u.startsWith("{")) { - u = u.substring(1, u.length() - 1); - } - m.appendReplacement(sb, new String(Character.toChars(Integer.parseInt(u, 16)))); - } - m.appendTail(sb); - return sb.toString(); - } - - public List getEntries() { + public List getEntries() { return entries; } - WordEntry parseLine(List cols) { + RawWordEntry parseLine(List cols) { if (cols.size() < MIN_REQUIRED_NUMBER_OF_COLUMNS) { throw new IllegalArgumentException("invalid format"); } for (int i = 0; i < 15; i++) { - cols.set(i, unescape(cols.get(i))); + cols.set(i, Unescape.unescape(cols.get(i))); } if (cols.get(0).getBytes(StandardCharsets.UTF_8).length > DicBuffer.MAX_STRING @@ -102,7 +70,7 @@ WordEntry parseLine(List cols) { throw new IllegalArgumentException("headword is empty"); } - WordEntry entry = new WordEntry(); + RawWordEntry entry = new RawWordEntry(); // headword for trie if (!cols.get(1).equals("-1")) { @@ -167,10 +135,10 @@ int wordToId(String text) { if (cols.length < 8) { throw new IllegalArgumentException("too few columns"); } - String headword = unescape(cols[0]); + String headword = Unescape.unescape(cols[0]); POS pos = new POS(Arrays.copyOfRange(cols, 1, 7)); short posId = posTable.getId(pos); - String reading = unescape(cols[7]); + String reading = Unescape.unescape(cols[7]); return widResolver.lookup(headword, posId, reading); } @@ -243,7 +211,7 @@ public void writeTo(ModelOutput output) throws IOException { int offset = (int) output.position(); int numEntries = entries.size(); for 
(int i = 0; i < numEntries; ++i) { - WordEntry entry = entries.get(i); + RawWordEntry entry = entries.get(i); if (buffer.wontFit(16 * 1024)) { offset += buffer.consume(output::write); } @@ -259,7 +227,7 @@ public void writeTo(ModelOutput output) throws IOException { buffer.putInts(parseSplitInfo(entry.aUnitSplitString)); buffer.putInts(parseSplitInfo(entry.bUnitSplitString)); buffer.putInts(parseSplitInfo(entry.wordStructureString)); - buffer.putInts(wi.getSynonymGoupIds()); + buffer.putInts(wi.getSynonymGroupIds()); output.progress(i, numEntries); } @@ -272,7 +240,7 @@ public void writeTo(ModelOutput output) throws IOException { output.position(pos); } - public int addEntry(WordEntry e) { + public int addEntry(RawWordEntry e) { int id = entries.size(); entries.add(e); return id; @@ -282,62 +250,4 @@ public void setLimits(int left, int right) { parameters.setLimits(left, right); } - public static class WordEntry implements Lookup2.Entry { - int pointer; - String headword; - WordInfo wordInfo; - String aUnitSplitString; - String bUnitSplitString; - String cUnitSplitString; - String userData; - String wordStructureString; - short leftId; - short rightId; - short cost; - short surfaceUtf8Length; - int expectedSize = 0; - - private int countSplits(String data) { - return StringUtil.count(data, '/'); - } - - public int computeExpectedSize() { - if (expectedSize != 0) { - return expectedSize; - } - - int size = 32; - - size += countSplits(aUnitSplitString) * 4; - size += countSplits(bUnitSplitString) * 4; - size += countSplits(cUnitSplitString) * 4; - size += countSplits(wordStructureString) * 4; - size += wordInfo.getSynonymGoupIds().length * 4; - if (userData.length() != 0) { - size += 2 + userData.length() * 2; - } - - size = Align.align(size, 8); - - expectedSize = size; - return size; - } - - @Override - public int pointer() { - return pointer; - } - - @Override - public boolean matches(short posId, String reading) { - return wordInfo.getPOSId() == posId && 
Objects.equals(wordInfo.getReadingForm(), reading); - } - - @Override - public String headword() { - return wordInfo.getSurface(); - } - } - - } diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/DicBuilder.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/DicBuilder.java index 187398ef..ec8c1342 100644 --- a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/DicBuilder.java +++ b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/DicBuilder.java @@ -115,7 +115,7 @@ public T lexiconImpl(String name, InputStream data, long size) throws IOExceptio if (fields == null) break; try { - CsvLexicon.WordEntry e = lexicon.parseLine(fields); + RawWordEntry e = lexicon.parseLine(fields); int wordId = lexicon.addEntry(e); if (e.headword != null) { index.add(e.headword, wordId); diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/DicBuilder2.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/DicBuilder2.java new file mode 100644 index 00000000..973f2e00 --- /dev/null +++ b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/DicBuilder2.java @@ -0,0 +1,46 @@ +/* + * Copyright (c) 2022 Works Applications Co., Ltd. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package com.worksap.nlp.sudachi.dictionary.build; + +import java.io.IOException; +import java.io.InputStream; +import java.util.function.Function; +import java.util.function.Supplier; + +public class DicBuilder2 { + private DicBuilder2() { + // no instances + } + + public static class Base> { + protected final POSTable pos = new POSTable(); + protected final ConnectionMatrix connection = new ConnectionMatrix(); + protected final Index index = new Index(); + protected Progress progress = Progress.NOOP; + + @SuppressWarnings("unchecked") + private T self() { + return (T) this; + } + + public T lexicon(String name, Supplier input, long size) throws IOException { + + return self(); + } + } + +} diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/IOConsumer.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/IOConsumer.java index 46edc5d9..6d9e062b 100644 --- a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/IOConsumer.java +++ b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/IOConsumer.java @@ -1,10 +1,27 @@ +/* + * Copyright (c) 2022 Works Applications Co., Ltd. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + package com.worksap.nlp.sudachi.dictionary.build; import java.io.IOException; import java.nio.ByteBuffer; /** - * A version of {@link java.util.function.Consumer} which allows throwing IOException + * A version of {@link java.util.function.Consumer} which allows throwing + * IOException */ @FunctionalInterface public interface IOConsumer { diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/InMemoryChannel.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/InMemoryChannel.java index a936b0eb..2cef9ba3 100644 --- a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/InMemoryChannel.java +++ b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/InMemoryChannel.java @@ -1,3 +1,19 @@ +/* + * Copyright (c) 2022 Works Applications Co., Ltd. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + package com.worksap.nlp.sudachi.dictionary.build; import java.io.IOException; @@ -51,7 +67,7 @@ public long position() throws IOException { @Override public SeekableByteChannel position(long newPosition) throws IOException { assert newPosition < Integer.MAX_VALUE; - buffer.position((int)newPosition); + buffer.position((int) newPosition); return this; } @@ -63,7 +79,7 @@ public long size() throws IOException { @Override public SeekableByteChannel truncate(long size) { assert size < Integer.MAX_VALUE; - buffer.limit((int)size); + buffer.limit((int) size); return this; } diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/Lookup2.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/Lookup2.java index 5d3e6a51..56130b68 100644 --- a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/Lookup2.java +++ b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/Lookup2.java @@ -1,3 +1,19 @@ +/* + * Copyright (c) 2022 Works Applications Co., Ltd. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + package com.worksap.nlp.sudachi.dictionary.build; import java.util.ArrayList; @@ -8,7 +24,9 @@ public class Lookup2 { public interface Entry { int pointer(); + boolean matches(short posId, String reading); + String headword(); } diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/Progress.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/Progress.java index 98765e6f..abeb91af 100644 --- a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/Progress.java +++ b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/Progress.java @@ -25,6 +25,9 @@ public class Progress { private float currentProgress; private long lastUpdate; + public static final Progress NOOP = new Progress(1, progress -> { + }); + public Progress(int maxUpdates, Callback callback) { this.maxUpdates = maxUpdates; this.callback = callback; diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/RawLexicon.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/RawLexicon.java new file mode 100644 index 00000000..2d590d09 --- /dev/null +++ b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/RawLexicon.java @@ -0,0 +1,69 @@ +/* + * Copyright (c) 2022 Works Applications Co., Ltd. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package com.worksap.nlp.sudachi.dictionary.build; + +import com.worksap.nlp.sudachi.dictionary.CSVParser; + +import java.io.IOException; +import java.io.InputStream; +import java.io.InputStreamReader; +import java.io.Reader; +import java.nio.charset.StandardCharsets; +import java.util.ArrayList; +import java.util.List; + +public class RawLexicon { + private static final long MAX_OFFSET = Integer.MAX_VALUE * 8L; + private final StringStorage strings = new StringStorage(); + private final List entries = new ArrayList<>(); + + private long offset = 0; + + public void read(InputStream data, POSTable posTable) throws IOException { + read(new InputStreamReader(data, StandardCharsets.UTF_8), posTable); + } + + public void read(Reader data, POSTable posTable) throws IOException { + CSVParser parser = new CSVParser(data); + RawLexiconReader reader = new RawLexiconReader(parser, posTable); + + long offset = this.offset; + RawWordEntry entry; + while ((entry = reader.nextEntry()) != null) { + strings.add(entry.headword); + strings.add(entry.reading); + entries.add(entry); + entry.pointer = pointer(offset); + offset += entry.computeExpectedSize(); + checkOffset(offset); + } + this.offset = offset; + } + + public static int pointer(long offset) { + return (int) (offset >>> 3); + } + + public void checkOffset(long offset) { + if ((offset & 0x7) != 0) { + throw new IllegalArgumentException("offset is not aligned, should not happen"); + } + if (offset > MAX_OFFSET) { + throw new IllegalArgumentException("passed dictionary is too large, Sudachi can't handle it"); + } + } +} diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/RawLexiconReader.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/RawLexiconReader.java new file mode 100644 index 00000000..7fa238e9 --- /dev/null +++ b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/RawLexiconReader.java @@ -0,0 +1,167 @@ +/* + * Copyright (c) 2022 Works Applications Co., Ltd. 
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.worksap.nlp.sudachi.dictionary.build; + +import com.worksap.nlp.sudachi.dictionary.CSVParser; +import com.worksap.nlp.sudachi.dictionary.POS; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; +import java.util.StringJoiner; +import java.util.regex.Pattern; + +public class RawLexiconReader { + + /** + * Enum order is in-csv order. If a header is present, fields will be reordered + */ + public enum Column { + Surface(true), LeftId(true), RightId(true), Cost(true), Writing(false), Pos1(true), Pos2(true), Pos3( + true), Pos4(true), Pos5(true), Pos6(true), ReadingForm(true), NormalizedForm(true), DictionaryForm( + true), Mode(true), SplitA(true), SplitB( + true), WordStructure(true), SynonymGroups(false), SplitC(false), UserData(false); + + private final boolean required; + + Column(boolean required) { + this.required = required; + } + } + + private List cachedRecord; + private int[] mapping; + private final CSVParser parser; + private final POSTable posTable; + + public RawLexiconReader(CSVParser parser, POSTable pos) throws IOException { + this.parser = parser; + this.posTable = pos; + resolveColumnLayout(); + } + + private static final Pattern INTEGER_REGEX = Pattern.compile("^-?\\d+$"); + + private void resolveColumnLayout() throws IOException { + List record = parser.getNextRecord(); + + String leftId = 
record.get(Column.LeftId.ordinal()); + if (INTEGER_REGEX.matcher(leftId).matches()) { + this.cachedRecord = record; + return; + } + + List remaining = new ArrayList<>(Arrays.asList(Column.values())); + int[] mapping = new int[remaining.size()]; + Arrays.fill(mapping, -1); + + outer: for (int fieldId = 0; fieldId < record.size(); ++fieldId) { + String field = record.get(fieldId).replaceAll("_", ""); + for (int colId = 0; colId < record.size(); ++colId) { + Column col = remaining.get(colId); + if (col.name().equalsIgnoreCase(field)) { + mapping[col.ordinal()] = fieldId; + remaining.remove(colId); + continue outer; + } + } + throw new IllegalArgumentException(String.format("column [%s] is not recognized", field)); + } + + for (Column column : remaining) { + if (column.required) { + StringJoiner joiner = new StringJoiner(", ", "required columns [", "] were not present in the header"); + remaining.stream().filter(c -> c.required).forEach(c -> joiner.add(c.name())); + throw new IllegalArgumentException(joiner.toString()); + } + } + + this.mapping = mapping; + } + + private String get(List data, Column column, boolean unescape) { + int index = column.ordinal(); + if (mapping != null) { + index = mapping[index]; + } + if (index < 0 || index >= data.size()) { + if (column.required) { + throw new CsvFieldException( + String.format("column [%s] (index=%d) was not present", column.name(), index)); + } else { + return ""; + } + } + String s = data.get(index); + if (unescape) { + return Unescape.unescape(s); + } else { + return s; + } + } + + private short getShort(List data, Column column) { + String value = get(data, column, false); + try { + return Short.parseShort(value); + } catch (NumberFormatException e) { + throw new CsvFieldException( + String.format("failed to parse '%s' as a short value in column: %s", value, column.name())); + } + } + + private RawWordEntry convertEntry(List data) { + RawWordEntry entry = new RawWordEntry(); + entry.headword = get(data, 
Column.Surface, true); + entry.leftId = getShort(data, Column.LeftId); + entry.rightId = getShort(data, Column.RightId); + entry.cost = getShort(data, Column.Cost); + + entry.reading = get(data, Column.ReadingForm, true); + entry.dictionaryFormRef = get(data, Column.DictionaryForm, false); + entry.normalizedFormRef = get(data, Column.NormalizedForm, false); + + POS pos = new POS(get(data, Column.Pos1, true), get(data, Column.Pos2, true), get(data, Column.Pos3, true), + get(data, Column.Pos4, true), get(data, Column.Pos5, true), get(data, Column.Pos6, true)); + + entry.posId = posTable.getId(pos); + + entry.mode = get(data, Column.Mode, false); + entry.aUnitSplitString = get(data, Column.SplitA, false); + entry.bUnitSplitString = get(data, Column.SplitB, false); + entry.cUnitSplitString = get(data, Column.SplitC, false); + entry.wordStructureString = get(data, Column.WordStructure, false); + entry.synonymGroups = get(data, Column.SynonymGroups, false); + entry.userData = get(data, Column.UserData, true); + + return entry; + } + + public RawWordEntry nextEntry() throws IOException { + List record = cachedRecord; + if (record != null) { + cachedRecord = null; + } else { + record = parser.getNextRecord(); + } + if (record == null) { + return null; + } + return convertEntry(record); + } +} diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/RawWordEntry.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/RawWordEntry.java new file mode 100644 index 00000000..e4bf9f29 --- /dev/null +++ b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/RawWordEntry.java @@ -0,0 +1,95 @@ +/* + * Copyright (c) 2022 Works Applications Co., Ltd. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.worksap.nlp.sudachi.dictionary.build; + +import com.worksap.nlp.sudachi.StringUtil; +import com.worksap.nlp.sudachi.dictionary.WordInfo; + +import java.util.Objects; + +@SuppressWarnings("jol") +public class RawWordEntry implements Lookup2.Entry { + int pointer; + String headword; + String reading; + String normalizedFormRef; + String dictionaryFormRef; + WordInfo wordInfo; + String aUnitSplitString; + String bUnitSplitString; + String cUnitSplitString; + String wordStructureString; + String synonymGroups; + String userData; + String mode; + short leftId; + short rightId; + short cost; + short surfaceUtf8Length; + int expectedSize = 0; + short posId; + + private int countSplits(String data) { + return StringUtil.count(data, '/'); + } + + public int computeExpectedSize() { + if (expectedSize != 0) { + return expectedSize; + } + + int size = 32; + + size += countSplits(aUnitSplitString) * 4; + size += countSplits(bUnitSplitString) * 4; + size += countSplits(cUnitSplitString) * 4; + size += countSplits(wordStructureString) * 4; + size += wordInfo.getSynonymGroupIds().length * 4; + if (userData.length() != 0) { + size += 2 + userData.length() * 2; + } + + size = Align.align(size, 8); + + expectedSize = size; + return size; + } + + /** + * Entries with negative leftId are not indexed + * + * @return true if the word should be present in the trie index + */ + public boolean shouldBeIndexed() { + return leftId >= 0; + } + + @Override + public int pointer() { + return pointer; + } + + @Override + public boolean matches(short posId, 
String reading) { + return wordInfo.getPOSId() == posId && Objects.equals(wordInfo.getReadingForm(), reading); + } + + @Override + public String headword() { + return wordInfo.getSurface(); + } +} diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/ResizableBuffer.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/ResizableBuffer.java new file mode 100644 index 00000000..38285dba --- /dev/null +++ b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/ResizableBuffer.java @@ -0,0 +1,83 @@ +/* + * Copyright (c) 2022 Works Applications Co., Ltd. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package com.worksap.nlp.sudachi.dictionary.build; + +import java.io.IOException; +import java.nio.ByteBuffer; +import java.nio.ByteOrder; +import java.nio.channels.WritableByteChannel; + +public class ResizableBuffer { + private ByteBuffer buffer; + + public ResizableBuffer(int capacity) { + ByteBuffer buf = ByteBuffer.wrap(new byte[capacity]); + buf.order(ByteOrder.LITTLE_ENDIAN); + buffer = buf; + } + + public ByteBuffer prepare(int additional) { + ByteBuffer buf = buffer; + if (buf.remaining() >= additional) { + return buf; + } else { + return grow(additional); + } + } + + public ByteBuffer prepare(int offset, int size) { + ByteBuffer buf = buffer; + int capacity = buf.capacity(); + if (capacity < offset + size) { + buf = grow(offset + size - capacity); + } + ByteBuffer duplicate = buf.duplicate(); + duplicate.order(ByteOrder.LITTLE_ENDIAN); + duplicate.position(offset); + buf.position(offset + size); + return duplicate; + } + + private ByteBuffer grow(int additional) { + ByteBuffer current = buffer; + int newSize = Math.max(current.capacity() * 2, current.capacity() + additional); + ByteBuffer fresh = ByteBuffer.wrap(new byte[newSize]); + fresh.order(ByteOrder.LITTLE_ENDIAN); + current.flip(); + fresh.put(current); + buffer = fresh; + return fresh; + } + + public ByteBuffer getBuffer() { + return buffer; + } + + public void write(WritableByteChannel channel, int start, int end) throws IOException { + ByteBuffer buf = buffer; + int pos = buf.position(); + int limit = buf.limit(); + try { + buf.position(start); + buf.limit(end); + channel.write(buf); + } finally { + buf.position(pos); + buf.limit(limit); + } + } +} diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/StringIndex.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/StringIndex.java index 53c5dff6..ca5f0642 100644 --- a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/StringIndex.java +++ b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/StringIndex.java 
@@ -1,3 +1,19 @@ +/* + * Copyright (c) 2022 Works Applications Co., Ltd. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + package com.worksap.nlp.sudachi.dictionary.build; import com.worksap.nlp.sudachi.dictionary.StringPtr; @@ -6,7 +22,9 @@ public interface StringIndex { /** * Produces a StringPtr for a String. - * @param data given String + * + * @param data + * given String * @return StringPtr */ StringPtr resolve(String data); diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/StringStorage.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/StringStorage.java index 811b510f..2f0171de 100644 --- a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/StringStorage.java +++ b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/StringStorage.java @@ -1,3 +1,19 @@ +/* + * Copyright (c) 2022 Works Applications Co., Ltd. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + package com.worksap.nlp.sudachi.dictionary.build; import com.worksap.nlp.sudachi.dictionary.CSVParser; @@ -27,7 +43,7 @@ void compile() { candidates.put("", new Item("", 0, 0)); List collect = new ArrayList<>(strings.keySet()); collect.sort(Comparator.comparingInt(String::length).reversed().thenComparing(String::compareTo)); - for (String str: collect) { + for (String str : collect) { strings.put(str, process(str)); } candidates.clear(); @@ -100,7 +116,7 @@ public void writeCompact(WritableByteChannel channel) throws IOException { public void writeLengthPrefixedCompact(SeekableByteChannel channel) throws IOException { DicBuffer buf = new DicBuffer(64 * 1024); - for (Map.Entry item: strings.entrySet()) { + for (Map.Entry item : strings.entrySet()) { Item value = item.getValue(); String sub = value.data.substring(value.start, value.end); if (buf.wontFit(sub.length() * 2)) { @@ -141,7 +157,6 @@ public int getLength() { } } - public static void main(String[] args) throws IOException { StringStorage strings = new StringStorage(); try (BufferedReader reader = Files.newBufferedReader(Paths.get(args[0]))) { @@ -157,12 +172,14 @@ public static void main(String[] args) throws IOException { strings.compile(); Path fullName = Paths.get(args[1] + ".lpf"); - try (SeekableByteChannel chan = Files.newByteChannel(fullName, StandardOpenOption.CREATE, StandardOpenOption.WRITE, StandardOpenOption.TRUNCATE_EXISTING)) { + try (SeekableByteChannel chan = Files.newByteChannel(fullName, StandardOpenOption.CREATE, + StandardOpenOption.WRITE, StandardOpenOption.TRUNCATE_EXISTING)) { strings.writeLengthPrefixedCompact(chan); } Path compactName = Paths.get(args[1] + ".cmp"); - try (SeekableByteChannel chan = Files.newByteChannel(compactName, StandardOpenOption.CREATE, StandardOpenOption.WRITE, StandardOpenOption.TRUNCATE_EXISTING)) { + try (SeekableByteChannel chan = Files.newByteChannel(compactName, StandardOpenOption.CREATE, + StandardOpenOption.WRITE, 
StandardOpenOption.TRUNCATE_EXISTING)) { strings.writeCompact(chan); } System.out.printf("wasted bytes=%d, slots=%d%n", strings.layout.wastedBytes(), strings.layout.numSlots()); diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/Unescape.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/Unescape.java new file mode 100644 index 00000000..a120ad7e --- /dev/null +++ b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/Unescape.java @@ -0,0 +1,63 @@ +/* + * Copyright (c) 2022 Works Applications Co., Ltd. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.worksap.nlp.sudachi.dictionary.build; + +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +public class Unescape { + private static final Pattern unicodeLiteral = Pattern.compile("\\\\u(?:[0-9a-fA-F]{4}|\\{[0-9a-fA-F]+})"); + + /** + * Resolve unicode escape sequences in the string + *

+ * Sequences are defined to be: + *

    + *
  • \\u0000-\\uFFFF: exactly four hexadecimal characters preceded by \\u
  • + *
  • \\u{...}: a correct unicode character inside brackets
  • + *
+ * + * @param text + * to to resolve sequences + * @return string with unicode escapes resolved + */ + public static String unescape(String text) { + Matcher m = unicodeLiteral.matcher(text); + if (!m.find()) { + return text; + } + + StringBuilder sb = new StringBuilder(text.length()); + int start = 0; + do { + int pos = m.start(); + int textStart = pos + 2; + int textEnd = m.end(); + if (text.charAt(textStart) == '{') { + textStart += 1; + textEnd -= 1; + } + sb.append(text, start, m.start()); + // in future use zero-copying API when using JDK 9+ + String hexCodepoint = text.substring(textStart, textEnd); + sb.appendCodePoint(Integer.parseInt(hexCodepoint, 16)); + start = m.end(); + } while (m.find()); + sb.append(text, start, text.length()); + return sb.toString(); + } +} diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/UnicodeBuffer.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/UnicodeBuffer.java index 088c2132..8721db42 100644 --- a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/UnicodeBuffer.java +++ b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/UnicodeBuffer.java @@ -1,3 +1,19 @@ +/* + * Copyright (c) 2022 Works Applications Co., Ltd. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + package com.worksap.nlp.sudachi.dictionary.build; import java.io.IOException; diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/UnicodeBufferResizeable.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/UnicodeBufferResizeable.java index 292fc8bb..6a44310c 100644 --- a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/UnicodeBufferResizeable.java +++ b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/UnicodeBufferResizeable.java @@ -1,3 +1,19 @@ +/* + * Copyright (c) 2022 Works Applications Co., Ltd. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + package com.worksap.nlp.sudachi.dictionary.build; import java.io.IOException; @@ -8,11 +24,10 @@ import java.nio.channels.WritableByteChannel; public class UnicodeBufferResizeable { - private ByteBuffer buffer; + private ResizableBuffer buffer; public UnicodeBufferResizeable(int size) { - this.buffer = ByteBuffer.allocate(size); - buffer.order(ByteOrder.LITTLE_ENDIAN); + this.buffer = new ResizableBuffer(size); } public UnicodeBufferResizeable() { @@ -25,26 +40,11 @@ public void put(int offset, String data, int start, int end) { } private CharBuffer prepare(int offset, int numChars) { - buffer.position(offset * 2); - int remaining = buffer.remaining(); - int byteLength = numChars * 2; - while (remaining < byteLength) { - ByteBuffer newBuffer = ByteBuffer.allocate(buffer.capacity() * 2); - newBuffer.order(ByteOrder.LITTLE_ENDIAN); - buffer.flip(); - newBuffer.put(buffer); - buffer = newBuffer; - remaining = newBuffer.remaining(); - } - CharBuffer chars = buffer.asCharBuffer(); - buffer.position(buffer.position() + byteLength); - return chars; + ByteBuffer buf = buffer.prepare(offset * 2, numChars * 2); + return buf.asCharBuffer(); } - public void write(WritableByteChannel channel, int limit) throws IOException { - buffer.position(0); - buffer.limit(limit); - channel.write(buffer); - buffer.clear(); + public void write(WritableByteChannel channel, int start, int end) throws IOException { + buffer.write(channel, start, end); } } diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/WordIdResolver.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/WordIdResolver.java index 54a836cf..804f8e89 100644 --- a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/WordIdResolver.java +++ b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/WordIdResolver.java @@ -20,7 +20,9 @@ public interface WordIdResolver { int lookup(String headword, short posId, String reading); + void validate(int wordId); + boolean isUser(); default byte 
parseList(String data, Ints result) { diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/WordInfoLayoutFixedWidth.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/WordInfoLayoutFixedWidth.java index 20d3a609..4fc20db6 100644 --- a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/WordInfoLayoutFixedWidth.java +++ b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/WordInfoLayoutFixedWidth.java @@ -1,3 +1,19 @@ +/* + * Copyright (c) 2022 Works Applications Co., Ltd. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + package com.worksap.nlp.sudachi.dictionary.build; import com.worksap.nlp.sudachi.dictionary.Ints; @@ -18,7 +34,6 @@ public class WordInfoLayoutFixedWidth { private final Ints cSplits = new Ints(16); private final Ints wordStructure = new Ints(16); - public WordInfoLayoutFixedWidth(Lookup2 resolver, StringIndex index, WordRef.Parser parser) { this.lookup = resolver; this.index = index; @@ -26,24 +41,25 @@ public WordInfoLayoutFixedWidth(Lookup2 resolver, StringIndex index, WordRef.Par wordRefParser = parser; } - public void process(CsvLexicon.WordEntry entry) { + public void process(RawWordEntry entry) { } - public int put(CsvLexicon.WordEntry entry) { + public int put(RawWordEntry entry) { int position = this.position + buffer.position(); int entryPtr = position >>> 3; - buffer.putShort(entry.leftId); - buffer.putShort(entry.rightId); - buffer.putShort(entry.cost); - buffer.putShort(entry.wordInfo.getPOSId()); + ByteBuffer buf = this.buffer; + buf.putShort(entry.leftId); + buf.putShort(entry.rightId); + buf.putShort(entry.cost); + buf.putShort(entry.posId); // 8 bytes - buffer.putInt(index.resolve(entry.wordInfo.getSurface()).encode()); // surfacePtr - buffer.putInt(index.resolve(entry.wordInfo.getReadingForm()).encode()); // readingPtr - int normFormPtr = wordRefParser.parse(entry.wordInfo.getNormalizedForm()).resolve(lookup); - int dicFormPtr = wordRefParser.parse(entry.wordInfo.getDictionaryForm()).resolve(lookup); - buffer.putInt(normFormPtr); // normalized entry - buffer.putInt(dicFormPtr); // dictionary form + buf.putInt(index.resolve(entry.headword).encode()); // surfacePtr + buf.putInt(index.resolve(entry.reading).encode()); // readingPtr + int normFormPtr = wordRefParser.parse(entry.normalizedFormRef).resolve(lookup); + int dicFormPtr = wordRefParser.parse(entry.dictionaryFormRef).resolve(lookup); + buf.putInt(normFormPtr); // normalized entry + buf.putInt(dicFormPtr); // dictionary form // 8 + 16 = 24 bytes byte aSplitLen = 
parseList(entry.aUnitSplitString, aSplits); @@ -52,14 +68,14 @@ public int put(CsvLexicon.WordEntry entry) { byte wordStructureLen = parseList(entry.wordStructureString, wordStructure); byte synonymLen = (byte) entry.wordInfo.getSynonymGroupIds().length; - buffer.putShort(entry.surfaceUtf8Length); - buffer.put(cSplitLen); - buffer.put(bSplitLen); - buffer.put(aSplitLen); - buffer.put(wordStructureLen); - buffer.put(synonymLen); + buf.putShort(entry.surfaceUtf8Length); + buf.put(cSplitLen); + buf.put(bSplitLen); + buf.put(aSplitLen); + buf.put(wordStructureLen); + buf.put(synonymLen); int userDataLength = entry.userData.length(); - buffer.put(userDataLength != 0 ? (byte)0 : (byte)1); + buf.put(userDataLength != 0 ? (byte) 0 : (byte) 1); // 24 + 8 = 32 bytes putInts(cSplits, cSplitLen); @@ -69,16 +85,16 @@ public int put(CsvLexicon.WordEntry entry) { putInts(Ints.wrap(entry.wordInfo.getSynonymGroupIds()), synonymLen); if (userDataLength != 0) { - buffer.putShort((short) userDataLength); + buf.putShort((short) userDataLength); String userData = entry.userData; for (int i = 0; i < userDataLength; ++i) { - buffer.putShort((short)userData.charAt(i)); + buf.putShort((short) userData.charAt(i)); } } // align to 8 boundary - int currentPosition = buffer.position(); - buffer.position(Align.align(currentPosition, 8)); + int currentPosition = buf.position(); + buf.position(Align.align(currentPosition, 8)); return entryPtr; } @@ -89,16 +105,16 @@ private void putInts(Ints ints, int len) { } } - public void fillPointers(ByteBuffer data, List entries, Lookup2 lookup) { + public void fillPointers(ByteBuffer data, List entries, Lookup2 lookup) { for (int i = 0; i < entries.size(); i++) { - CsvLexicon.WordEntry entry = entries.get(i); + RawWordEntry entry = entries.get(i); int offset = entry.pointer << 3; data.position(offset + 8); data.putInt(index.resolve(entry.wordInfo.getSurface()).encode()); data.putInt(index.resolve(entry.wordInfo.getReadingForm()).encode()); 
data.putInt(entry.wordInfo.getDictionaryFormWordId()); - //data.putInt(entry.) + // data.putInt(entry.) } } diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/WordLayout.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/WordLayout.java index 23a8e3d0..440d2993 100644 --- a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/WordLayout.java +++ b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/WordLayout.java @@ -1,3 +1,19 @@ +/* + * Copyright (c) 2022 Works Applications Co., Ltd. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + package com.worksap.nlp.sudachi.dictionary.build; import com.worksap.nlp.sudachi.dictionary.StringPtr; @@ -10,20 +26,23 @@ /** *

- * Lays out dictionary words so that they will form correct {@link StringPtr} instances. - * That means taking into account the required alignment for strings with larger sizes. - * Aligning strings produces wasted space in form of padding, which is kept track with free list approach. + * Lays out dictionary words so that they will form correct {@link StringPtr} + * instances. That means taking into account the required alignment for strings + * with larger sizes. Aligning strings produces wasted space in form of padding, + * which is kept track with free list approach. * *

- * The main API is {@link #add(String)} method which should be called for all strings. - * The method should be called for strings sorted in descending order by length, otherwise padding between aligned strings - * would not be utilized correctly. - * The returned {@link StringPtr}s will be correct in any case. + * The main API is {@link #add(String)} method which should be called for all + * strings. The method should be called for strings sorted in descending order + * by length, otherwise padding between aligned strings would not be utilized + * correctly. The returned {@link StringPtr}s will be correct in any case. * *

- * The current implementation is relatively fast, but can be made even faster by using sorted multiset collection. - * JVM standard library does not have one, so the current implementation sorts free list while keeping track if the sort - * is needed and guarding against relatively expensive checking free lists with additional conditions. + * The current implementation is relatively fast, but can be made even faster by + * using sorted multiset collection. JVM standard library does not have one, so + * the current implementation sorts free list while keeping track if the sort is + * needed and guarding against relatively expensive checking free lists with + * additional conditions. */ public class WordLayout { private final UnicodeBufferResizeable buffer = new UnicodeBufferResizeable(); @@ -45,16 +64,21 @@ public StringPtr add(String string, int start, int end) { } /** - * Allocates a segment of utf-16 code units in a specified block, taking in account requested alignment. + * Allocates a segment of utf-16 code units in a specified block, taking in + * account requested alignment. * - * Alignment can skip some space in the beginning of the block because of padding. - * That space will be placed into free lists. - * Nothing will be placed in the free lists if the allocation is not possible. + * Alignment can skip some space in the beginning of the block because of + * padding. That space will be placed into free lists. Nothing will be placed in + * the free lists if the allocation is not possible. 
* - * @param length requested length of segment - * @param alignment requested alignment of segment - * @param start start of the block of memory to use - * @param end end of the block of memory to use + * @param length + * requested length of segment + * @param alignment + * requested alignment of segment + * @param start + * start of the block of memory to use + * @param end + * end of the block of memory to use * @return offset of the aligned data or -1 if allocation is impossible */ private int allocateInBlock(int length, int alignment, int start, int end) { @@ -83,15 +107,19 @@ private int allocateInBlock(int length, int alignment, int start, int end) { /** * Allocates a slot of {@code length} bytes, alignment with {@code alignment}. - * It first considers free slots created by previous allocations, if none is valid. + * It first considers free slots created by previous allocations, if none is + * valid. * - * Current implementation is prone to creating "holes" of 1-length, which are almost impossible to fill - * from the usual dictionaries. - * Most emoji take 2 code units and words which are not substrings of another word are usually longer. - * The current implementation wastes ~32k holes in ~42M dictionary, which is ~0.1% of total space. + * Current implementation is prone to creating "holes" of 1-length, which are + * almost impossible to fill from the usual dictionaries. Most emoji take 2 code + * units and words which are not substrings of another word are usually longer. + * The current implementation wastes ~32k holes in ~42M dictionary, which is + * ~0.1% of total space. 
* - * @param length number of byte - * @param alignment requested alignment + * @param length + * number of byte + * @param alignment + * requested alignment * @return offset in utf-16 code units to the location of the requested block */ private int allocate(int length, int alignment) { @@ -119,7 +147,8 @@ private int allocate(int length, int alignment) { fs.start = start + length; fs.length = remaining; freeDirty = true; - // we need to recompute maxLength only if modifying the last (maximum) element in free lists + // we need to recompute maxLength only if modifying the last (maximum) element + // in free lists if (i == numFree - 1) { maxLength = computeNewMaxLength(i); } @@ -133,7 +162,6 @@ private int allocate(int length, int alignment) { maxLength = Math.max(0, maxLength - 1); } - int alignedStart = allocateInBlock(length, alignment, pointer, Integer.MAX_VALUE); assert alignedStart != -1; pointer = alignedStart + length; @@ -142,7 +170,9 @@ private int allocate(int length, int alignment) { /** * Returns available max length for a hole - * @param length hole length + * + * @param length + * hole length * @return length of an element which can be allocated using any alignment */ private static int availableMaxLength(int length) { @@ -156,9 +186,11 @@ private static int availableMaxLength(int length) { } /** - * Compute new maximum length which can be handled by free lists. - * Should be called if the last element of free lists was updated. - * @param index index in free lists of the element which needs to be considered + * Compute new maximum length which can be handled by free lists. Should be + * called if the last element of free lists was updated. 
+ * + * @param index + * index in free lists of the element which needs to be considered * @return new maximum length that can be handled by free lists */ private int computeNewMaxLength(int index) { @@ -182,7 +214,7 @@ private int computeNewMaxLength(int index) { } public void write(WritableByteChannel channel) throws IOException { - buffer.write(channel, pointer * 2); + buffer.write(channel, 0, pointer * 2); } public static class FreeSpace implements Comparable { @@ -197,16 +229,15 @@ public FreeSpace(int start, int length) { @Override public int compareTo(FreeSpace o) { int comparison = Integer.compare(length, o.length); - if (comparison != 0) return comparison; + if (comparison != 0) + return comparison; return Integer.compare(start, o.start); } @Override public String toString() { - return new StringJoiner(", ", FreeSpace.class.getSimpleName() + "[", "]") - .add("start=" + start) - .add("length=" + length) - .toString(); + return new StringJoiner(", ", FreeSpace.class.getSimpleName() + "[", "]").add("start=" + start) + .add("length=" + length).toString(); } } diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/WordLookup.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/WordLookup.java index 75c3871a..749eabba 100644 --- a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/WordLookup.java +++ b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/WordLookup.java @@ -51,9 +51,9 @@ public Csv(CsvLexicon lexicon) { @Override public int lookup(String headword, short posId, String reading) { - List entries = lexicon.getEntries(); + List entries = lexicon.getEntries(); for (int i = 0; i < entries.size(); ++i) { - CsvLexicon.WordEntry entry = entries.get(i); + RawWordEntry entry = entries.get(i); if (entry.wordInfo.getSurface().equals(headword) && entry.wordInfo.getPOSId() == posId && entry.wordInfo.getReadingForm().equals(reading)) { return i; @@ -67,7 +67,7 @@ public void validate(int wordId) { if (wordId < 0) { throw new 
IllegalArgumentException("wordId can't be negative, was " + wordId); } - List entries = lexicon.getEntries(); + List entries = lexicon.getEntries(); if (wordId >= entries.size()) { throw new IllegalArgumentException(String .format("wordId %d was larger than number of dictionary entries (%d)", wordId, entries.size())); diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/WordRef.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/WordRef.java index 6d5304a3..f0aa8466 100644 --- a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/WordRef.java +++ b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/WordRef.java @@ -1,3 +1,19 @@ +/* + * Copyright (c) 2022 Works Applications Co., Ltd. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + package com.worksap.nlp.sudachi.dictionary.build; import com.worksap.nlp.sudachi.StringUtil; @@ -7,8 +23,6 @@ import java.util.List; import java.util.regex.Pattern; -import static com.worksap.nlp.sudachi.dictionary.build.CsvLexicon.unescape; - /** * Reference to a word in the CSV dictionary. 
*/ @@ -31,6 +45,7 @@ public int resolve(Lookup2 resolver) { return resolver.byIndex(line).pointer(); } } + public static final class Headword extends WordRef { private final String headword; @@ -75,7 +90,7 @@ public String getReading() { @Override public int resolve(Lookup2 resolver) { List entries = resolver.byHeadword(headword); - for (Lookup2.Entry entry: entries) { + for (Lookup2.Entry entry : entries) { if (entry.matches(posId, reading)) { return entry.pointer(); } @@ -88,27 +103,43 @@ public int resolve(Lookup2 resolver) { public static class Parser { private final POSTable posTable; + private boolean strict; - public Parser(POSTable posTable) { + public Parser(POSTable posTable, boolean strict) { this.posTable = posTable; + this.strict = strict; } public WordRef parse(String text) { if (NUMERIC_RE.matcher(text).matches()) { - int offset = text.charAt(0) == 'U' ? 1: 0; + if (strict) { + throw new CsvFieldException(String.format( + "invalid word reference: %s, numeric references are not supported in modern csv formats", + text)); + } + int offset = text.charAt(0) == 'U' ? 
1 : 0; int lineNum = Integer.parseInt(text.substring(offset)); return new LineNo(lineNum); } if (StringUtil.count(text, ',') == 7) { String[] cols = text.split(",", 8); - String headword = unescape(cols[0]); - POS pos = new POS(Arrays.copyOfRange(cols, 1, 7)); + String headword = Unescape.unescape(cols[0]); + String[] posElems = Arrays.copyOfRange(cols, 1, 7); + for (int i = 0; i < POS.DEPTH; ++i) { + posElems[i] = Unescape.unescape(posElems[i]); + } + POS pos = new POS(posElems); short posId = posTable.getId(pos); - String reading = unescape(cols[7]); + String reading = Unescape.unescape(cols[7]); return new Triple(headword, posId, reading); } + if (strict) { + throw new CsvFieldException( + String.format("invalid word reference: %s, it must contain POS tag and reading", text)); + } + return new Headword(text); } diff --git a/src/test/java/com/worksap/nlp/sudachi/TestDictionary.kt b/src/test/java/com/worksap/nlp/sudachi/TestDictionary.kt index d17007d1..0eca2d0a 100644 --- a/src/test/java/com/worksap/nlp/sudachi/TestDictionary.kt +++ b/src/test/java/com/worksap/nlp/sudachi/TestDictionary.kt @@ -19,7 +19,6 @@ package com.worksap.nlp.sudachi import com.worksap.nlp.sudachi.dictionary.BinaryDictionary import com.worksap.nlp.sudachi.dictionary.build.DicBuilder import com.worksap.nlp.sudachi.dictionary.build.MemChannel -import com.worksap.nlp.sudachi.dictionary.build.res /** Utility for lazily creating binary dictionaries for test */ object TestDictionary { diff --git a/src/test/java/com/worksap/nlp/sudachi/dictionary/StringPtrTest.kt b/src/test/java/com/worksap/nlp/sudachi/dictionary/StringPtrTest.kt index 6e34f1d4..bda91a39 100644 --- a/src/test/java/com/worksap/nlp/sudachi/dictionary/StringPtrTest.kt +++ b/src/test/java/com/worksap/nlp/sudachi/dictionary/StringPtrTest.kt @@ -1,91 +1,106 @@ +/* + * Copyright (c) 2022 Works Applications Co., Ltd. 
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + package com.worksap.nlp.sudachi.dictionary -import org.junit.Test import kotlin.test.assertEquals import kotlin.test.assertFalse import kotlin.test.assertTrue - +import org.junit.Test class StringPtrTest { - @Test - fun additionalBits() { - assertEquals(0, StringPtr.unsafe(0, 0).additionalBits()) - assertEquals(2, StringPtr.unsafe(22, 0).additionalBits()) - } + @Test + fun additionalBits() { + assertEquals(0, StringPtr.unsafe(0, 0).additionalBits()) + assertEquals(2, StringPtr.unsafe(22, 0).additionalBits()) + } - @Test - fun lengthEncode() { - assertEquals(0, StringPtr.unsafe(0, 0).encode()) - assertEquals(0b00001000_00000000_00000000_00000000, StringPtr.unsafe(1, 0).encode()) - } + @Test + fun lengthEncode() { + assertEquals(0, StringPtr.unsafe(0, 0).encode()) + assertEquals(0b00001000_00000000_00000000_00000000, StringPtr.unsafe(1, 0).encode()) + } - @Test - fun decodeMaxLength() { - val encoded = 0b11111111_11111111_00000000_00000000 - val decoded = StringPtr.decode(encoded.toInt()) - assertEquals(StringPtr.MAX_LENGTH, decoded.length) - } + @Test + fun decodeMaxLength() { + val encoded = 0b11111111_11111111_00000000_00000000 + val decoded = StringPtr.decode(encoded.toInt()) + assertEquals(StringPtr.MAX_LENGTH, decoded.length) + } - @Test - fun encodeMaxLength() { - val decoded = StringPtr.unsafe(StringPtr.MAX_LENGTH, 0) - val encoded = 0b11111111_11111111_00000000_00000000 - 
assertEquals(encoded.toInt(), decoded.encode()) - } + @Test + fun encodeMaxLength() { + val decoded = StringPtr.unsafe(StringPtr.MAX_LENGTH, 0) + val encoded = 0b11111111_11111111_00000000_00000000 + assertEquals(encoded.toInt(), decoded.encode()) + } - private fun checkConversion(length: Int, offset: Int) { - val original = StringPtr.unsafe(length, offset) - val encoded = original.encode() - val decoded = StringPtr.decode(encoded) - assertEquals(original, decoded, "conversion failed, encoded value = %08x".format(encoded)) - } + private fun checkConversion(length: Int, offset: Int) { + val original = StringPtr.unsafe(length, offset) + val encoded = original.encode() + val decoded = StringPtr.decode(encoded) + assertEquals(original, decoded, "conversion failed, encoded value = %08x".format(encoded)) + } - @Test - fun decodeEncodeMaxSimple() { - checkConversion(19, 0x07ff_ffff) - } + @Test + fun decodeEncodeMaxSimple() { + checkConversion(19, 0x07ff_ffff) + } - @Test - fun decodeEncodeSimple() { - checkConversion(5, 10) - checkConversion(1, 10) - checkConversion(19, 10) - } + @Test + fun decodeEncodeSimple() { + checkConversion(5, 10) + checkConversion(1, 10) + checkConversion(19, 10) + } - @Test - fun decodeEncodeAddLength() { - // low offset bits must be aligned for large lengths - checkConversion(19 + 0b00000000_000000001, 0x07ff_ffff xor ((1 shl 0) - 1)) - checkConversion(19 + 0b00000000_000000011, 0x07ff_ffff xor ((1 shl 1) - 1)) - checkConversion(19 + 0b00000000_000000111, 0x07ff_ffff xor ((1 shl 2) - 1)) - checkConversion(19 + 0b00000000_000001111, 0x07ff_ffff xor ((1 shl 3) - 1)) - checkConversion(19 + 0b00000000_000011111, 0x07ff_ffff xor ((1 shl 4) - 1)) - checkConversion(19 + 0b00000000_000111111, 0x07ff_ffff xor ((1 shl 5) - 1)) - checkConversion(19 + 0b00000000_001111111, 0x07ff_ffff xor ((1 shl 6) - 1)) - checkConversion(19 + 0b00000000_011111111, 0x07ff_ffff xor ((1 shl 7) - 1)) - checkConversion(19 + 0b00000000_111111111, 0x07ff_ffff xor ((1 shl 8) - 
1)) - checkConversion(19 + 0b00000001_111111111, 0x07ff_ffff xor ((1 shl 9) - 1)) - checkConversion(19 + 0b00000011_111111111, 0x07ff_ffff xor ((1 shl 10) - 1)) - checkConversion(19 + 0b00000111_111111111, 0x07ff_ffff xor ((1 shl 11) - 1)) - } + @Test + fun decodeEncodeAddLength() { + // low offset bits must be aligned for large lengths + checkConversion(19 + 0b00000000_000000001, 0x07ff_ffff xor ((1 shl 0) - 1)) + checkConversion(19 + 0b00000000_000000011, 0x07ff_ffff xor ((1 shl 1) - 1)) + checkConversion(19 + 0b00000000_000000111, 0x07ff_ffff xor ((1 shl 2) - 1)) + checkConversion(19 + 0b00000000_000001111, 0x07ff_ffff xor ((1 shl 3) - 1)) + checkConversion(19 + 0b00000000_000011111, 0x07ff_ffff xor ((1 shl 4) - 1)) + checkConversion(19 + 0b00000000_000111111, 0x07ff_ffff xor ((1 shl 5) - 1)) + checkConversion(19 + 0b00000000_001111111, 0x07ff_ffff xor ((1 shl 6) - 1)) + checkConversion(19 + 0b00000000_011111111, 0x07ff_ffff xor ((1 shl 7) - 1)) + checkConversion(19 + 0b00000000_111111111, 0x07ff_ffff xor ((1 shl 8) - 1)) + checkConversion(19 + 0b00000001_111111111, 0x07ff_ffff xor ((1 shl 9) - 1)) + checkConversion(19 + 0b00000011_111111111, 0x07ff_ffff xor ((1 shl 10) - 1)) + checkConversion(19 + 0b00000111_111111111, 0x07ff_ffff xor ((1 shl 11) - 1)) + } - @Test - fun isValid() { - assertTrue { StringPtr.isValid(0, 0) } - assertTrue { StringPtr.isValid(1, 0) } - assertTrue { StringPtr.isValid(0, 1) } - assertTrue { StringPtr.isValid(1, 1) } - assertTrue { StringPtr.isValid(0, 19) } - assertTrue { StringPtr.isValid(1, 19) } - assertTrue { StringPtr.isValid(0, 20) } - assertTrue { StringPtr.isValid(1, 20) } - assertTrue { StringPtr.isValid(0, 21) } - assertFalse { StringPtr.isValid(1, 21) } - assertTrue { StringPtr.isValid(2, 21) } - assertTrue { StringPtr.isValid(0, 23) } - assertFalse { StringPtr.isValid(1, 23) } - assertFalse { StringPtr.isValid(2, 23) } - assertTrue { StringPtr.isValid(4, 23) } - } -} \ No newline at end of file + @Test + fun isValid() { + 
assertTrue { StringPtr.isValid(0, 0) } + assertTrue { StringPtr.isValid(1, 0) } + assertTrue { StringPtr.isValid(0, 1) } + assertTrue { StringPtr.isValid(1, 1) } + assertTrue { StringPtr.isValid(0, 19) } + assertTrue { StringPtr.isValid(1, 19) } + assertTrue { StringPtr.isValid(0, 20) } + assertTrue { StringPtr.isValid(1, 20) } + assertTrue { StringPtr.isValid(0, 21) } + assertFalse { StringPtr.isValid(1, 21) } + assertTrue { StringPtr.isValid(2, 21) } + assertTrue { StringPtr.isValid(0, 23) } + assertFalse { StringPtr.isValid(1, 23) } + assertFalse { StringPtr.isValid(2, 23) } + assertTrue { StringPtr.isValid(4, 23) } + } +} diff --git a/src/test/java/com/worksap/nlp/sudachi/dictionary/build/AlignTest.kt b/src/test/java/com/worksap/nlp/sudachi/dictionary/build/AlignTest.kt new file mode 100644 index 00000000..84c0d8d2 --- /dev/null +++ b/src/test/java/com/worksap/nlp/sudachi/dictionary/build/AlignTest.kt @@ -0,0 +1,52 @@ +/* + * Copyright (c) 2022 Works Applications Co., Ltd. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package com.worksap.nlp.sudachi.dictionary.build + +import kotlin.test.Test +import kotlin.test.assertEquals + +class AlignTest { + @Test + fun works8() { + assertEquals(0, Align.align(0, 8)) + assertEquals(8, Align.align(1, 8)) + assertEquals(8, Align.align(2, 8)) + assertEquals(8, Align.align(3, 8)) + assertEquals(8, Align.align(4, 8)) + assertEquals(8, Align.align(5, 8)) + assertEquals(8, Align.align(6, 8)) + assertEquals(8, Align.align(7, 8)) + assertEquals(8, Align.align(8, 8)) + assertEquals(16, Align.align(9, 8)) + } + + @Test + fun works16() { + assertEquals(0, Align.align(0, 16)) + assertEquals(16, Align.align(1, 16)) + assertEquals(16, Align.align(2, 16)) + assertEquals(16, Align.align(3, 16)) + assertEquals(16, Align.align(4, 16)) + assertEquals(16, Align.align(5, 16)) + assertEquals(16, Align.align(6, 16)) + assertEquals(16, Align.align(7, 16)) + assertEquals(16, Align.align(8, 16)) + assertEquals(16, Align.align(9, 16)) + assertEquals(16, Align.align(16, 16)) + assertEquals(32, Align.align(17, 16)) + } +} diff --git a/src/test/java/com/worksap/nlp/sudachi/dictionary/build/CsvLexiconTest.kt b/src/test/java/com/worksap/nlp/sudachi/dictionary/build/CsvLexiconTest.kt index 77a2e29f..bcf24418 100644 --- a/src/test/java/com/worksap/nlp/sudachi/dictionary/build/CsvLexiconTest.kt +++ b/src/test/java/com/worksap/nlp/sudachi/dictionary/build/CsvLexiconTest.kt @@ -102,17 +102,19 @@ class CsvLexiconTest { @Test fun unescape() { - assertEquals("test", CsvLexicon.unescape("""test""")) - assertEquals("\u0000", CsvLexicon.unescape("""\u0000""")) - assertEquals("あ", CsvLexicon.unescape("""\u3042""")) - assertEquals("あ5", CsvLexicon.unescape("""\u30425""")) - assertEquals("💕", CsvLexicon.unescape("""\u{1f495}""")) - assertEquals("\udbff\udfff", CsvLexicon.unescape("""\u{10ffff}""")) + assertEquals("test", Unescape.unescape("""test""")) + assertEquals("\u0000", Unescape.unescape("""\u0000""")) + assertEquals("a\u0000a", Unescape.unescape("""a\u0000a""")) + 
assertEquals("あ", Unescape.unescape("""\u3042""")) + assertEquals("あ5", Unescape.unescape("""\u30425""")) + assertEquals("💕", Unescape.unescape("""\u{1f495}""")) + assertEquals("a💕x", Unescape.unescape("""a\u{1f495}x""")) + assertEquals("\udbff\udfff", Unescape.unescape("""\u{10ffff}""")) } @Test fun unescapeFails() { - assertFails { CsvLexicon.unescape("""\u{FFFFFF}""") } - assertFails { CsvLexicon.unescape("""\u{110000}""") } // 0x10ffff is the largest codepoint + assertFails { Unescape.unescape("""\u{FFFFFF}""") } + assertFails { Unescape.unescape("""\u{110000}""") } // 0x10ffff is the largest codepoint } } diff --git a/src/test/java/com/worksap/nlp/sudachi/dictionary/build/RawLexiconReaderTest.kt b/src/test/java/com/worksap/nlp/sudachi/dictionary/build/RawLexiconReaderTest.kt new file mode 100644 index 00000000..814ca509 --- /dev/null +++ b/src/test/java/com/worksap/nlp/sudachi/dictionary/build/RawLexiconReaderTest.kt @@ -0,0 +1,76 @@ +/* + * Copyright (c) 2022 Works Applications Co., Ltd. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package com.worksap.nlp.sudachi.dictionary.build + +import com.worksap.nlp.sudachi.cps +import com.worksap.nlp.sudachi.dictionary.CSVParser +import kotlin.test.Test +import kotlin.test.assertEquals +import kotlin.test.assertNotNull +import kotlin.test.assertNull + +class RawLexiconReaderTest { + companion object { + fun csv(name: String): CSVParser { + val stream = cps(name) + return CSVParser(stream.reader()) + } + } + + @Test + fun legacyCsvWithMinimumFields() { + val reader = RawLexiconReader(csv("legacy-minimum.csv"), POSTable()) + assertNotNull(reader.nextEntry()).let { e -> + assertEquals("東京都", e.headword) + assertEquals("トウキョウト", e.reading) + assertEquals("5/9", e.wordStructureString) + assertEquals("", e.cUnitSplitString) + assertEquals("", e.userData) + } + assertNull(reader.nextEntry()) + } + + @Test + fun legacyCsvWithAllFields() { + val reader = RawLexiconReader(csv("legacy-full.csv"), POSTable()) + assertNotNull(reader.nextEntry()).let { e -> + assertEquals("東京都", e.headword) + assertEquals("トウキョウト", e.reading) + assertEquals("5/9", e.wordStructureString) + assertEquals("8/9", e.cUnitSplitString) + assertEquals("10", e.userData) + } + assertNull(reader.nextEntry()) + } + + @Test fun headerCsvMinimumFields() {} + + @Test + fun headerCsvAllFields() { + val reader = RawLexiconReader(csv("headers-all.csv"), POSTable()) + assertNotNull(reader.nextEntry()).let { e -> + assertEquals("東京都", e.headword) + assertEquals("トウキョウト", e.reading) + assertEquals("5/9", e.aUnitSplitString) + assertEquals("5/10", e.bUnitSplitString) + assertEquals("5/11", e.cUnitSplitString) + assertEquals("6/7", e.wordStructureString) + assertEquals("10", e.userData) + } + assertNull(reader.nextEntry()) + } +} diff --git a/src/test/java/com/worksap/nlp/sudachi/dictionary/build/StringStorageTest.kt b/src/test/java/com/worksap/nlp/sudachi/dictionary/build/StringStorageTest.kt index e7ef37b2..1a5abcf0 100644 --- 
a/src/test/java/com/worksap/nlp/sudachi/dictionary/build/StringStorageTest.kt +++ b/src/test/java/com/worksap/nlp/sudachi/dictionary/build/StringStorageTest.kt @@ -1,3 +1,19 @@ +/* + * Copyright (c) 2022 Works Applications Co., Ltd. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + package com.worksap.nlp.sudachi.dictionary.build import kotlin.test.Test @@ -5,29 +21,29 @@ import kotlin.test.assertEquals class StringStorageTest { - @Test - fun simple() { - val strs = StringStorage() - strs.add("test") - strs.add("es") - strs.compile() - val data = strs.strings; - assertEquals(2, data.size) - assertEquals(1, data["es"]?.start) - assertEquals(3, data["es"]?.end) - } + @Test + fun simple() { + val strs = StringStorage() + strs.add("test") + strs.add("es") + strs.compile() + val data = strs.strings + assertEquals(2, data.size) + assertEquals(1, data["es"]?.start) + assertEquals(3, data["es"]?.end) + } - @Test - fun oneChar() { - val strs = StringStorage() - strs.add("x") - strs.add("y") - strs.compile() - val data = strs.strings - assertEquals(2, data.size) - assertEquals(0, data["x"]?.start) - assertEquals(1, data["x"]?.end) - assertEquals(0, data["y"]?.start) - assertEquals(1, data["y"]?.end) - } -} \ No newline at end of file + @Test + fun oneChar() { + val strs = StringStorage() + strs.add("x") + strs.add("y") + strs.compile() + val data = strs.strings + assertEquals(2, data.size) + assertEquals(0, data["x"]?.start) + assertEquals(1, data["x"]?.end) + 
assertEquals(0, data["y"]?.start) + assertEquals(1, data["y"]?.end) + } +} diff --git a/src/test/java/com/worksap/nlp/sudachi/dictionary/build/SystemDicTest.kt b/src/test/java/com/worksap/nlp/sudachi/dictionary/build/SystemDicTest.kt index 096eaef0..dfe5ec35 100644 --- a/src/test/java/com/worksap/nlp/sudachi/dictionary/build/SystemDicTest.kt +++ b/src/test/java/com/worksap/nlp/sudachi/dictionary/build/SystemDicTest.kt @@ -18,6 +18,7 @@ package com.worksap.nlp.sudachi.dictionary.build import com.worksap.nlp.sudachi.dictionary.BinaryDictionary import com.worksap.nlp.sudachi.dictionary.POS +import com.worksap.nlp.sudachi.res import kotlin.test.* class SystemDicTest { diff --git a/src/test/java/com/worksap/nlp/sudachi/dictionary/build/UserDicTest.kt b/src/test/java/com/worksap/nlp/sudachi/dictionary/build/UserDicTest.kt index 979d66d6..d2918077 100644 --- a/src/test/java/com/worksap/nlp/sudachi/dictionary/build/UserDicTest.kt +++ b/src/test/java/com/worksap/nlp/sudachi/dictionary/build/UserDicTest.kt @@ -16,10 +16,7 @@ package com.worksap.nlp.sudachi.dictionary.build -import com.worksap.nlp.sudachi.Config -import com.worksap.nlp.sudachi.Dictionary -import com.worksap.nlp.sudachi.DictionaryFactory -import com.worksap.nlp.sudachi.WordId +import com.worksap.nlp.sudachi.* import com.worksap.nlp.sudachi.dictionary.BinaryDictionary import com.worksap.nlp.sudachi.dictionary.DictionaryAccess import com.worksap.nlp.sudachi.dictionary.POS @@ -29,10 +26,6 @@ import kotlin.test.assertContentEquals import kotlin.test.assertEquals import kotlin.test.assertFails -fun T.res(name: String): URL { - return javaClass.getResource(name) ?: throw IllegalArgumentException("$name was not found") -} - class TestDic { private var matrixUrl: URL = res("test.matrix") private lateinit var systemDic: BinaryDictionary diff --git a/src/test/java/com/worksap/nlp/sudachi/dictionary/build/WordLayoutTest.kt b/src/test/java/com/worksap/nlp/sudachi/dictionary/build/WordLayoutTest.kt index 1cf108fa..f95a6be4 
100644 --- a/src/test/java/com/worksap/nlp/sudachi/dictionary/build/WordLayoutTest.kt +++ b/src/test/java/com/worksap/nlp/sudachi/dictionary/build/WordLayoutTest.kt @@ -1,3 +1,19 @@ +/* + * Copyright (c) 2022 Works Applications Co., Ltd. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + package com.worksap.nlp.sudachi.dictionary.build import com.worksap.nlp.sudachi.dictionary.StringPtr @@ -5,95 +21,96 @@ import java.nio.CharBuffer import kotlin.test.Test import kotlin.test.assertEquals import kotlin.test.assertNotNull +import kotlin.test.assertTrue class WordLayoutTest { - companion object { - fun CharBuffer.read(ptr: StringPtr): String { - return substring(ptr.offset, ptr.offset + ptr.length) - } + companion object { + fun CharBuffer.read(ptr: StringPtr): String { + return substring(ptr.offset, ptr.offset + ptr.length) } + } + @Test + fun alignmentBasedPlacement() { + val layout = WordLayout() + val p1 = layout.add("0".repeat(25)) + val p2 = layout.add("1".repeat(23)) + val p3 = layout.add("2".repeat(15)) + val p4 = layout.add("3".repeat(4)) + val p5 = layout.add("4".repeat(1)) + val p6 = layout.add("5".repeat(2)) + val chan = InMemoryChannel() + layout.write(chan) + val chars = chan.buffer().asCharBuffer() + assertEquals("0".repeat(25), chars.read(p1)) + assertEquals("1".repeat(23), chars.read(p2)) + assertEquals("2".repeat(15), chars.read(p3)) + assertEquals("3".repeat(4), chars.read(p4)) + assertEquals("4".repeat(1), chars.read(p5)) + 
assertEquals("5".repeat(2), chars.read(p6)) + // last two should be placed between 0-s and 1-s + assertTrue(p5.offset < p2.offset) + assertTrue(p6.offset < p2.offset) + } - @Test - fun alignmentBasedPlacement() { - val layout = WordLayout() - val p1 = layout.add("0".repeat(25)) - val p2 = layout.add("1".repeat(23)) - val p3 = layout.add("2".repeat(15)) - val p4 = layout.add("3".repeat(3)) - val p5 = layout.add("4".repeat(1)) - val p6 = layout.add("5".repeat(2)) - val chan = InMemoryChannel() - layout.write(chan) - val chars = chan.buffer().asCharBuffer() - assertEquals("0".repeat(25), chars.read(p1)) - assertEquals("1".repeat(23), chars.read(p2)) - assertEquals("2".repeat(15), chars.read(p3)) - assertEquals("3".repeat(3), chars.read(p4)) - assertEquals("4".repeat(1), chars.read(p5)) - assertEquals("5".repeat(2), chars.read(p6)) - assert(p5.offset < p2.offset) - assert(p6.offset < p2.offset) + @Test + fun alignmentPlacedPlacementLarge() { + val layout = WordLayout() + val ptrs = ArrayList() + for (i in 0..499) { + val char = 500 - i + val str = char.toChar().toString().repeat(char) + ptrs.add(layout.add(str)) } - - @Test - fun alignmentPlacedPlacementLarge() { - val layout = WordLayout() - val ptrs = ArrayList() - for (i in 0..499) { - val char = 500 - i - val str = char.toChar().toString().repeat(char) - ptrs.add(layout.add(str)) - } - val chan = InMemoryChannel() - layout.write(chan) - val chars = chan.buffer().asCharBuffer() - for (i in 0..499) { - val char = 500 - i - val expected = char.toChar().toString().repeat(char) - val actual = chars.read(ptrs[i]) - assertEquals(expected, actual) - } + val chan = InMemoryChannel() + layout.write(chan) + val chars = chan.buffer().asCharBuffer() + for (i in 0..499) { + val char = 500 - i + val expected = char.toChar().toString().repeat(char) + val actual = chars.read(ptrs[i]) + assertEquals(expected, actual) } + } - @Test - fun alignmentPlacedPlacementHoles() { - val layout = WordLayout() - val ptrs = ArrayList() - for (i 
in 0..3) { - val count = 200 - 5 * i - val str = i.toChar().toString().repeat(count) - ptrs.add(layout.add(str)) - } - for (i in 0..20) { - val count = 21 - i - val str = (20 + i).toChar().toString().repeat(count) - ptrs.add(layout.add(str)) - } - val chan = InMemoryChannel() - layout.write(chan) - val chars = chan.buffer().asCharBuffer() - for (i in 0..3) { - val count = 200 - 5 * i - val char = i.toChar() - val expected = char.toString().repeat(count) - val actual = chars.read(ptrs[i]) - assertEquals(expected, actual) - } - for (i in 0..20) { - val count = 21 - i - val char = (20 + i).toChar() - val expected = char.toString().repeat(count) - val actual = chars.read(ptrs[4 + i]) - assertEquals(expected, actual) - } + @Test + fun alignmentPlacedPlacementHoles() { + val layout = WordLayout() + val ptrs = ArrayList() + for (i in 0..3) { + val count = 200 - 5 * i + val str = i.toChar().toString().repeat(count) + ptrs.add(layout.add(str)) } - - @Test - fun coverage() { - val layout = WordLayout() - assertEquals(0, layout.wastedBytes()) - assertEquals(0, layout.numSlots()) - assertNotNull(layout.toString()) + for (i in 0..20) { + val count = 21 - i + val str = (20 + i).toChar().toString().repeat(count) + ptrs.add(layout.add(str)) } -} \ No newline at end of file + val chan = InMemoryChannel() + layout.write(chan) + val chars = chan.buffer().asCharBuffer() + for (i in 0..3) { + val count = 200 - 5 * i + val char = i.toChar() + val expected = char.toString().repeat(count) + val actual = chars.read(ptrs[i]) + assertEquals(expected, actual) + } + for (i in 0..20) { + val count = 21 - i + val char = (20 + i).toChar() + val expected = char.toString().repeat(count) + val actual = chars.read(ptrs[4 + i]) + assertEquals(expected, actual) + } + } + + @Test + fun coverage() { + val layout = WordLayout() + assertEquals(0, layout.wastedBytes()) + assertEquals(0, layout.numSlots()) + assertNotNull(layout.toString()) + } +} diff --git 
a/src/test/java/com/worksap/nlp/sudachi/resources.kt b/src/test/java/com/worksap/nlp/sudachi/resources.kt new file mode 100644 index 00000000..4206e9ed --- /dev/null +++ b/src/test/java/com/worksap/nlp/sudachi/resources.kt @@ -0,0 +1,28 @@ +/* + * Copyright (c) 2022 Works Applications Co., Ltd. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.worksap.nlp.sudachi + +import java.io.InputStream +import java.net.URL + +fun T.res(name: String): URL { + return javaClass.getResource(name) ?: throw IllegalArgumentException("$name was not found") +} + +fun T.cps(name: String): InputStream { + return res(name).openStream() +} diff --git a/src/test/resources/com/worksap/nlp/sudachi/dictionary/build/headers-all.csv b/src/test/resources/com/worksap/nlp/sudachi/dictionary/build/headers-all.csv new file mode 100644 index 00000000..2d38ebef --- /dev/null +++ b/src/test/resources/com/worksap/nlp/sudachi/dictionary/build/headers-all.csv @@ -0,0 +1,2 @@ +Surface,LeftId,RightId,Cost,pos1,pos2,pos3,pos4,pos5,pos6,reading_form,normalized_form,DictionaryForm,mode,splita,splitb,splitc,wordstructure,synonymgroups,userdata +東京都,6,8,5320,名詞,固有名詞,地名,一般,*,*,トウキョウト,東京都,*,B,5/9,5/10,5/11,6/7,8/9,10 \ No newline at end of file diff --git a/src/test/resources/com/worksap/nlp/sudachi/dictionary/build/legacy-full.csv b/src/test/resources/com/worksap/nlp/sudachi/dictionary/build/legacy-full.csv new file mode 100644 index 00000000..98cc00c5 --- /dev/null +++ 
b/src/test/resources/com/worksap/nlp/sudachi/dictionary/build/legacy-full.csv @@ -0,0 +1 @@ +東京都,6,8,5320,東京都,名詞,固有名詞,地名,一般,*,*,トウキョウト,東京都,*,B,5/9,*,5/9,6/7,8/9,10 \ No newline at end of file diff --git a/src/test/resources/com/worksap/nlp/sudachi/dictionary/build/legacy-minimum.csv b/src/test/resources/com/worksap/nlp/sudachi/dictionary/build/legacy-minimum.csv new file mode 100644 index 00000000..8ee89d59 --- /dev/null +++ b/src/test/resources/com/worksap/nlp/sudachi/dictionary/build/legacy-minimum.csv @@ -0,0 +1 @@ +東京都,6,8,5320,東京都,名詞,固有名詞,地名,一般,*,*,トウキョウト,東京都,*,B,5/9,*,5/9,* \ No newline at end of file From 9692c58d719b02a5cd445334f5e6f611bd050abe Mon Sep 17 00:00:00 2001 From: Arseny Tolmachev Date: Mon, 5 Sep 2022 13:37:03 +0900 Subject: [PATCH 09/94] add computation of utf8 length without allocations --- .../com/worksap/nlp/sudachi/StringUtil.java | 37 +++++++++++++++++ .../com/worksap/nlp/sudachi/StringUtilTest.kt | 41 +++++++++++++++++++ 2 files changed, 78 insertions(+) diff --git a/src/main/java/com/worksap/nlp/sudachi/StringUtil.java b/src/main/java/com/worksap/nlp/sudachi/StringUtil.java index afef3b88..da39bbf2 100644 --- a/src/main/java/com/worksap/nlp/sudachi/StringUtil.java +++ b/src/main/java/com/worksap/nlp/sudachi/StringUtil.java @@ -115,4 +115,41 @@ public static String readLengthPrefixed(ByteBuffer buffer) { buffer.limit(limit); return result; } + + public static int countUtf8Bytes(CharSequence seq) { + return countUtf8Bytes(seq, 0, seq.length()); + } + + public static int countUtf8Bytes(CharSequence seq, int start, int end) { + if (start < 0) { + throw new IllegalArgumentException("start < 0, was " + start); + } + if (start > seq.length()) { + throw new IllegalArgumentException(String.format("start > length(): %d length()=%d", start, seq.length())); + } + if (end > seq.length()) { + throw new IllegalArgumentException(String.format("end > length(): %d length()=%d", start, seq.length())); + } + + int result = 0; + for (int i = start; i < end;) 
{ + int cpt = Character.codePointAt(seq, i); + result += utf8Length(cpt); + i += Character.charCount(cpt); + } + return result; + } + + private static int utf8Length(int codepoint) { + // https://en.wikipedia.org/wiki/UTF-8#Encoding + if (codepoint < 0x80) { + return 1; + } else if (codepoint < 0x800) { + return 2; + } else if (codepoint < 0x10000) { + return 3; + } else { + return 4; + } + } } diff --git a/src/test/java/com/worksap/nlp/sudachi/StringUtilTest.kt b/src/test/java/com/worksap/nlp/sudachi/StringUtilTest.kt index f90190ed..8c05fa72 100644 --- a/src/test/java/com/worksap/nlp/sudachi/StringUtilTest.kt +++ b/src/test/java/com/worksap/nlp/sudachi/StringUtilTest.kt @@ -16,9 +16,11 @@ package com.worksap.nlp.sudachi +import kotlin.random.Random import kotlin.test.Test import kotlin.test.assertContentEquals import kotlin.test.assertEquals +import kotlin.test.assertFailsWith class StringUtilTest { @Test @@ -32,4 +34,43 @@ class StringUtilTest { buf.get(arr2) assertContentEquals(bytes, arr2) } + + @Test + fun countUtf8Bytes() { + assertEquals(0, StringUtil.countUtf8Bytes("")) + assertEquals(4, StringUtil.countUtf8Bytes("test")) + assertEquals(12, StringUtil.countUtf8Bytes("привет")) + assertEquals(9, StringUtil.countUtf8Bytes("東京都")) + assertEquals(4, StringUtil.countUtf8Bytes("💞")) + assertEquals(13, StringUtil.countUtf8Bytes("東京💞都")) + assertEquals(17, StringUtil.countUtf8Bytes("t東e京s💞t都")) + // https://emojipedia.org/family-man-woman-girl-boy/ + assertEquals(25, StringUtil.countUtf8Bytes("""👨‍👩‍👧‍👦""")) + } + @Test + fun countUtf8BytesRandomInput() { + for (iter in 1..1000) { + val r = Random(5) + val len = r.nextInt(iter) + val str = + generateSequence { r.nextInt(0x15000) } + .filterNot { Character.isBmpCodePoint(it) && Character.isSurrogate(it.toChar()) } + .take(len) + .fold(StringBuilder()) { s, i -> s.appendCodePoint(i) } + .toString() + val expected = str.toByteArray().size + assertEquals( + expected, + StringUtil.countUtf8Bytes(str, 0, str.length), + 
"failed to count utf8 bytes for iter=$iter, [$str]") + } + } + + @Test + fun invalidParamters() { + assertFailsWith { StringUtil.countUtf8Bytes("", -1, 0) } + assertFailsWith { StringUtil.countUtf8Bytes("", 0, 1) } + assertFailsWith { StringUtil.countUtf8Bytes("test", 0, 6) } + assertFailsWith { StringUtil.countUtf8Bytes("test", 6, 0) } + } } From db3f5c4e521ff8b83349bc884d4693469a23e028 Mon Sep 17 00:00:00 2001 From: Arseny Tolmachev Date: Mon, 5 Sep 2022 13:45:56 +0900 Subject: [PATCH 10/94] fix line break in when reading POS --- .../nlp/sudachi/dictionary/build/RawLexiconReader.java | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/RawLexiconReader.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/RawLexiconReader.java index 7fa238e9..2cb5aed1 100644 --- a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/RawLexiconReader.java +++ b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/RawLexiconReader.java @@ -136,7 +136,9 @@ private RawWordEntry convertEntry(List data) { entry.dictionaryFormRef = get(data, Column.DictionaryForm, false); entry.normalizedFormRef = get(data, Column.NormalizedForm, false); - POS pos = new POS(get(data, Column.Pos1, true), get(data, Column.Pos2, true), get(data, Column.Pos3, true), + POS pos = new POS( + // comment for line break + get(data, Column.Pos1, true), get(data, Column.Pos2, true), get(data, Column.Pos3, true), get(data, Column.Pos4, true), get(data, Column.Pos5, true), get(data, Column.Pos6, true)); entry.posId = posTable.getId(pos); From 0c8bb9b2c55350ce80ef42824f6ae074cfc9a79e Mon Sep 17 00:00:00 2001 From: Arseny Tolmachev Date: Fri, 9 Sep 2022 17:33:19 +0900 Subject: [PATCH 11/94] writing new dictionary wip --- build.gradle | 4 +- .../sudachi/dictionary/CountCharsBench.java | 91 +++++++++++ .../dictionary/DoubleArrayLookupBench.java | 12 +- .../nlp/sudachi/dictionary/Download.java | 41 +++++ 
.../nlp/sudachi/dictionary/Blocks.java | 24 +++ .../nlp/sudachi/dictionary/CSVParser.java | 11 +- .../worksap/nlp/sudachi/dictionary/Ints.java | 9 ++ .../nlp/sudachi/dictionary/StringPtr.java | 3 + .../nlp/sudachi/dictionary/build/Align.java | 21 ++- .../dictionary/build/BlockHandler.java | 23 +++ .../sudachi/dictionary/build/BlockLayout.java | 58 +++++++ .../sudachi/dictionary/build/BlockOutput.java | 50 ++++++ .../sudachi/dictionary/build/BufWriter.java | 74 +++++++++ ...nicodeBuffer.java => ChanneledBuffer.java} | 47 +++++- .../sudachi/dictionary/build/DicBuilder2.java | 34 ++++- .../sudachi/dictionary/build/IOFunction.java | 24 +++ .../sudachi/dictionary/build/IOSupplier.java | 24 +++ .../nlp/sudachi/dictionary/build/Index.java | 79 ++++++++-- .../nlp/sudachi/dictionary/build/Lookup2.java | 8 +- .../dictionary/build/ProgressInputStream.java | 72 +++++++++ .../sudachi/dictionary/build/RawLexicon.java | 59 +++++++- .../dictionary/build/RawLexiconReader.java | 18 ++- .../dictionary/build/RawWordEntry.java | 96 +++++++++--- .../nlp/sudachi/dictionary/build/Stats.java | 32 ++++ .../dictionary/build/StringStorage.java | 13 +- .../dictionary/build/WordEntryLayout.java | 142 ++++++++++++++++++ .../build/WordInfoLayoutFixedWidth.java | 141 ----------------- .../sudachi/dictionary/build/WordLayout.java | 5 +- .../nlp/sudachi/dictionary/build/WordRef.java | 45 ++++-- .../dictionary/build/RawLexiconReaderTest.kt | 10 +- .../dictionary/build/StringStorageTest.kt | 4 +- .../java/com/worksap/nlp/sudachi/resources.kt | 2 +- .../sudachi/dictionary/build/headers-all.csv | 2 +- 33 files changed, 1043 insertions(+), 235 deletions(-) create mode 100644 src/jmh/java/com/worksap/nlp/sudachi/dictionary/CountCharsBench.java create mode 100644 src/jmh/java/com/worksap/nlp/sudachi/dictionary/Download.java create mode 100644 src/main/java/com/worksap/nlp/sudachi/dictionary/Blocks.java create mode 100644 src/main/java/com/worksap/nlp/sudachi/dictionary/build/BlockHandler.java create mode 
100644 src/main/java/com/worksap/nlp/sudachi/dictionary/build/BlockLayout.java create mode 100644 src/main/java/com/worksap/nlp/sudachi/dictionary/build/BlockOutput.java create mode 100644 src/main/java/com/worksap/nlp/sudachi/dictionary/build/BufWriter.java rename src/main/java/com/worksap/nlp/sudachi/dictionary/build/{UnicodeBuffer.java => ChanneledBuffer.java} (59%) create mode 100644 src/main/java/com/worksap/nlp/sudachi/dictionary/build/IOFunction.java create mode 100644 src/main/java/com/worksap/nlp/sudachi/dictionary/build/IOSupplier.java create mode 100644 src/main/java/com/worksap/nlp/sudachi/dictionary/build/ProgressInputStream.java create mode 100644 src/main/java/com/worksap/nlp/sudachi/dictionary/build/Stats.java create mode 100644 src/main/java/com/worksap/nlp/sudachi/dictionary/build/WordEntryLayout.java delete mode 100644 src/main/java/com/worksap/nlp/sudachi/dictionary/build/WordInfoLayoutFixedWidth.java diff --git a/build.gradle b/build.gradle index 588b19ac..29697dec 100644 --- a/build.gradle +++ b/build.gradle @@ -11,7 +11,7 @@ plugins { id 'com.diffplug.spotless' version '6.9.1' id 'distribution' id 'signing' - id 'me.champeau.jmh' version "0.6.6" + id 'me.champeau.jmh' version "0.6.7" id 'io.github.gradle-nexus.publish-plugin' version "1.1.0" } @@ -27,7 +27,7 @@ dependencies { testImplementation 'org.hamcrest:hamcrest-library:2.2' testImplementation 'junit:junit:4.13.2' testImplementation 'org.openjdk.jmh:jmh-core:1.35' - testImplementation 'org.openjdk.jmh:jmh-generator-annprocess:1.35' + jmhAnnotationProcessor 'org.openjdk.jmh:jmh-generator-annprocess:1.35' testImplementation 'org.jetbrains.kotlin:kotlin-stdlib-jdk8:1.7.10' testImplementation 'org.jetbrains.kotlin:kotlin-test-junit:1.7.10' } diff --git a/src/jmh/java/com/worksap/nlp/sudachi/dictionary/CountCharsBench.java b/src/jmh/java/com/worksap/nlp/sudachi/dictionary/CountCharsBench.java new file mode 100644 index 00000000..4a582a87 --- /dev/null +++ 
b/src/jmh/java/com/worksap/nlp/sudachi/dictionary/CountCharsBench.java @@ -0,0 +1,91 @@ +/* + * Copyright (c) 2022 Works Applications Co., Ltd. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.worksap.nlp.sudachi.dictionary; + +import org.openjdk.jmh.annotations.*; + +import java.io.IOException; +import java.nio.file.Files; +import java.nio.file.Path; +import java.nio.file.Paths; +import java.util.stream.Stream; + +@BenchmarkMode(Mode.Throughput) +@State(Scope.Benchmark) +@Warmup(time = 5, iterations = 3) +@Measurement(iterations = 7, time = 5) +@Fork(value = 1) +public class CountCharsBench { + private String[] data; + + @Setup + public void setup() throws IOException { + Path keysFile = Paths.get("build/darray/kwdlc.txt"); + Download.downloadIfNotExist(keysFile, + "https://github.com/ku-nlp/KWDLC/releases/download/release_1_0/leads.org.txt.gz", true); + try (Stream data = Files.lines(keysFile)) { + this.data = data.toArray(String[]::new); + } + } + + @Benchmark + @OperationsPerInvocation(15000) + public int naiveImpl() { + int count = 0; + char toFind = 'の'; + for (String s : data) { + for (int i = 0; i < s.length(); i++) { + char c = s.charAt(i); + if (c == toFind) { + count += 1; + } + } + } + return count; + } + + @Benchmark + @OperationsPerInvocation(15000) + public int indexOfImpl() { + int count = 0; + char toFind = 'の'; + for (String sequence : data) { + int idx = 0; + int end = sequence.length(); + while (idx < end) { + idx = 
sequence.indexOf(toFind, idx); + if (idx < 0) { + break; + } + idx += 1; + count += 1; + } + } + return count; + } + + @Benchmark + @OperationsPerInvocation(15000) + public int streamImpl() { + int count = 0; + char toFind = 'の'; + for (String sequence : data) { + count += sequence.chars().filter(c -> c == toFind).count(); + } + return count; + } +} diff --git a/src/jmh/java/com/worksap/nlp/sudachi/dictionary/DoubleArrayLookupBench.java b/src/jmh/java/com/worksap/nlp/sudachi/dictionary/DoubleArrayLookupBench.java index 17f98817..7d61fe09 100644 --- a/src/jmh/java/com/worksap/nlp/sudachi/dictionary/DoubleArrayLookupBench.java +++ b/src/jmh/java/com/worksap/nlp/sudachi/dictionary/DoubleArrayLookupBench.java @@ -56,16 +56,8 @@ public class DoubleArrayLookupBench { @Setup() public void setup() throws IOException { Path keysFile = Paths.get("build/darray/keys.txt"); - if (Files.notExists(keysFile)) { - // download from internet if not exists - Files.createDirectories(keysFile.getParent()); - // Sudachi Dictionary keys for all words (full dictionary) - URL keysUrl = new URL("https://github.com/eiennohito/xtime/releases/download/v0.0.1/keys.txt.gz"); - try (InputStream is = keysUrl.openStream()) { - GZIPInputStream gzipStream = new GZIPInputStream(is); - Files.copy(gzipStream, keysFile); - } - } + Download.downloadIfNotExist(keysFile, + "https://github.com/eiennohito/xtime/releases/download/v0.0.1/keys.txt.gz", true); keyCandidates = Files.lines(keysFile).map(l -> l.getBytes(StandardCharsets.UTF_8)).collect(Collectors.toList()); keyCandidates.sort((a, b) -> { int len = Math.min(a.length, b.length); diff --git a/src/jmh/java/com/worksap/nlp/sudachi/dictionary/Download.java b/src/jmh/java/com/worksap/nlp/sudachi/dictionary/Download.java new file mode 100644 index 00000000..f77f07bd --- /dev/null +++ b/src/jmh/java/com/worksap/nlp/sudachi/dictionary/Download.java @@ -0,0 +1,41 @@ +/* + * Copyright (c) 2022 Works Applications Co., Ltd. 
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.worksap.nlp.sudachi.dictionary; + +import java.io.IOException; +import java.io.InputStream; +import java.net.URL; +import java.nio.file.Files; +import java.nio.file.Path; +import java.util.zip.GZIPInputStream; + +public class Download { + public static void downloadIfNotExist(Path file, String url, boolean gzip) throws IOException { + if (Files.exists(file)) { + return; + } + Files.createDirectories(file.getParent()); + URL toDownload = new URL(url); + try (InputStream is = toDownload.openStream()) { + InputStream stream = is; + if (gzip) { + stream = new GZIPInputStream(is); + } + Files.copy(stream, file); + } + } +} diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/Blocks.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/Blocks.java new file mode 100644 index 00000000..1021c872 --- /dev/null +++ b/src/main/java/com/worksap/nlp/sudachi/dictionary/Blocks.java @@ -0,0 +1,24 @@ +/* + * Copyright (c) 2022 Works Applications Co., Ltd. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.worksap.nlp.sudachi.dictionary; + +public class Blocks { + public static final String WORD_ID_TABLE = "WordIdTable"; + public static final String TRIE_INDEX = "TrieIndex"; + public static final String STRINGS = "Strings"; + public static final String ENTRIES = "Entries"; +} diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/CSVParser.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/CSVParser.java index bdc75acb..debeeec1 100644 --- a/src/main/java/com/worksap/nlp/sudachi/dictionary/CSVParser.java +++ b/src/main/java/com/worksap/nlp/sudachi/dictionary/CSVParser.java @@ -28,7 +28,6 @@ import java.util.regex.Pattern; public class CSVParser implements Closeable { - static class Token { enum Type { @@ -59,6 +58,8 @@ enum Type { private boolean hasNextField = false; private int row = -1; + private String name = ""; + public CSVParser(Reader reader) { this.reader = new BufferedReader(reader); } @@ -240,4 +241,12 @@ private Token getToken() throws IOException { private void ungetToken(Token token) { tokenBuffer.push(token); } + + public String getName() { + return name; + } + + public void setName(String name) { + this.name = name; + } } \ No newline at end of file diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/Ints.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/Ints.java index 05df687a..8c6123a7 100644 --- a/src/main/java/com/worksap/nlp/sudachi/dictionary/Ints.java +++ b/src/main/java/com/worksap/nlp/sudachi/dictionary/Ints.java @@ -18,6 +18,7 @@ import java.nio.ByteBuffer; import 
java.util.Arrays; +import java.util.StringJoiner; public class Ints { private int[] data; @@ -87,4 +88,12 @@ public static int[] readArray(ByteBuffer buffer, int len) { return result; } + @Override + public String toString() { + StringJoiner joiner = new StringJoiner(", ", "I[", "]"); + for (int i = 0; i < length; ++i) { + joiner.add(String.valueOf(data[i])); + } + return joiner.toString(); + } } diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/StringPtr.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/StringPtr.java index 7c30053e..5c4b631d 100644 --- a/src/main/java/com/worksap/nlp/sudachi/dictionary/StringPtr.java +++ b/src/main/java/com/worksap/nlp/sudachi/dictionary/StringPtr.java @@ -88,6 +88,9 @@ public static int requiredAlignment(int length) { } static boolean isValid(int offset, int length) { + if (length < 0 || length > MAX_LENGTH) { + return false; + } int alignment = requiredAlignment(length); if (alignment == 0) { return true; diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/Align.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/Align.java index 2368ac35..407bce59 100644 --- a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/Align.java +++ b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/Align.java @@ -21,7 +21,7 @@ public class Align { private Align() { } - public static boolean isPowerOf2(int value) { + public static boolean isPowerOf2(long value) { return (value & value - 1) == 0; } @@ -35,14 +35,29 @@ public static boolean isPowerOf2(int value) { * @return aligned value, it should be greater or equal than the passed value */ public static int align(int value, int alignment) { + return (int) align((long) value, alignment); + } + + /** + * Aligns value to the alignment + * + * @param value + * value to be aligned + * @param alignment + * required alignment as a power of two + * @return aligned value, it should be greater or equal than the passed value + */ + public static long 
align(long value, long alignment) { assert isPowerOf2(alignment); + assert value >= 0; + // Compute alignment mask, it is the inverse of the mask for the bits that must // be 0 for alignment to be correct // Checking mask is computed as alignment - 1. E.g. 7 for alignment of 8, or 15 // for alignment of 16. // The second one is its inverse. - int bits = alignment - 1; - int mask = ~bits; + long bits = alignment - 1; + long mask = ~bits; return (value + bits) & mask; } } diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/BlockHandler.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/BlockHandler.java new file mode 100644 index 00000000..64b44aa1 --- /dev/null +++ b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/BlockHandler.java @@ -0,0 +1,23 @@ +/* + * Copyright (c) 2022 Works Applications Co., Ltd. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.worksap.nlp.sudachi.dictionary.build; + +import java.io.IOException; + +public interface BlockHandler { + T apply(BlockOutput output) throws IOException; +} diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/BlockLayout.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/BlockLayout.java new file mode 100644 index 00000000..63947e9f --- /dev/null +++ b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/BlockLayout.java @@ -0,0 +1,58 @@ +/* + * Copyright (c) 2022 Works Applications Co., Ltd. 
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.worksap.nlp.sudachi.dictionary.build; + +import java.io.IOException; +import java.nio.channels.SeekableByteChannel; +import java.util.ArrayList; +import java.util.List; + +public class BlockLayout { + private final SeekableByteChannel channel; + private final Progress progress; + + public BlockLayout(SeekableByteChannel channel, Progress progress) throws IOException { + this.channel = channel; + this.progress = progress; + channel.position(4096); + } + + public T block(String name, BlockHandler handler) throws IOException { + SeekableByteChannel chan = channel; + long start = chan.position(); + T result = handler.apply(new BlockOutput(chan, progress)); + long end = chan.position(); + long newPosition = Align.align(end, 4096); + chan.position(newPosition); + info.add(new BlockInfo(name, start, end)); + return result; + } + + private final static List info = new ArrayList<>(); + + private static class BlockInfo { + String name; + long start; + long end; + + public BlockInfo(String name, long start, long end) { + this.name = name; + this.start = start; + this.end = end; + } + } +} diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/BlockOutput.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/BlockOutput.java new file mode 100644 index 00000000..eb3c4e57 --- /dev/null +++ b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/BlockOutput.java @@ -0,0 +1,50 @@ +/* 
+ * Copyright (c) 2022 Works Applications Co., Ltd. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.worksap.nlp.sudachi.dictionary.build; + +import java.io.IOException; +import java.nio.channels.SeekableByteChannel; + +public class BlockOutput { + private SeekableByteChannel chan; + private Progress progress; + + private Stats stats; + + public BlockOutput(SeekableByteChannel chan, Progress progress) { + this.chan = chan; + this.progress = progress; + } + + public SeekableByteChannel getChannel() { + return chan; + } + + public Progress getProgress() { + return progress; + } + + public T measured(String name, IOFunction fun) throws IOException { + Progress p = progress; + long start = chan.position(); + p.startBlock(name, System.nanoTime(), Progress.Kind.OUTPUT); + T result = fun.apply(p); + long size = chan.position() - start; + p.endBlock(size, System.nanoTime()); + return result; + } +} diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/BufWriter.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/BufWriter.java new file mode 100644 index 00000000..389d7675 --- /dev/null +++ b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/BufWriter.java @@ -0,0 +1,74 @@ +/* + * Copyright (c) 2022 Works Applications Co., Ltd. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.worksap.nlp.sudachi.dictionary.build; + +import com.worksap.nlp.sudachi.dictionary.Ints; + +import java.nio.ByteBuffer; + +public class BufWriter { + private final ByteBuffer buffer; + + public BufWriter(ByteBuffer buffer) { + this.buffer = buffer; + } + + public BufWriter putByte(byte val) { + buffer.put(val); + return this; + } + + // Encode int as LEB128 + public BufWriter putVarint32(int val) { + if (val <= 127) { + putByte((byte) val); + } else { + putVarintSlow(val); + } + return this; + } + + private void putVarintSlow(long val) { + while ((val & ~0x7fL) != 0) { + long b = 0x80 | (val & 0x7f); + putByte((byte) b); + val >>>= 7; + } + putByte((byte) val); + } + + public BufWriter putShort(short val) { + buffer.putShort(val); + return this; + } + + public BufWriter putInt(int val) { + buffer.putInt(val); + return this; + } + + public BufWriter putInts(Ints value, int length) { + ByteBuffer buf = buffer; + int pos = buf.position(); + for (int i = 0; i < length; ++i) { + buf.putInt(pos + i * 4, value.get(i)); + } + buf.position(pos + length * 4); + return this; + } + +} diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/UnicodeBuffer.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/ChanneledBuffer.java similarity index 59% rename from src/main/java/com/worksap/nlp/sudachi/dictionary/build/UnicodeBuffer.java rename to src/main/java/com/worksap/nlp/sudachi/dictionary/build/ChanneledBuffer.java index 8721db42..e1cb497b 100644 --- 
a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/UnicodeBuffer.java +++ b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/ChanneledBuffer.java @@ -22,17 +22,19 @@ import java.nio.CharBuffer; import java.nio.channels.WritableByteChannel; -public class UnicodeBuffer { +public class ChanneledBuffer { private final ByteBuffer buffer; private final WritableByteChannel channel; - public UnicodeBuffer(WritableByteChannel channel, int size) { + private int offset; + + public ChanneledBuffer(WritableByteChannel channel, int size) { this.channel = channel; this.buffer = ByteBuffer.allocate(size); buffer.order(ByteOrder.LITTLE_ENDIAN); } - public UnicodeBuffer(WritableByteChannel channel) { + public ChanneledBuffer(WritableByteChannel channel) { this(channel, 64 * 1024); } @@ -49,6 +51,7 @@ private CharBuffer prepare(int numChars) throws IOException { int remaining = buffer.remaining(); int byteLength = numChars * 2; if (remaining < byteLength) { + offset += buffer.position(); buffer.flip(); channel.write(buffer); buffer.clear(); @@ -61,7 +64,45 @@ private CharBuffer prepare(int numChars) throws IOException { return chars; } + public ByteBuffer byteBuffer(int maxLength) throws IOException { + ByteBuffer buf = buffer; + int remaining = buf.remaining(); + if (remaining < maxLength) { + offset += buf.position(); + buf.flip(); + channel.write(buf); + buf.clear(); + if (buf.remaining() < maxLength) { + throw new IllegalArgumentException(String.format( + "requested additionally: %d bytes, but the buffer size is %d", maxLength, buf.capacity())); + } + } + return buf; + } + + public BufWriter writer(int maxLength) throws IOException { + ByteBuffer buf = byteBuffer(maxLength); + return new BufWriter(buf); + } + public void flush() throws IOException { channel.write(buffer); + buffer.clear(); + } + + public int offset() { + return this.offset + buffer.position(); + } + + public int alignTo(int alignment) { + ByteBuffer buf = buffer; + int pos = buf.position(); + int 
aligned = Align.align(pos, alignment); + buf.position(aligned); + return aligned + offset; + } + + public void position(int newPosition) { + buffer.position(newPosition); } } diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/DicBuilder2.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/DicBuilder2.java index 973f2e00..d9291ca6 100644 --- a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/DicBuilder2.java +++ b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/DicBuilder2.java @@ -18,8 +18,11 @@ import java.io.IOException; import java.io.InputStream; -import java.util.function.Function; -import java.util.function.Supplier; +import java.nio.channels.SeekableByteChannel; +import java.nio.file.Files; +import java.nio.file.Path; +import java.nio.file.Paths; +import java.nio.file.StandardOpenOption; public class DicBuilder2 { private DicBuilder2() { @@ -29,18 +32,39 @@ private DicBuilder2() { public static class Base> { protected final POSTable pos = new POSTable(); protected final ConnectionMatrix connection = new ConnectionMatrix(); - protected final Index index = new Index(); protected Progress progress = Progress.NOOP; + protected RawLexicon lexicon = new RawLexicon(); @SuppressWarnings("unchecked") private T self() { return (T) this; } - public T lexicon(String name, Supplier input, long size) throws IOException { - + public T lexicon(String name, IOSupplier input, long size) throws IOException { + progress.startBlock(name, System.nanoTime(), Progress.Kind.INPUT); + try (InputStream is = input.get()) { + InputStream stream = new TrackingInputStream(is); + lexicon.read(name, stream, pos); + } + progress.endBlock(size, System.nanoTime()); return self(); } + + public void write(SeekableByteChannel channel) throws IOException { + BlockLayout layout = new BlockLayout(channel, progress); + lexicon.compile(pos, layout); + } } + public static void main(String[] args) throws IOException { + Base b = new Base<>(); + Path input = 
Paths.get(args[0]); + b.lexicon(input.getFileName().toString(), () -> Files.newInputStream(input), Files.size(input)); + Path output = Paths.get(args[1]); + Files.createDirectories(output.getParent()); + try (SeekableByteChannel chan = Files.newByteChannel(output, StandardOpenOption.WRITE, + StandardOpenOption.CREATE)) { + b.write(chan); + } + } } diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/IOFunction.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/IOFunction.java new file mode 100644 index 00000000..fdfcda9a --- /dev/null +++ b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/IOFunction.java @@ -0,0 +1,24 @@ +/* + * Copyright (c) 2022 Works Applications Co., Ltd. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.worksap.nlp.sudachi.dictionary.build; + +import java.io.IOException; + +@FunctionalInterface +public interface IOFunction { + R apply(T arg) throws IOException; +} diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/IOSupplier.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/IOSupplier.java new file mode 100644 index 00000000..ba7351e5 --- /dev/null +++ b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/IOSupplier.java @@ -0,0 +1,24 @@ +/* + * Copyright (c) 2022 Works Applications Co., Ltd. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.worksap.nlp.sudachi.dictionary.build; + +import java.io.IOException; + +@FunctionalInterface +public interface IOSupplier { + T get() throws IOException; +} diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/Index.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/Index.java index 432882e2..651e7643 100644 --- a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/Index.java +++ b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/Index.java @@ -17,6 +17,8 @@ package com.worksap.nlp.sudachi.dictionary.build; import com.worksap.nlp.dartsclone.DoubleArray; +import com.worksap.nlp.sudachi.dictionary.Blocks; +import com.worksap.nlp.sudachi.dictionary.Ints; import java.io.IOException; import java.nio.ByteBuffer; @@ -28,7 +30,7 @@ * Dictionary Parts: Trie index and entry offsets */ public class Index implements WriteDictionary { - private final SortedMap> elements = new TreeMap<>((byte[] l, byte[] r) -> { + private final SortedMap elements = new TreeMap<>((byte[] l, byte[] r) -> { int llen = l.length; int rlen = r.length; for (int i = 0; i < Math.min(llen, rlen); i++) { @@ -43,11 +45,8 @@ public class Index implements WriteDictionary { public int add(String key, int wordId) { byte[] bytes = key.getBytes(StandardCharsets.UTF_8); - List entries = elements.computeIfAbsent(bytes, k -> new ArrayList<>()); - if (entries.size() >= 255) { - throw new IllegalArgumentException(String.format("key %s has >= 255 entries in the dictionary", key)); - } - entries.add(wordId); + Ints entries = 
elements.computeIfAbsent(bytes, k -> new Ints(4)); + entries.append(wordId); count += 1; return bytes.length; } @@ -65,13 +64,15 @@ public void writeTo(ModelOutput output) throws IOException { output.withSizedPart("WordId table", () -> { int i = 0; int numEntries = this.elements.entrySet().size(); - for (Map.Entry> entry : this.elements.entrySet()) { + for (Map.Entry entry : this.elements.entrySet()) { keys[i] = entry.getKey(); values[i] = wordIdTable.position(); i++; - List wordIds = entry.getValue(); - wordIdTable.put((byte) wordIds.size()); - for (int wid : wordIds) { + Ints wordIds = entry.getValue(); + int length = wordIds.length(); + wordIdTable.put((byte) length); + for (int word = 0; word < length; ++word) { + int wid = wordIds.get(word); wordIdTable.putInt(wid); } output.progress(i, numEntries); @@ -93,4 +94,62 @@ public void writeTo(ModelOutput output) throws IOException { wordIdTable.flip(); output.write(wordIdTable); } + + public void compile(BlockLayout layout) throws IOException { + TrieData data = layout.block(Blocks.WORD_ID_TABLE, this::writeWordTable); + layout.block(Blocks.TRIE_INDEX, data::writeTrie); + } + + private TrieData writeWordTable(BlockOutput out) throws IOException { + int size = this.elements.size(); + byte[][] keys = new byte[size][]; + int[] values = new int[size]; + ChanneledBuffer buffer = new ChanneledBuffer(out.getChannel()); + + out.measured("Word Id table", (p) -> { + int i = 0; + for (Map.Entry entry : this.elements.entrySet()) { + keys[i] = entry.getKey(); + values[i] = buffer.offset(); + i++; + Ints wordIds = entry.getValue(); + int length = wordIds.length(); + BufWriter buf = buffer.writer((length + 1) * 5); + + buf.putVarint32(length); + int prevWid = 0; + for (int word = 0; word < length; ++word) { + int wid = wordIds.get(word); + buf.putVarint32(wid - prevWid); + prevWid = wid; + } + p.progress(i, size); + } + return null; + }); + + buffer.flush(); + + return new TrieData(keys, values); + } + + private static class 
TrieData { + private final byte[][] keys; + private final int[] values; + + public TrieData(byte[][] keys, int[] values) { + this.keys = keys; + this.values = values; + } + + public Void writeTrie(BlockOutput block) throws IOException { + return block.measured("Trie Index", (p) -> { + DoubleArray trie = new DoubleArray(); + trie.build(keys, values, p::progress); + ByteBuffer buf = trie.byteArray().duplicate(); + block.getChannel().write(buf); + return null; + }); + } + } } diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/Lookup2.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/Lookup2.java index 56130b68..739209e7 100644 --- a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/Lookup2.java +++ b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/Lookup2.java @@ -30,7 +30,7 @@ public interface Entry { String headword(); } - public Lookup2(List entries) { + public Lookup2(List entries) { this.entries = entries; HashMap> result = new HashMap<>(entries.size() * 4 / 3); for (Entry e : entries) { @@ -40,7 +40,7 @@ public Lookup2(List entries) { bySurface = result; } - private final List entries; + private final List entries; private final Map> bySurface; public Entry byIndex(int index) { @@ -50,4 +50,8 @@ public Entry byIndex(int index) { public List byHeadword(String headword) { return bySurface.get(headword); } + + public void add(Entry e) { + bySurface.computeIfAbsent(e.headword(), x -> new ArrayList<>()).add(e); + } } diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/ProgressInputStream.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/ProgressInputStream.java new file mode 100644 index 00000000..37087327 --- /dev/null +++ b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/ProgressInputStream.java @@ -0,0 +1,72 @@ +/* + * Copyright (c) 2022 Works Applications Co., Ltd. 
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.worksap.nlp.sudachi.dictionary.build; + +import java.io.IOException; +import java.io.InputStream; + +public class ProgressInputStream extends InputStream { + private final InputStream inner; + private long position = 0; + private final long maxSize; + private final Progress progress; + + public ProgressInputStream(InputStream inner, long maxSize, Progress progress) { + this.inner = inner; + this.maxSize = maxSize; + this.progress = progress; + } + + @Override + public int read(byte[] b) throws IOException { + int nread = inner.read(b); + if (nread != -1) { + position += nread; + progress.progress(position, maxSize); + } + return nread; + } + + @Override + public int read(byte[] b, int off, int len) throws IOException { + int nread = inner.read(b, off, len); + if (nread != -1) { + position += nread; + progress.progress(position, maxSize); + } + return nread; + } + + @Override + public int available() throws IOException { + return inner.available(); + } + + @Override + public void close() throws IOException { + inner.close(); + } + + @Override + public int read() throws IOException { + int read = inner.read(); + if (read != -1) { + position += 1; + } + return read; + } +} diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/RawLexicon.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/RawLexicon.java index 2d590d09..2afd9af0 100644 --- 
a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/RawLexicon.java +++ b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/RawLexicon.java @@ -16,6 +16,7 @@ package com.worksap.nlp.sudachi.dictionary.build; +import com.worksap.nlp.sudachi.dictionary.Blocks; import com.worksap.nlp.sudachi.dictionary.CSVParser; import java.io.IOException; @@ -28,28 +29,35 @@ public class RawLexicon { private static final long MAX_OFFSET = Integer.MAX_VALUE * 8L; + private static final int INITIAL_OFFSET = 32; private final StringStorage strings = new StringStorage(); private final List entries = new ArrayList<>(); - private long offset = 0; + private final Index index = new Index(); + private boolean user; - public void read(InputStream data, POSTable posTable) throws IOException { - read(new InputStreamReader(data, StandardCharsets.UTF_8), posTable); + private long offset = INITIAL_OFFSET; + + public void read(String name, InputStream data, POSTable posTable) throws IOException { + read(name, new InputStreamReader(data, StandardCharsets.UTF_8), posTable); } - public void read(Reader data, POSTable posTable) throws IOException { + public void read(String name, Reader data, POSTable posTable) throws IOException { CSVParser parser = new CSVParser(data); - RawLexiconReader reader = new RawLexiconReader(parser, posTable); + parser.setName(name); + RawLexiconReader reader = new RawLexiconReader(parser, posTable, user); long offset = this.offset; RawWordEntry entry; while ((entry = reader.nextEntry()) != null) { - strings.add(entry.headword); - strings.add(entry.reading); + entry.publishStrings(strings); entries.add(entry); entry.pointer = pointer(offset); offset += entry.computeExpectedSize(); checkOffset(offset); + if (entry.shouldBeIndexed()) { + index.add(entry.headword, entry.pointer); + } } this.offset = offset; } @@ -66,4 +74,41 @@ public void checkOffset(long offset) { throw new IllegalArgumentException("passed dictionary is too large, Sudachi can't handle it"); } } + + 
public void compile(POSTable pos, BlockLayout layout) throws IOException { + index.compile(layout); + layout.block(Blocks.STRINGS, this::writeStrings); + layout.block(Blocks.ENTRIES, (p) -> writeEntries(pos, p)); + } + + private Void writeEntries(POSTable pos, BlockOutput blockOutput) throws IOException { + return blockOutput.measured("Word Entries", (p) -> { + List list = entries; + Lookup2 lookup = new Lookup2(list); + WordRef.Parser refParser = WordRef.parser(pos, !user, false); + ChanneledBuffer buf = new ChanneledBuffer(blockOutput.getChannel(), WordEntryLayout.MAX_LENGTH * 4); + buf.position(INITIAL_OFFSET); + WordEntryLayout layout = new WordEntryLayout(lookup, strings, refParser, buf); + int size = list.size(); + int ptr = pointer(INITIAL_OFFSET); + for (int i = 0; i < size; ++i) { + RawWordEntry e = list.get(i); + if (e.pointer != ptr) { + throw new IllegalStateException("expected entry pointer != actual pointer"); + } + size += e.addPhantomEntries(list, lookup); + ptr = layout.put(e); + p.progress(i, size); + } + return null; + }); + } + + private Void writeStrings(BlockOutput blockOutput) throws IOException { + return blockOutput.measured("Strings", (p) -> { + strings.compile(p); + strings.writeCompact(blockOutput.getChannel()); + return null; + }); + } } diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/RawLexiconReader.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/RawLexiconReader.java index 2cb5aed1..de450165 100644 --- a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/RawLexiconReader.java +++ b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/RawLexiconReader.java @@ -29,7 +29,8 @@ public class RawLexiconReader { /** - * Enum order is in-csv order. If a header is present, fields will be reordered + * Enum order is in legacy csv order. If a header is present, fields will be + * reordered with respect to the header. 
*/ public enum Column { Surface(true), LeftId(true), RightId(true), Cost(true), Writing(false), Pos1(true), Pos2(true), Pos3( @@ -48,11 +49,13 @@ public enum Column { private int[] mapping; private final CSVParser parser; private final POSTable posTable; + private final WordRef.Parser refParser; - public RawLexiconReader(CSVParser parser, POSTable pos) throws IOException { + public RawLexiconReader(CSVParser parser, POSTable pos, boolean user) throws IOException { this.parser = parser; this.posTable = pos; resolveColumnLayout(); + refParser = WordRef.parser(pos, mapping == null || !user, mapping == null); } private static final Pattern INTEGER_REGEX = Pattern.compile("^-?\\d+$"); @@ -133,8 +136,8 @@ private RawWordEntry convertEntry(List data) { entry.cost = getShort(data, Column.Cost); entry.reading = get(data, Column.ReadingForm, true); - entry.dictionaryFormRef = get(data, Column.DictionaryForm, false); - entry.normalizedFormRef = get(data, Column.NormalizedForm, false); + entry.dictionaryForm = refParser.parse(get(data, Column.DictionaryForm, false)); + entry.normalizedForm = refParser.parse(get(data, Column.NormalizedForm, false)); POS pos = new POS( // comment for line break @@ -151,6 +154,8 @@ private RawWordEntry convertEntry(List data) { entry.synonymGroups = get(data, Column.SynonymGroups, false); entry.userData = get(data, Column.UserData, true); + entry.validate(); + return entry; } @@ -164,6 +169,9 @@ record = parser.getNextRecord(); if (record == null) { return null; } - return convertEntry(record); + RawWordEntry entry = convertEntry(record); + entry.sourceLine = parser.getRow(); + entry.sourceName = parser.getName(); + return entry; } } diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/RawWordEntry.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/RawWordEntry.java index e4bf9f29..2ab7df12 100644 --- a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/RawWordEntry.java +++ 
b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/RawWordEntry.java @@ -17,18 +17,20 @@ package com.worksap.nlp.sudachi.dictionary.build; import com.worksap.nlp.sudachi.StringUtil; +import com.worksap.nlp.sudachi.dictionary.StringPtr; import com.worksap.nlp.sudachi.dictionary.WordInfo; +import java.util.List; import java.util.Objects; @SuppressWarnings("jol") public class RawWordEntry implements Lookup2.Entry { + WordInfo wordInfo; int pointer; String headword; String reading; - String normalizedFormRef; - String dictionaryFormRef; - WordInfo wordInfo; + WordRef normalizedForm; + WordRef dictionaryForm; String aUnitSplitString; String bUnitSplitString; String cUnitSplitString; @@ -39,33 +41,40 @@ public class RawWordEntry implements Lookup2.Entry { short leftId; short rightId; short cost; - short surfaceUtf8Length; - int expectedSize = 0; short posId; + int sourceLine; + String sourceName; - private int countSplits(String data) { - return StringUtil.count(data, '/'); + private int countRefs(String data, String prev) { + if (data == null || data.isEmpty() || "*".equals(data) || data.equals(prev)) { + return 0; + } + int nsplits = StringUtil.count(data, '/'); + if (nsplits >= CsvLexicon.ARRAY_MAX_LENGTH) { + throw new CsvFieldException("maximum number of splits were exceeded"); + } + return nsplits + 1; } + /** + * Compute expected size of word entry when put in the binary dictionary. This + * function additionally validates length of split entries. 
+ * + * @return expected binary size of this entry, in bytes, will be always >=32 + */ public int computeExpectedSize() { - if (expectedSize != 0) { - return expectedSize; - } - int size = 32; - size += countSplits(aUnitSplitString) * 4; - size += countSplits(bUnitSplitString) * 4; - size += countSplits(cUnitSplitString) * 4; - size += countSplits(wordStructureString) * 4; - size += wordInfo.getSynonymGroupIds().length * 4; + size += countRefs(cUnitSplitString, "") * 4; + size += countRefs(bUnitSplitString, cUnitSplitString) * 4; + size += countRefs(aUnitSplitString, bUnitSplitString) * 4; + size += countRefs(wordStructureString, aUnitSplitString) * 4; + size += countRefs(synonymGroups, "") * 4; if (userData.length() != 0) { size += 2 + userData.length() * 2; } size = Align.align(size, 8); - - expectedSize = size; return size; } @@ -85,11 +94,58 @@ public int pointer() { @Override public boolean matches(short posId, String reading) { - return wordInfo.getPOSId() == posId && Objects.equals(wordInfo.getReadingForm(), reading); + return this.posId == posId && Objects.equals(this.reading, reading); } @Override public String headword() { - return wordInfo.getSurface(); + return headword; + } + + private void checkString(String value, String name) { + if (value.length() > StringPtr.MAX_LENGTH) { + throw new CsvFieldException( + String.format("field %s had value which exceeded the maximum length %d (actual length: %d)", name, + StringPtr.MAX_LENGTH, value.length())); + } + } + + public void validate() { + checkString(headword, "headword"); + checkString(reading, "reading"); + } + + public void publishStrings(StringStorage strings) { + strings.add(headword); + strings.add(reading); + if (normalizedForm instanceof WordRef.Headword) { + WordRef.Headword normalized = (WordRef.Headword) normalizedForm; + strings.add(normalized.getHeadword()); + } + } + + public int addPhantomEntries(List list, Lookup2 lookup) { + if (normalizedForm instanceof WordRef.Headword) { + 
WordRef.Headword ref = (WordRef.Headword) normalizedForm; + if (lookup.byHeadword(ref.getHeadword()) != null) { + return 0; + } + RawWordEntry copy = new RawWordEntry(); + copy.headword = ref.getHeadword(); + copy.reading = copy.headword; + copy.userData = ""; + copy.leftId = -1; + copy.rightId = -1; + copy.cost = Short.MAX_VALUE; + copy.mode = "A"; + copy.posId = posId; + RawWordEntry last = list.get(list.size() - 1); + copy.pointer = RawLexicon.pointer(last.pointer * 8L + last.computeExpectedSize()); + list.add(copy); + lookup.add(copy); + return 1; + } else { + return 0; + } } } diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/Stats.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/Stats.java new file mode 100644 index 00000000..3c918609 --- /dev/null +++ b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/Stats.java @@ -0,0 +1,32 @@ +/* + * Copyright (c) 2022 Works Applications Co., Ltd. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package com.worksap.nlp.sudachi.dictionary.build; + +import java.time.Duration; + +public class Stats { + public enum Kind { + Input, Output + } + + public static class Element { + public Kind kind; + public String name; + public long size; + public Duration duration; + } +} diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/StringStorage.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/StringStorage.java index 2f0171de..7c1c50b7 100644 --- a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/StringStorage.java +++ b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/StringStorage.java @@ -29,7 +29,7 @@ import java.nio.file.StandardOpenOption; import java.util.*; -public class StringStorage { +public class StringStorage implements StringIndex { private final HashMap strings = new HashMap<>(); private final HashMap candidates = new HashMap<>(); private final WordLayout layout = new WordLayout(); @@ -38,13 +38,18 @@ void add(String data) { strings.put(data, null); } - void compile() { + void compile(Progress progress) { candidates.clear(); candidates.put("", new Item("", 0, 0)); List collect = new ArrayList<>(strings.keySet()); collect.sort(Comparator.comparingInt(String::length).reversed().thenComparing(String::compareTo)); - for (String str : collect) { + int size = collect.size(); + for (int i = 0; i < size; ++i) { + String str = collect.get(i); strings.put(str, process(str)); + if (progress != null) { + progress.progress(i, size); + } } candidates.clear(); } @@ -169,7 +174,7 @@ public static void main(String[] args) throws IOException { strings.add(record.get(12)); } } - strings.compile(); + strings.compile(null); Path fullName = Paths.get(args[1] + ".lpf"); try (SeekableByteChannel chan = Files.newByteChannel(fullName, StandardOpenOption.CREATE, diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/WordEntryLayout.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/WordEntryLayout.java new 
file mode 100644 index 00000000..6f3e4253 --- /dev/null +++ b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/WordEntryLayout.java @@ -0,0 +1,142 @@ +/* + * Copyright (c) 2022 Works Applications Co., Ltd. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.worksap.nlp.sudachi.dictionary.build; + +import com.worksap.nlp.sudachi.StringUtil; +import com.worksap.nlp.sudachi.dictionary.Ints; + +import java.io.IOException; +import java.nio.ByteBuffer; +import java.nio.ByteOrder; +import java.util.List; + +public class WordEntryLayout { + private final StringIndex index; + private final WordRef.Parser wordRefParser; + private final Lookup2 lookup; + private final ChanneledBuffer buffer; + private final Ints aSplits = new Ints(16); + private final Ints bSplits = new Ints(16); + private final Ints cSplits = new Ints(16); + private final Ints wordStructure = new Ints(16); + private final Ints synonymGroups = new Ints(16); + + public static final int MAX_LENGTH = 32 // basic size + + Byte.MAX_VALUE * 5 * 4 // splits and synonyms + + (Short.MAX_VALUE + 1) * 2; // user data + + public WordEntryLayout(Lookup2 resolver, StringIndex index, WordRef.Parser parser, ChanneledBuffer buffer) { + this.lookup = resolver; + this.index = index; + this.wordRefParser = parser; + this.buffer = buffer; + } + + public int put(RawWordEntry entry) throws IOException { + BufWriter buf = this.buffer.writer(MAX_LENGTH); + buf.putShort(entry.leftId); + 
buf.putShort(entry.rightId); + buf.putShort(entry.cost); + buf.putShort(entry.posId); + // 8 bytes + buf.putInt(index.resolve(entry.headword).encode()); // surfacePtr + buf.putInt(index.resolve(entry.reading).encode()); // readingPtr + int normFormPtr = 0; + if (entry.normalizedForm != null) { + normFormPtr = entry.normalizedForm.resolve(lookup); + } + int dicFormPtr = 0; + if (entry.dictionaryForm != null) { + dicFormPtr = entry.dictionaryForm.resolve(lookup); + } + buf.putInt(normFormPtr); // normalized entry + buf.putInt(dicFormPtr); // dictionary form + // 8 + 16 = 24 bytes + + byte cSplitLen = parseList(entry.cUnitSplitString, "", cSplits); + byte bSplitLen = parseList(entry.bUnitSplitString, entry.cUnitSplitString, bSplits); + byte aSplitLen = parseList(entry.aUnitSplitString, entry.bUnitSplitString, aSplits); + byte wordStructureLen = parseList(entry.wordStructureString, entry.aUnitSplitString, wordStructure); + byte synonymLen = parseIntList(entry.synonymGroups, synonymGroups); + + // length can't be more than ~4k utf-16 code units so the cast is safe + short utf8Len = (short) StringUtil.countUtf8Bytes(entry.headword); + buf.putShort(utf8Len); + buf.putByte(cSplitLen); + buf.putByte(bSplitLen); + buf.putByte(aSplitLen); + buf.putByte(wordStructureLen); + buf.putByte(synonymLen); + int userDataLength = entry.userData.length(); + buf.putByte(userDataLength == 0 ? 
(byte) 0 : (byte) 1); + // 24 + 8 = 32 bytes + + buf.putInts(cSplits, cSplitLen); + buf.putInts(bSplits, bSplitLen); + buf.putInts(aSplits, aSplitLen); + buf.putInts(wordStructure, wordStructureLen); + buf.putInts(synonymGroups, synonymLen); + + if (userDataLength != 0) { + buf.putShort((short) userDataLength); + String userData = entry.userData; + for (int i = 0; i < userDataLength; ++i) { + buf.putShort((short) userData.charAt(i)); + } + } + + int position = this.buffer.alignTo(8); + return RawLexicon.pointer(position); + } + + private byte parseIntList(String data, Ints result) { + if (data == null || data.isEmpty() || "*".equals(data)) { + result.clear(); + return 0; + } + String[] parts = data.split("/"); + if (parts.length > Byte.MAX_VALUE) { + throw new IllegalArgumentException("reference list contained more than 127 entries: " + data); + } + result.clear(); + for (String part : parts) { + result.append(Integer.parseInt(part)); + } + return (byte) parts.length; + } + + byte parseList(String data, String reference, Ints result) { + if (data == null || data.isEmpty() || "*".equals(data)) { + result.clear(); + return 0; + } + if (data.equals(reference)) { + result.clear(); + return -1; + } + String[] parts = data.split("/"); + if (parts.length > Byte.MAX_VALUE) { + throw new IllegalArgumentException("reference list contained more than 127 entries: " + data); + } + result.clear(); + for (String part : parts) { + WordRef ref = wordRefParser.parse(part); + result.append(ref.resolve(lookup)); + } + return (byte) parts.length; + } +} diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/WordInfoLayoutFixedWidth.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/WordInfoLayoutFixedWidth.java deleted file mode 100644 index 4fc20db6..00000000 --- a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/WordInfoLayoutFixedWidth.java +++ /dev/null @@ -1,141 +0,0 @@ -/* - * Copyright (c) 2022 Works Applications Co., Ltd. 
- * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package com.worksap.nlp.sudachi.dictionary.build; - -import com.worksap.nlp.sudachi.dictionary.Ints; - -import java.io.IOException; -import java.nio.ByteBuffer; -import java.nio.ByteOrder; -import java.util.List; - -public class WordInfoLayoutFixedWidth { - private final StringIndex index; - private final WordRef.Parser wordRefParser; - private final Lookup2 lookup; - private final ByteBuffer buffer = ByteBuffer.allocate(512 * 1024); - private int position; - private final Ints aSplits = new Ints(16); - private final Ints bSplits = new Ints(16); - private final Ints cSplits = new Ints(16); - private final Ints wordStructure = new Ints(16); - - public WordInfoLayoutFixedWidth(Lookup2 resolver, StringIndex index, WordRef.Parser parser) { - this.lookup = resolver; - this.index = index; - buffer.order(ByteOrder.LITTLE_ENDIAN); - wordRefParser = parser; - } - - public void process(RawWordEntry entry) { - - } - - public int put(RawWordEntry entry) { - int position = this.position + buffer.position(); - int entryPtr = position >>> 3; - ByteBuffer buf = this.buffer; - buf.putShort(entry.leftId); - buf.putShort(entry.rightId); - buf.putShort(entry.cost); - buf.putShort(entry.posId); - // 8 bytes - buf.putInt(index.resolve(entry.headword).encode()); // surfacePtr - buf.putInt(index.resolve(entry.reading).encode()); // readingPtr - int normFormPtr = wordRefParser.parse(entry.normalizedFormRef).resolve(lookup); - 
int dicFormPtr = wordRefParser.parse(entry.dictionaryFormRef).resolve(lookup); - buf.putInt(normFormPtr); // normalized entry - buf.putInt(dicFormPtr); // dictionary form - // 8 + 16 = 24 bytes - - byte aSplitLen = parseList(entry.aUnitSplitString, aSplits); - byte bSplitLen = parseList(entry.bUnitSplitString, bSplits); - byte cSplitLen = parseList(entry.cUnitSplitString, cSplits); - byte wordStructureLen = parseList(entry.wordStructureString, wordStructure); - byte synonymLen = (byte) entry.wordInfo.getSynonymGroupIds().length; - - buf.putShort(entry.surfaceUtf8Length); - buf.put(cSplitLen); - buf.put(bSplitLen); - buf.put(aSplitLen); - buf.put(wordStructureLen); - buf.put(synonymLen); - int userDataLength = entry.userData.length(); - buf.put(userDataLength != 0 ? (byte) 0 : (byte) 1); - // 24 + 8 = 32 bytes - - putInts(cSplits, cSplitLen); - putInts(bSplits, bSplitLen); - putInts(aSplits, aSplitLen); - putInts(wordStructure, wordStructureLen); - putInts(Ints.wrap(entry.wordInfo.getSynonymGroupIds()), synonymLen); - - if (userDataLength != 0) { - buf.putShort((short) userDataLength); - String userData = entry.userData; - for (int i = 0; i < userDataLength; ++i) { - buf.putShort((short) userData.charAt(i)); - } - } - - // align to 8 boundary - int currentPosition = buf.position(); - buf.position(Align.align(currentPosition, 8)); - - return entryPtr; - } - - private void putInts(Ints ints, int len) { - for (int i = 0; i < len; ++i) { - buffer.putInt(ints.get(i)); - } - } - - public void fillPointers(ByteBuffer data, List entries, Lookup2 lookup) { - for (int i = 0; i < entries.size(); i++) { - RawWordEntry entry = entries.get(i); - int offset = entry.pointer << 3; - data.position(offset + 8); - - data.putInt(index.resolve(entry.wordInfo.getSurface()).encode()); - data.putInt(index.resolve(entry.wordInfo.getReadingForm()).encode()); - data.putInt(entry.wordInfo.getDictionaryFormWordId()); - // data.putInt(entry.) 
- } - } - - public T consume(IOConsumer consumer) throws IOException { - position += buffer.position(); - buffer.flip(); - T result = consumer.accept(buffer); - buffer.clear(); - return result; - } - - byte parseList(String data, Ints result) { - String[] parts = data.split("/"); - if (parts.length > Byte.MAX_VALUE) { - throw new IllegalArgumentException("reference list contained more than 127 entries: " + data); - } - result.clear(); - for (String part : parts) { - WordRef ref = wordRefParser.parse(part); - result.append(ref.resolve(lookup)); - } - return (byte) parts.length; - } -} diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/WordLayout.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/WordLayout.java index 440d2993..513fc92e 100644 --- a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/WordLayout.java +++ b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/WordLayout.java @@ -109,7 +109,7 @@ private int allocateInBlock(int length, int alignment, int start, int end) { * Allocates a slot of {@code length} bytes, alignment with {@code alignment}. * It first considers free slots created by previous allocations, if none is * valid. - * + *

* Current implementation is prone to creating "holes" of 1-length, which are * almost impossible to fill from the usual dictionaries. Most emoji take 2 code * units and words which are not substrings of another word are usually longer. @@ -236,8 +236,7 @@ public int compareTo(FreeSpace o) { @Override public String toString() { - return new StringJoiner(", ", FreeSpace.class.getSimpleName() + "[", "]").add("start=" + start) - .add("length=" + length).toString(); + return new StringJoiner(", ", "FreeSpace[", "]").add("start=" + start).add("length=" + length).toString(); } } diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/WordRef.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/WordRef.java index f0aa8466..b8239118 100644 --- a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/WordRef.java +++ b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/WordRef.java @@ -21,6 +21,7 @@ import java.util.Arrays; import java.util.List; +import java.util.Objects; import java.util.regex.Pattern; /** @@ -44,6 +45,11 @@ public int getLine() { public int resolve(Lookup2 resolver) { return resolver.byIndex(line).pointer(); } + + @Override + public String toString() { + return String.format("WordRef/Line: %d", line); + } } public static final class Headword extends WordRef { @@ -62,6 +68,11 @@ public int resolve(Lookup2 resolver) { List entries = resolver.byHeadword(headword); return entries.get(0).pointer(); } + + @Override + public String toString() { + return String.format("WordRef/Headword: %s", headword); + } } public static final class Triple extends WordRef { @@ -97,25 +108,39 @@ public int resolve(Lookup2 resolver) { } return -1; } + + @Override + public String toString() { + return String.format("WordRef: %s/%d/%s", headword, posId, reading); + } } private static final Pattern NUMERIC_RE = Pattern.compile("^U?\\d+$"); + public static Parser parser(POSTable posTable, boolean allowNumeric, boolean allowHeadword) { + return new 
Parser(posTable, allowNumeric, allowHeadword); + } + public static class Parser { private final POSTable posTable; - private boolean strict; + private final boolean allowNumeric; + private final boolean allowHeadword; - public Parser(POSTable posTable, boolean strict) { + public Parser(POSTable posTable, boolean allowNumeric, boolean allowHeadword) { this.posTable = posTable; - this.strict = strict; + this.allowNumeric = allowNumeric; + this.allowHeadword = allowHeadword; } public WordRef parse(String text) { + if ("*".equals(text) || text == null || text.isEmpty()) { + return null; + } + if (NUMERIC_RE.matcher(text).matches()) { - if (strict) { - throw new CsvFieldException(String.format( - "invalid word reference: %s, numeric references are not supported in modern csv formats", - text)); + if (!allowNumeric) { + throw new CsvFieldException( + String.format("invalid word reference: %s, numeric references are not supported", text)); } int offset = text.charAt(0) == 'U' ? 1 : 0; int lineNum = Integer.parseInt(text.substring(offset)); @@ -135,12 +160,12 @@ public WordRef parse(String text) { return new Triple(headword, posId, reading); } - if (strict) { + if (allowHeadword) { + return new Headword(text); + } else { throw new CsvFieldException( String.format("invalid word reference: %s, it must contain POS tag and reading", text)); } - - return new Headword(text); } } diff --git a/src/test/java/com/worksap/nlp/sudachi/dictionary/build/RawLexiconReaderTest.kt b/src/test/java/com/worksap/nlp/sudachi/dictionary/build/RawLexiconReaderTest.kt index 814ca509..3ccc791d 100644 --- a/src/test/java/com/worksap/nlp/sudachi/dictionary/build/RawLexiconReaderTest.kt +++ b/src/test/java/com/worksap/nlp/sudachi/dictionary/build/RawLexiconReaderTest.kt @@ -16,8 +16,8 @@ package com.worksap.nlp.sudachi.dictionary.build -import com.worksap.nlp.sudachi.cps import com.worksap.nlp.sudachi.dictionary.CSVParser +import com.worksap.nlp.sudachi.resStream import kotlin.test.Test import 
kotlin.test.assertEquals import kotlin.test.assertNotNull @@ -26,14 +26,14 @@ import kotlin.test.assertNull class RawLexiconReaderTest { companion object { fun csv(name: String): CSVParser { - val stream = cps(name) + val stream = resStream(name) return CSVParser(stream.reader()) } } @Test fun legacyCsvWithMinimumFields() { - val reader = RawLexiconReader(csv("legacy-minimum.csv"), POSTable()) + val reader = RawLexiconReader(csv("legacy-minimum.csv"), POSTable(), false) assertNotNull(reader.nextEntry()).let { e -> assertEquals("東京都", e.headword) assertEquals("トウキョウト", e.reading) @@ -46,7 +46,7 @@ class RawLexiconReaderTest { @Test fun legacyCsvWithAllFields() { - val reader = RawLexiconReader(csv("legacy-full.csv"), POSTable()) + val reader = RawLexiconReader(csv("legacy-full.csv"), POSTable(), false) assertNotNull(reader.nextEntry()).let { e -> assertEquals("東京都", e.headword) assertEquals("トウキョウト", e.reading) @@ -61,7 +61,7 @@ class RawLexiconReaderTest { @Test fun headerCsvAllFields() { - val reader = RawLexiconReader(csv("headers-all.csv"), POSTable()) + val reader = RawLexiconReader(csv("headers-all.csv"), POSTable(), false) assertNotNull(reader.nextEntry()).let { e -> assertEquals("東京都", e.headword) assertEquals("トウキョウト", e.reading) diff --git a/src/test/java/com/worksap/nlp/sudachi/dictionary/build/StringStorageTest.kt b/src/test/java/com/worksap/nlp/sudachi/dictionary/build/StringStorageTest.kt index 1a5abcf0..94c86b36 100644 --- a/src/test/java/com/worksap/nlp/sudachi/dictionary/build/StringStorageTest.kt +++ b/src/test/java/com/worksap/nlp/sudachi/dictionary/build/StringStorageTest.kt @@ -26,7 +26,7 @@ class StringStorageTest { val strs = StringStorage() strs.add("test") strs.add("es") - strs.compile() + strs.compile(null) val data = strs.strings assertEquals(2, data.size) assertEquals(1, data["es"]?.start) @@ -38,7 +38,7 @@ class StringStorageTest { val strs = StringStorage() strs.add("x") strs.add("y") - strs.compile() + strs.compile(null) val data = 
strs.strings assertEquals(2, data.size) assertEquals(0, data["x"]?.start) diff --git a/src/test/java/com/worksap/nlp/sudachi/resources.kt b/src/test/java/com/worksap/nlp/sudachi/resources.kt index 4206e9ed..d5fe5ad1 100644 --- a/src/test/java/com/worksap/nlp/sudachi/resources.kt +++ b/src/test/java/com/worksap/nlp/sudachi/resources.kt @@ -23,6 +23,6 @@ fun T.res(name: String): URL { return javaClass.getResource(name) ?: throw IllegalArgumentException("$name was not found") } -fun T.cps(name: String): InputStream { +fun T.resStream(name: String): InputStream { return res(name).openStream() } diff --git a/src/test/resources/com/worksap/nlp/sudachi/dictionary/build/headers-all.csv b/src/test/resources/com/worksap/nlp/sudachi/dictionary/build/headers-all.csv index 2d38ebef..9b4b3ab3 100644 --- a/src/test/resources/com/worksap/nlp/sudachi/dictionary/build/headers-all.csv +++ b/src/test/resources/com/worksap/nlp/sudachi/dictionary/build/headers-all.csv @@ -1,2 +1,2 @@ Surface,LeftId,RightId,Cost,pos1,pos2,pos3,pos4,pos5,pos6,reading_form,normalized_form,DictionaryForm,mode,splita,splitb,splitc,wordstructure,synonymgroups,userdata -東京都,6,8,5320,名詞,固有名詞,地名,一般,*,*,トウキョウト,東京都,*,B,5/9,5/10,5/11,6/7,8/9,10 \ No newline at end of file +東京都,6,8,5320,名詞,固有名詞,地名,一般,*,*,トウキョウト,*,*,B,5/9,5/10,5/11,6/7,8/9,10 \ No newline at end of file From 1513027c9eb975bdb43468d3f89feb3d532b03a5 Mon Sep 17 00:00:00 2001 From: Arseny Tolmachev Date: Mon, 12 Sep 2022 08:00:33 +0900 Subject: [PATCH 12/94] example reformat --- .../nlp/sudachi/DefaultInputTextPlugin.java | 2 +- .../nlp/sudachi/EditConnectionCostPlugin.java | 2 +- .../nlp/sudachi/IgnoreYomiganaPlugin.java | 2 +- .../nlp/sudachi/InhibitConnectionPlugin.java | 2 +- .../worksap/nlp/sudachi/InputTextPlugin.java | 2 +- .../nlp/sudachi/JoinKatakanaOovPlugin.java | 2 +- .../nlp/sudachi/JoinNumericPlugin.java | 2 +- .../nlp/sudachi/MeCabOovProviderPlugin.java | 2 +- .../nlp/sudachi/MorphemeFormatterPlugin.java | 2 +- 
.../nlp/sudachi/OovProviderPlugin.java | 2 +- .../nlp/sudachi/PathRewritePlugin.java | 2 +- .../ProlongedSoundMarkInputTextPlugin.java | 2 +- .../worksap/nlp/sudachi/RegexOovProvider.java | 2 +- .../com/worksap/nlp/sudachi/Settings.java | 34 +++++++++---------- .../nlp/sudachi/SimpleMorphemeFormatter.java | 2 +- .../nlp/sudachi/SimpleOovProviderPlugin.java | 2 +- .../sudachi/WordSegmentationFormatter.java | 2 +- 17 files changed, 33 insertions(+), 33 deletions(-) diff --git a/src/main/java/com/worksap/nlp/sudachi/DefaultInputTextPlugin.java b/src/main/java/com/worksap/nlp/sudachi/DefaultInputTextPlugin.java index fc5eb860..7c49703c 100644 --- a/src/main/java/com/worksap/nlp/sudachi/DefaultInputTextPlugin.java +++ b/src/main/java/com/worksap/nlp/sudachi/DefaultInputTextPlugin.java @@ -51,7 +51,7 @@ * { * "class" : "com.worksap.nlp.sudachi.DefaultInputTextPlugin", * "rewriteDef" : "rewrite.def" - * } + * } * } * * diff --git a/src/main/java/com/worksap/nlp/sudachi/EditConnectionCostPlugin.java b/src/main/java/com/worksap/nlp/sudachi/EditConnectionCostPlugin.java index 61dac1cc..ad4b00a0 100644 --- a/src/main/java/com/worksap/nlp/sudachi/EditConnectionCostPlugin.java +++ b/src/main/java/com/worksap/nlp/sudachi/EditConnectionCostPlugin.java @@ -35,7 +35,7 @@ * { * "class" : "com.worksap.nlp.sudachi.SampleEditConnectionPlugin", * "example" : "example setting" - * } + * } * } * */ diff --git a/src/main/java/com/worksap/nlp/sudachi/IgnoreYomiganaPlugin.java b/src/main/java/com/worksap/nlp/sudachi/IgnoreYomiganaPlugin.java index 5a63e718..40d3c6af 100644 --- a/src/main/java/com/worksap/nlp/sudachi/IgnoreYomiganaPlugin.java +++ b/src/main/java/com/worksap/nlp/sudachi/IgnoreYomiganaPlugin.java @@ -41,7 +41,7 @@ * "leftBrackets": ["(", "("], * "rightBrackets": [")", ")"], * "maxYomiganaLength": 4 - * } + * } * } * * diff --git a/src/main/java/com/worksap/nlp/sudachi/InhibitConnectionPlugin.java b/src/main/java/com/worksap/nlp/sudachi/InhibitConnectionPlugin.java index 
54c4b2eb..23c4495d 100644 --- a/src/main/java/com/worksap/nlp/sudachi/InhibitConnectionPlugin.java +++ b/src/main/java/com/worksap/nlp/sudachi/InhibitConnectionPlugin.java @@ -35,7 +35,7 @@ * { * "class" : "com.worksap.nlp.sudachi.InhibitConnectionPlugin", * "inhibitedPair" : [ [ 0, 233 ], [435, 332] ] - * } + * } * } * * diff --git a/src/main/java/com/worksap/nlp/sudachi/InputTextPlugin.java b/src/main/java/com/worksap/nlp/sudachi/InputTextPlugin.java index d2d8dc4e..609b72a4 100644 --- a/src/main/java/com/worksap/nlp/sudachi/InputTextPlugin.java +++ b/src/main/java/com/worksap/nlp/sudachi/InputTextPlugin.java @@ -34,7 +34,7 @@ * { * "class" : "com.worksap.nlp.sudachi.InputTextPlugin", * "example" : "example setting" - * } + * } * } * */ diff --git a/src/main/java/com/worksap/nlp/sudachi/JoinKatakanaOovPlugin.java b/src/main/java/com/worksap/nlp/sudachi/JoinKatakanaOovPlugin.java index a5d20c0b..88943475 100644 --- a/src/main/java/com/worksap/nlp/sudachi/JoinKatakanaOovPlugin.java +++ b/src/main/java/com/worksap/nlp/sudachi/JoinKatakanaOovPlugin.java @@ -39,7 +39,7 @@ * "class" : "com.worksap.nlp.sudachi.JoinKatakanaOovPlugin", * "oovPOS" : [ "POS1", "POS2", ... 
], * "minLength" : 3 - * } + * } * } * */ diff --git a/src/main/java/com/worksap/nlp/sudachi/JoinNumericPlugin.java b/src/main/java/com/worksap/nlp/sudachi/JoinNumericPlugin.java index 5c33a1f3..1de96777 100644 --- a/src/main/java/com/worksap/nlp/sudachi/JoinNumericPlugin.java +++ b/src/main/java/com/worksap/nlp/sudachi/JoinNumericPlugin.java @@ -36,7 +36,7 @@ * { * "class" : "com.worksap.nlp.sudachi.JoinNumericPlugin", * "enableNormalize" : true, - * } + * } * } * * diff --git a/src/main/java/com/worksap/nlp/sudachi/MeCabOovProviderPlugin.java b/src/main/java/com/worksap/nlp/sudachi/MeCabOovProviderPlugin.java index 6abf5b3c..bfe23fcf 100644 --- a/src/main/java/com/worksap/nlp/sudachi/MeCabOovProviderPlugin.java +++ b/src/main/java/com/worksap/nlp/sudachi/MeCabOovProviderPlugin.java @@ -42,7 +42,7 @@ * "class" : "com.worksap.nlp.sudachi.MeCabOovProviderPlugin", * "charDef" : "char.def", * "unkDef" : "unk.def" - * } + * } * } * * diff --git a/src/main/java/com/worksap/nlp/sudachi/MorphemeFormatterPlugin.java b/src/main/java/com/worksap/nlp/sudachi/MorphemeFormatterPlugin.java index 85d6ec56..a881183c 100644 --- a/src/main/java/com/worksap/nlp/sudachi/MorphemeFormatterPlugin.java +++ b/src/main/java/com/worksap/nlp/sudachi/MorphemeFormatterPlugin.java @@ -32,7 +32,7 @@ * "class" : "com.worksap.nlp.sudachi.MorphemeFormatterPlugin", * "delimiter" : "\n", * "eos" : "\nEOS\n", - * } + * } * } * * diff --git a/src/main/java/com/worksap/nlp/sudachi/OovProviderPlugin.java b/src/main/java/com/worksap/nlp/sudachi/OovProviderPlugin.java index ec232bdf..f4aa460c 100644 --- a/src/main/java/com/worksap/nlp/sudachi/OovProviderPlugin.java +++ b/src/main/java/com/worksap/nlp/sudachi/OovProviderPlugin.java @@ -40,7 +40,7 @@ * { * "class" : "com.worksap.nlp.sudachi.OovProviderPlugin", * "example" : "example setting" - * } + * } * } * */ diff --git a/src/main/java/com/worksap/nlp/sudachi/PathRewritePlugin.java b/src/main/java/com/worksap/nlp/sudachi/PathRewritePlugin.java index 
42e90ff1..536da9ad 100644 --- a/src/main/java/com/worksap/nlp/sudachi/PathRewritePlugin.java +++ b/src/main/java/com/worksap/nlp/sudachi/PathRewritePlugin.java @@ -40,7 +40,7 @@ * { * "class" : "com.worksap.nlp.sudachi.PathRewritePlugin", * "example" : "example setting" - * } + * } * } * */ diff --git a/src/main/java/com/worksap/nlp/sudachi/ProlongedSoundMarkInputTextPlugin.java b/src/main/java/com/worksap/nlp/sudachi/ProlongedSoundMarkInputTextPlugin.java index 5cab6c53..ed4b0b6e 100644 --- a/src/main/java/com/worksap/nlp/sudachi/ProlongedSoundMarkInputTextPlugin.java +++ b/src/main/java/com/worksap/nlp/sudachi/ProlongedSoundMarkInputTextPlugin.java @@ -44,7 +44,7 @@ * "class" : "com.worksap.nlp.sudachi.ProlongedSoundMarkInputTextPlugin", "prolongedSoundMarks": ["ー", "〜", "〰"], "replacementSymbol": "ー" - * } + * } * } * * diff --git a/src/main/java/com/worksap/nlp/sudachi/RegexOovProvider.java b/src/main/java/com/worksap/nlp/sudachi/RegexOovProvider.java index 86f940ec..6b6106e7 100644 --- a/src/main/java/com/worksap/nlp/sudachi/RegexOovProvider.java +++ b/src/main/java/com/worksap/nlp/sudachi/RegexOovProvider.java @@ -44,7 +44,7 @@ * "cost": 5000, * "maxLength": 32, * "boundaries": "relaxed" - * } + * } * } * * diff --git a/src/main/java/com/worksap/nlp/sudachi/Settings.java b/src/main/java/com/worksap/nlp/sudachi/Settings.java index 1bde0333..0ec45c90 100644 --- a/src/main/java/com/worksap/nlp/sudachi/Settings.java +++ b/src/main/java/com/worksap/nlp/sudachi/Settings.java @@ -52,23 +52,23 @@ * "systemDict" : "system.dic", * "characterDefinitionFile" : "char.def", * "inputTextPlugin" : [ - * { "class" : "com.worksap.nlp.sudachi.DefaultInputTextPlugin" } - * ], - * "oovProviderPlugin" : [ - * { - * "class" : "com.worksap.nlp.sudachi.MeCabOovProviderPlugin", - * "charDef" : "char.def", - * "unkDef" : "unk.def" - * }, - * { - * "class" : "com.worksap.nlp.sudachi.SimpleOovProviderPlugin", - * "oovPOSStrings" : [ "補助記号", "一般", "*", "*", "*", "*" ], - * "leftId" : 
5968, - * "rightId" : 5968, - * "cost" : 3857 - * } - * ] - * } + * { "class" : "com.worksap.nlp.sudachi.DefaultInputTextPlugin" } + * ], + * "oovProviderPlugin" : [ + * { + * "class" : "com.worksap.nlp.sudachi.MeCabOovProviderPlugin", + * "charDef" : "char.def", + * "unkDef" : "unk.def" + * }, + * { + * "class" : "com.worksap.nlp.sudachi.SimpleOovProviderPlugin", + * "oovPOSStrings" : [ "補助記号", "一般", "*", "*", "*", "*" ], + * "leftId" : 5968, + * "rightId" : 5968, + * "cost" : 3857 + * } + * ] + * } * } * *

diff --git a/src/main/java/com/worksap/nlp/sudachi/SimpleMorphemeFormatter.java b/src/main/java/com/worksap/nlp/sudachi/SimpleMorphemeFormatter.java index e9ac9958..99bcbbf7 100644 --- a/src/main/java/com/worksap/nlp/sudachi/SimpleMorphemeFormatter.java +++ b/src/main/java/com/worksap/nlp/sudachi/SimpleMorphemeFormatter.java @@ -32,7 +32,7 @@ * "delimiter" : "\n", * "eos" : "\nEOS\n", * "columnDelimiter" : "\t" - * } + * } * } * * diff --git a/src/main/java/com/worksap/nlp/sudachi/SimpleOovProviderPlugin.java b/src/main/java/com/worksap/nlp/sudachi/SimpleOovProviderPlugin.java index a0758950..50f7bb04 100644 --- a/src/main/java/com/worksap/nlp/sudachi/SimpleOovProviderPlugin.java +++ b/src/main/java/com/worksap/nlp/sudachi/SimpleOovProviderPlugin.java @@ -38,7 +38,7 @@ * "leftId" : 5968, * "rigthId" : 5968, * "cost" : 3857 - * } + * } * } * * diff --git a/src/main/java/com/worksap/nlp/sudachi/WordSegmentationFormatter.java b/src/main/java/com/worksap/nlp/sudachi/WordSegmentationFormatter.java index 78545e8d..74c4dc48 100644 --- a/src/main/java/com/worksap/nlp/sudachi/WordSegmentationFormatter.java +++ b/src/main/java/com/worksap/nlp/sudachi/WordSegmentationFormatter.java @@ -32,7 +32,7 @@ * "class" : "com.worksap.nlp.sudachi.SurfaceFormatter", * "delimiter" : " ", * "eos" : "\n", - * } + * } * } * * From 628b243a6f6fc5c8c1540f21eb6af1bcff963374 Mon Sep 17 00:00:00 2001 From: Arseny Tolmachev Date: Mon, 12 Sep 2022 08:40:59 +0900 Subject: [PATCH 13/94] don't modify positions with negative lengths --- .../com/worksap/nlp/sudachi/dictionary/build/BufWriter.java | 3 +++ .../com/worksap/nlp/sudachi/dictionary/build/RawLexicon.java | 2 +- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/BufWriter.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/BufWriter.java index 389d7675..d2081ea1 100644 --- a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/BufWriter.java +++ 
b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/BufWriter.java @@ -62,6 +62,9 @@ public BufWriter putInt(int val) { } public BufWriter putInts(Ints value, int length) { + if (length <= 0) { + return this; + } ByteBuffer buf = buffer; int pos = buf.position(); for (int i = 0; i < length; ++i) { diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/RawLexicon.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/RawLexicon.java index 2afd9af0..3bf8c214 100644 --- a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/RawLexicon.java +++ b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/RawLexicon.java @@ -94,7 +94,7 @@ private Void writeEntries(POSTable pos, BlockOutput blockOutput) throws IOExcept for (int i = 0; i < size; ++i) { RawWordEntry e = list.get(i); if (e.pointer != ptr) { - throw new IllegalStateException("expected entry pointer != actual pointer"); + throw new IllegalStateException("expected entry pointer != actual pointer, i=" + i); } size += e.addPhantomEntries(list, lookup); ptr = layout.put(e); From a06c0718c709b0a213dce7970c92bd328e9e818a Mon Sep 17 00:00:00 2001 From: Arseny Tolmachev Date: Wed, 14 Sep 2022 16:36:33 +0900 Subject: [PATCH 14/94] first part of writing dictionary header --- .../nlp/sudachi/dictionary/Blocks.java | 4 +- .../nlp/sudachi/dictionary/BufReader.java | 117 ++++++++++++++ .../nlp/sudachi/dictionary/Description.java | 148 ++++++++++++++++++ .../worksap/nlp/sudachi/dictionary/POS.java | 1 + .../sudachi/dictionary/build/BufWriter.java | 35 ++++- .../dictionary/build/ConnectionMatrix.java | 11 ++ .../sudachi/dictionary/build/DicBuilder2.java | 82 +++++++++- .../nlp/sudachi/dictionary/build/Index.java | 19 ++- .../sudachi/dictionary/build/POSTable.java | 17 ++ .../sudachi/dictionary/build/RawLexicon.java | 5 +- .../nlp/sudachi/dictionary/BufReaderTest.kt | 128 +++++++++++++++ 11 files changed, 556 insertions(+), 11 deletions(-) create mode 100644 
src/main/java/com/worksap/nlp/sudachi/dictionary/BufReader.java create mode 100644 src/main/java/com/worksap/nlp/sudachi/dictionary/Description.java create mode 100644 src/test/java/com/worksap/nlp/sudachi/dictionary/BufReaderTest.kt diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/Blocks.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/Blocks.java index 1021c872..f9587a80 100644 --- a/src/main/java/com/worksap/nlp/sudachi/dictionary/Blocks.java +++ b/src/main/java/com/worksap/nlp/sudachi/dictionary/Blocks.java @@ -17,8 +17,10 @@ package com.worksap.nlp.sudachi.dictionary; public class Blocks { - public static final String WORD_ID_TABLE = "WordIdTable"; + public static final String WORD_POINTERS = "WordPointers"; public static final String TRIE_INDEX = "TrieIndex"; public static final String STRINGS = "Strings"; public static final String ENTRIES = "Entries"; + public static final String CONNECTION_MATRIX = "ConnMatrix"; + public static final String POS_TABLE = "POS"; } diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/BufReader.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/BufReader.java new file mode 100644 index 00000000..dc1006d9 --- /dev/null +++ b/src/main/java/com/worksap/nlp/sudachi/dictionary/BufReader.java @@ -0,0 +1,117 @@ +/* + * Copyright (c) 2022 Works Applications Co., Ltd. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package com.worksap.nlp.sudachi.dictionary; + +import java.nio.ByteBuffer; +import java.nio.charset.StandardCharsets; + +public class BufReader { + private final ByteBuffer buffer; + + public BufReader(ByteBuffer buffer) { + this.buffer = buffer; + } + + long readLong() { + return buffer.getLong(); + } + + long readVarint64() { + ByteBuffer b = buffer; + int first = b.get() & 0xff; + if (first < 128) { + return first; + } else { + return readVarLongSlowpath(first & 0x7f, b); + } + } + + private static long readVarLongSlowpath(long v0, ByteBuffer b) { + long v1 = b.get() & 0xff; + if (v1 < 0x80) { + return (v1 << 7) | v0; + } + v0 |= (v1 & 0x7f) << 7; + long v2 = b.get() & 0xff; + if (v2 < 0x80) { + return (v2 << 14) | v0; + } + v0 |= (v2 & 0x7f) << 14; + long v3 = b.get() & 0xff; + if (v3 < 0x80) { + return (v3 << 21) | v0; + } + v0 |= (v3 & 0x7f) << 21; + long v4 = b.get() & 0xff; + if (v4 < 0x80) { + return (v4 << 28) | v0; + } + v0 |= (v4 & 0x7f) << 28; + long v5 = b.get() & 0xff; + if (v5 < 0x80) { + return (v5 << 35) | v0; + } + v0 |= (v5 & 0x7f) << 35; + long v6 = b.get() & 0xff; + if (v6 < 0x80) { + return (v6 << 42) | v0; + } + v0 |= (v6 & 0x7f) << 42; + long v7 = b.get() & 0xff; + if (v7 < 0x80) { + return (v7 << 49) | v0; + } + v0 |= (v7 & 0x7f) << 49; + long v8 = b.get() & 0xff; + if (v8 < 0x80) { // only 6 bits are valid here, rest must be 0 + return (v8 << 56) | v0; + } + v0 |= (v8 & 0x7f) << 56; + long v9 = b.get() & 0xff; + if (v9 < 0x07) { // only 3 bits are valid here, rest must be 0 + return (v8 << 61) | v0; + } + throw new IllegalStateException("invalid long varint encoding"); + } + + public int readVarint32() { + long l = readVarint64(); + if ((l & ~0xffff_ffffL) != 0) { + throw new IllegalStateException("invalid int varint encoding"); + } + return (int) l; + } + + public String readUtf8String() { + int length = readVarint32(); + if (buffer.remaining() < length) { + throw new IllegalStateException("invalid string exception, content 
underflow"); + } + if (buffer.hasArray()) { + byte[] arr = buffer.array(); + int offset = buffer.arrayOffset(); + int position = buffer.position(); + String s = new String(arr, offset + position, length, StandardCharsets.UTF_8); + buffer.position(position + length); + return s; + } else { + byte[] repr = new byte[length]; + buffer.get(repr); + return new String(repr, StandardCharsets.UTF_8); + } + } +} diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/Description.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/Description.java new file mode 100644 index 00000000..4d655497 --- /dev/null +++ b/src/main/java/com/worksap/nlp/sudachi/dictionary/Description.java @@ -0,0 +1,148 @@ +/* + * Copyright (c) 2022 Works Applications Co., Ltd. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package com.worksap.nlp.sudachi.dictionary; + +import java.io.IOException; +import java.nio.ByteBuffer; +import java.nio.ByteOrder; +import java.nio.channels.SeekableByteChannel; +import java.nio.charset.StandardCharsets; +import java.time.Instant; +import java.util.ArrayList; +import java.util.List; + +public class Description { + + private Instant creationTime; + + private String comment; + + private String signature; + + private String reference; + + private List blocks = new ArrayList<>(); + + public static class Block { + private String name; + private long start; + private long size; + } + + public static Description load(SeekableByteChannel channel) throws IOException { + ByteBuffer buf = ByteBuffer.allocate(4096); + buf.order(ByteOrder.LITTLE_ENDIAN); + if (channel.read(buf) == -1) { + throw new IllegalArgumentException("end of channel"); + } + buf.flip(); + return load(buf); + } + + public static Description load(ByteBuffer raw) { + checkLegacyDictionaryFormat(raw); + checkMagic(raw); + long version = raw.getLong(); + if (version == 1) { + return loadV1(raw); + } else { + throw new IllegalArgumentException(String.format("invalid version %d, corrupted dictionary", version)); + } + } + + private static Description loadV1(ByteBuffer raw) { + Description desc = new Description(); + BufReader reader = new BufReader(raw); + desc.creationTime = Instant.ofEpochSecond(reader.readLong()); + desc.comment = reader.readUtf8String(); + desc.signature = reader.readUtf8String(); + desc.reference = reader.readUtf8String(); + int length = reader.readVarint32(); + for (int i = 0; i < length; ++i) { + Block b = new Block(); + b.name = reader.readUtf8String(); + b.start = reader.readVarint64(); + b.size = reader.readVarint64(); + desc.blocks.add(b); + } + + return desc; + } + + public final static byte[] MAGIC_BYTES = "SudachiBinaryDic".getBytes(StandardCharsets.UTF_8); + + private static void checkMagic(ByteBuffer raw) { + assert MAGIC_BYTES.length == 16; + byte[] 
expected = new byte[MAGIC_BYTES.length]; + raw.get(expected); + for (int i = 0; i < expected.length; i++) { + if (MAGIC_BYTES[i] != expected[i]) { + throw new IllegalArgumentException("invalid magic string, dictionary is corrupted"); + } + } + } + + private static void checkLegacyDictionaryFormat(ByteBuffer raw) { + long version = raw.getLong(0); + if (DictionaryVersion.isSystemDictionary(version)) { + throw new IllegalArgumentException("passed dictionary is a legacy system dictionary, please rebuild it"); + } + if (DictionaryVersion.isUserDictionary(version)) { + throw new IllegalArgumentException("passed dictionary is a legacy user dictionary, please rebuild it"); + } + } + + public Instant getCreationTime() { + return creationTime; + } + + public void setCreationTime(Instant creationTime) { + this.creationTime = creationTime; + } + + public String getComment() { + return comment; + } + + public void setComment(String comment) { + this.comment = comment; + } + + public String getSignature() { + return signature; + } + + public void setSignature(String signature) { + this.signature = signature; + } + + public String getReference() { + return reference; + } + + public void setReference(String reference) { + this.reference = reference; + } + + public List getBlocks() { + return blocks; + } + + public void setBlocks(List blocks) { + this.blocks = blocks; + } +} diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/POS.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/POS.java index 9b659945..3ca8f32c 100644 --- a/src/main/java/com/worksap/nlp/sudachi/dictionary/POS.java +++ b/src/main/java/com/worksap/nlp/sudachi/dictionary/POS.java @@ -29,6 +29,7 @@ public final class POS extends AbstractList { public final static int DEPTH = 6; public final static int MAX_COMPONENT_LENGTH = 127; + public static final int MAX_BINARY_LENGTH = DEPTH * (MAX_COMPONENT_LENGTH + 1) * 2; private final String[] elems; /** diff --git 
a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/BufWriter.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/BufWriter.java index d2081ea1..d89b2219 100644 --- a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/BufWriter.java +++ b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/BufWriter.java @@ -19,6 +19,7 @@ import com.worksap.nlp.sudachi.dictionary.Ints; import java.nio.ByteBuffer; +import java.nio.charset.StandardCharsets; public class BufWriter { private final ByteBuffer buffer; @@ -34,7 +35,16 @@ public BufWriter putByte(byte val) { // Encode int as LEB128 public BufWriter putVarint32(int val) { - if (val <= 127) { + if ((val & 0xff) == 0) { + putByte((byte) val); + } else { + putVarintSlow(val & 0xffff_ffffL); + } + return this; + } + + public BufWriter putVarint64(long val) { + if ((val & 0xff) == 0) { putByte((byte) val); } else { putVarintSlow(val); @@ -65,7 +75,7 @@ public BufWriter putInts(Ints value, int length) { if (length <= 0) { return this; } - ByteBuffer buf = buffer; + ByteBuffer buf = buffer; // read field only once int pos = buf.position(); for (int i = 0; i < length; ++i) { buf.putInt(pos + i * 4, value.get(i)); @@ -74,4 +84,25 @@ public BufWriter putInts(Ints value, int length) { return this; } + /** + * Put string which has length < Short.MAX_VALUE + * + * @param s + * string to put in the buffer + */ + public void putShortString(String s) { + int length = s.length(); + assert length < Short.MAX_VALUE; + putShort((short) length); + for (int i = 0; i < length; ++i) { + putShort((short) s.charAt(i)); + } + } + + public BufWriter putStringUtf8(String s) { + byte[] bytes = s.getBytes(StandardCharsets.UTF_8); + putVarint32(bytes.length); + buffer.put(bytes); + return this; + } } diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/ConnectionMatrix.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/ConnectionMatrix.java index 1101ed14..ec7a21a6 100644 --- 
a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/ConnectionMatrix.java +++ b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/ConnectionMatrix.java @@ -147,4 +147,15 @@ public short getNumLeft() { public short getNumRight() { return numRight; } + + public boolean nonEmpty() { + return numLeft > 0 || numRight > 0; + } + + public Void compile(BlockOutput out) throws IOException { + return out.measured("Connection Matrix", (p) -> { + out.getChannel().write(compiled.duplicate()); + return null; + }); + } } diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/DicBuilder2.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/DicBuilder2.java index d9291ca6..d3ed1000 100644 --- a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/DicBuilder2.java +++ b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/DicBuilder2.java @@ -16,20 +16,27 @@ package com.worksap.nlp.sudachi.dictionary.build; +import com.worksap.nlp.sudachi.dictionary.Blocks; +import com.worksap.nlp.sudachi.dictionary.DictionaryAccess; + import java.io.IOException; import java.io.InputStream; +import java.net.URL; +import java.net.URLConnection; import java.nio.channels.SeekableByteChannel; import java.nio.file.Files; import java.nio.file.Path; import java.nio.file.Paths; import java.nio.file.StandardOpenOption; +import static java.lang.System.nanoTime; + public class DicBuilder2 { private DicBuilder2() { // no instances } - public static class Base> { + private static class Base> { protected final POSTable pos = new POSTable(); protected final ConnectionMatrix connection = new ConnectionMatrix(); protected Progress progress = Progress.NOOP; @@ -41,21 +48,90 @@ private T self() { } public T lexicon(String name, IOSupplier input, long size) throws IOException { - progress.startBlock(name, System.nanoTime(), Progress.Kind.INPUT); + progress.startBlock(name, nanoTime(), Progress.Kind.INPUT); try (InputStream is = input.get()) { InputStream stream = new 
TrackingInputStream(is); lexicon.read(name, stream, pos); } - progress.endBlock(size, System.nanoTime()); + progress.endBlock(size, nanoTime()); return self(); } + public T lexicon(URL url) throws IOException { + String name = url.getPath(); + URLConnection conn = url.openConnection(); + long size = conn.getContentLengthLong(); + return lexicon(name, conn::getInputStream, size); + } + + public T lexicon(Path path) throws IOException { + String name = path.getFileName().toString(); + long size = Files.size(path); + return lexicon(name, () -> Files.newInputStream(path), size); + } + public void write(SeekableByteChannel channel) throws IOException { BlockLayout layout = new BlockLayout(channel, progress); + if (connection.nonEmpty()) { + layout.block(Blocks.CONNECTION_MATRIX, connection::compile); + } + layout.block(Blocks.POS_TABLE, pos::compile); lexicon.compile(pos, layout); } } + public static final class System extends Base { + private System readMatrix(String name, IOSupplier input, long size) throws IOException { + progress.startBlock(name, nanoTime(), Progress.Kind.INPUT); + try (InputStream is = input.get()) { + InputStream stream = new ProgressInputStream(is, size, progress); + connection.readEntries(stream); + } + progress.endBlock(size, nanoTime()); + return this; + } + } + + public static final class SystemNoMatrix { + private final System inner; + + private SystemNoMatrix(DicBuilder2.System inner) { + this.inner = inner; + } + + public DicBuilder2.System matrix(String name, IOSupplier data, long size) throws IOException { + return inner.readMatrix(name, data, size); + } + + public DicBuilder2.System matrix(URL data) throws IOException { + String name = data.getPath(); + URLConnection conn = data.openConnection(); + long size = conn.getContentLengthLong(); + return matrix(name, conn::getInputStream, size); + } + + public DicBuilder2.System matrix(Path path) throws IOException { + String name = path.getFileName().toString(); + long size = 
Files.size(path); + return matrix(name, () -> Files.newInputStream(path), size); + } + } + + public static final class User extends Base { + private User(DictionaryAccess system) { + pos.preloadFrom(system.getGrammar()); + + } + } + + public static SystemNoMatrix system() { + return new SystemNoMatrix(new System()); + } + + public static User user(DictionaryAccess system) { + return new User(system); + } + public static void main(String[] args) throws IOException { Base b = new Base<>(); Path input = Paths.get(args[0]); diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/Index.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/Index.java index 651e7643..32bb85f6 100644 --- a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/Index.java +++ b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/Index.java @@ -95,16 +95,17 @@ public void writeTo(ModelOutput output) throws IOException { output.write(wordIdTable); } - public void compile(BlockLayout layout) throws IOException { - TrieData data = layout.block(Blocks.WORD_ID_TABLE, this::writeWordTable); + public void compile(BlockLayout layout, List notIndexed) throws IOException { + TrieData data = layout.block(Blocks.WORD_POINTERS, (o) -> writeWordTable(o, notIndexed)); layout.block(Blocks.TRIE_INDEX, data::writeTrie); } - private TrieData writeWordTable(BlockOutput out) throws IOException { + private TrieData writeWordTable(BlockOutput out, List notIndexed) throws IOException { int size = this.elements.size(); byte[][] keys = new byte[size][]; int[] values = new int[size]; - ChanneledBuffer buffer = new ChanneledBuffer(out.getChannel()); + ChanneledBuffer buffer = new ChanneledBuffer(out.getChannel(), + Math.max((notIndexed.size() + 16) * 5, 64 * 1024)); out.measured("Word Id table", (p) -> { int i = 0; @@ -125,6 +126,16 @@ private TrieData writeWordTable(BlockOutput out) throws IOException { } p.progress(i, size); } + // write non-indexed entries for being able to iterate over all 
word entries + int nis = notIndexed.size(); + BufWriter buf = buffer.writer((nis + 1) * 5); + buf.putVarint32(nis); + int prevId = 0; + for (Lookup2.Entry e : notIndexed) { + int wid = e.pointer(); + buf.putVarint32(wid - prevId); + prevId = wid; + } return null; }); diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/POSTable.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/POSTable.java index 18851928..7edf0724 100644 --- a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/POSTable.java +++ b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/POSTable.java @@ -76,4 +76,21 @@ public int ownedLength() { return table.size() - builtin; } + public Void compile(BlockOutput out) throws IOException { + return out.measured("POS Table", (p) -> { + ChanneledBuffer cbuf = new ChanneledBuffer(out.getChannel()); + cbuf.byteBuffer(2).putShort((short) table.size()); + for (int i = 0; i < table.size(); ++i) { + BufWriter writer = cbuf.writer(POS.MAX_BINARY_LENGTH); + POS pos = table.get(i); + for (String s : pos) { + // strings are always shorter than POS.MAX + writer.putShortString(s); + } + p.progress(i, table.size()); + } + return null; + }); + + } } diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/RawLexicon.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/RawLexicon.java index 3bf8c214..64cc2f31 100644 --- a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/RawLexicon.java +++ b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/RawLexicon.java @@ -32,6 +32,7 @@ public class RawLexicon { private static final int INITIAL_OFFSET = 32; private final StringStorage strings = new StringStorage(); private final List entries = new ArrayList<>(); + private final List notIndexed = new ArrayList<>(); private final Index index = new Index(); private boolean user; @@ -57,6 +58,8 @@ public void read(String name, Reader data, POSTable posTable) throws IOException checkOffset(offset); if (entry.shouldBeIndexed()) 
{ index.add(entry.headword, entry.pointer); + } else { + notIndexed.add(entry); } } this.offset = offset; @@ -76,7 +79,7 @@ public void checkOffset(long offset) { } public void compile(POSTable pos, BlockLayout layout) throws IOException { - index.compile(layout); + index.compile(layout, notIndexed); layout.block(Blocks.STRINGS, this::writeStrings); layout.block(Blocks.ENTRIES, (p) -> writeEntries(pos, p)); } diff --git a/src/test/java/com/worksap/nlp/sudachi/dictionary/BufReaderTest.kt b/src/test/java/com/worksap/nlp/sudachi/dictionary/BufReaderTest.kt new file mode 100644 index 00000000..3cfc1a52 --- /dev/null +++ b/src/test/java/com/worksap/nlp/sudachi/dictionary/BufReaderTest.kt @@ -0,0 +1,128 @@ +/* + * Copyright (c) 2022 Works Applications Co., Ltd. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package com.worksap.nlp.sudachi.dictionary + +import com.worksap.nlp.sudachi.dictionary.build.BufWriter +import java.nio.ByteBuffer +import kotlin.test.Test +import kotlin.test.assertEquals + +fun checkLong(x: Long) { + val bb = ByteBuffer.allocate(32) + val w = BufWriter(bb) + w.putVarint64(x) + bb.flip() + val r = BufReader(bb) + val y = r.readVarint64() + assertEquals(x, y) +} + +fun checkInt(x: Int) { + val bb = ByteBuffer.allocate(32) + val w = BufWriter(bb) + w.putVarint32(x) + bb.flip() + val r = BufReader(bb) + val y = r.readVarint32() + assertEquals(x, y) +} + +fun checkUtf8String(s: String) { + val bb = ByteBuffer.allocate(32) + val w = BufWriter(bb) + w.putStringUtf8(s) + bb.flip() + val r = BufReader(bb) + val y = r.readUtf8String() + assertEquals(s, y) +} + +class BufReaderTest { + @Test + fun varint64() { + checkLong(0L.inv()) + checkLong(0x0) + checkLong(0x1) + checkLong(0x80) + checkLong(0xff) + checkLong(0x4ff) + checkLong(0xfff) + checkLong(0x4fff) + checkLong(0xffff) + checkLong(0x4_ffff) + checkLong(0xf_ffff) + checkLong(0x4f_ffff) + checkLong(0xff_ffff) + checkLong(0x4ff_ffff) + checkLong(0xfff_ffff) + checkLong(0x4fff_ffff) + checkLong(0xffff_ffff) + checkLong(0x4_ffff_ffff) + checkLong(0xf_ffff_ffff) + checkLong(0x4f_ffff_ffff) + checkLong(0xff_ffff_ffff) + checkLong(0x4ff_ffff_ffff) + checkLong(0xfff_ffff_ffff) + checkLong(0x4fff_ffff_ffff) + checkLong(0xffff_ffff_ffff) + checkLong(0x4_ffff_ffff_ffff) + checkLong(0xf_ffff_ffff_ffff) + checkLong(0x4f_ffff_ffff_ffff) + checkLong(0xff_ffff_ffff_ffff) + checkLong(0x4ff_ffff_ffff_ffff) + checkLong(0xfff_ffff_ffff_ffff) + checkLong(0x4fff_ffff_ffff_ffff) + checkLong(0x5fff_ffff_ffff_ffff) + checkLong(0x6fff_ffff_ffff_ffff) + checkLong(0x7fff_ffff_ffff_ffff) + checkLong(0x1111_1111_1111_1111) + checkLong(0x2222_2222_2222_2222) + checkLong(0x3333_3333_3333_3333) + checkLong(0x5555_5555_5555_5555) + } + + @Test + fun varint32() { + checkInt(0.inv()) + checkInt(0x0) + checkInt(0x1) + 
checkInt(0x80) + checkInt(0xff) + checkInt(0x4ff) + checkInt(0xfff) + checkInt(0x4fff) + checkInt(0xffff) + checkInt(0x4_ffff) + checkInt(0xf_ffff) + checkInt(0x4f_ffff) + checkInt(0xff_ffff) + checkInt(0x4ff_ffff) + checkInt(0xfff_ffff) + checkInt(0x4fff_ffff) + } + + @Test + fun utf8String() { + checkUtf8String("") + checkUtf8String("test") + checkUtf8String("привет") + checkUtf8String("こんにちは") + checkUtf8String("東京都") + checkUtf8String("""👨‍👩‍👧‍👦""") + checkUtf8String("""t東e京s💞t都""") + } +} From fda2d481faf3272354829f9a8d309257769deae2 Mon Sep 17 00:00:00 2001 From: Arseny Tolmachev Date: Wed, 14 Sep 2022 17:03:46 +0900 Subject: [PATCH 15/94] write dictionary header --- .../nlp/sudachi/dictionary/Description.java | 41 +++++++++++++++++-- .../sudachi/dictionary/build/BufWriter.java | 4 ++ .../sudachi/dictionary/build/DicBuilder2.java | 4 +- 3 files changed, 44 insertions(+), 5 deletions(-) diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/Description.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/Description.java index 4d655497..a18bf6e7 100644 --- a/src/main/java/com/worksap/nlp/sudachi/dictionary/Description.java +++ b/src/main/java/com/worksap/nlp/sudachi/dictionary/Description.java @@ -16,24 +16,28 @@ package com.worksap.nlp.sudachi.dictionary; +import com.worksap.nlp.sudachi.dictionary.build.BufWriter; + import java.io.IOException; import java.nio.ByteBuffer; import java.nio.ByteOrder; import java.nio.channels.SeekableByteChannel; import java.nio.charset.StandardCharsets; import java.time.Instant; +import java.time.format.DateTimeFormatter; import java.util.ArrayList; import java.util.List; +import java.util.Locale; +import java.util.Random; public class Description { - - private Instant creationTime; + private Instant creationTime = Instant.now(); private String comment; - private String signature; + private String signature = defaultSignature(creationTime); - private String reference; + private String reference; private List blocks = 
new ArrayList<>(); @@ -83,6 +87,30 @@ private static Description loadV1(ByteBuffer raw) { return desc; } + public void save(SeekableByteChannel channel) throws IOException { + ByteBuffer buff = ByteBuffer.allocate(4096); + buff.put(MAGIC_BYTES); + BufWriter writer = new BufWriter(buff); + writer.putLong(1); // version + writer.putVarint64(creationTime.getEpochSecond()); + writer.putStringUtf8(comment); + writer.putStringUtf8(signature); + writer.putStringUtf8(reference); + int length = blocks.size(); + writer.putVarint32(length); + for (Block b: blocks) { + writer.putStringUtf8(b.name); + writer.putVarint64(b.start); + writer.putVarint64(b.size); + } + + long pos = channel.position(); + channel.position(0); + buff.reset(); + channel.write(buff); + channel.position(pos); + } + public final static byte[] MAGIC_BYTES = "SudachiBinaryDic".getBytes(StandardCharsets.UTF_8); private static void checkMagic(ByteBuffer raw) { @@ -106,6 +134,11 @@ private static void checkLegacyDictionaryFormat(ByteBuffer raw) { } } + private String defaultSignature(Instant date) { + DateTimeFormatter formatter = DateTimeFormatter.ofPattern("yyyyMMddHHmmss", Locale.ROOT); + return String.format("%s-%08x", formatter.format(date), new Random().nextLong()); + } + public Instant getCreationTime() { return creationTime; } diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/BufWriter.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/BufWriter.java index d89b2219..5186030e 100644 --- a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/BufWriter.java +++ b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/BufWriter.java @@ -105,4 +105,8 @@ public BufWriter putStringUtf8(String s) { buffer.put(bytes); return this; } + + public void putLong(long x) { + buffer.putLong(x); + } } diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/DicBuilder2.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/DicBuilder2.java index d3ed1000..e786c509 100644 
--- a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/DicBuilder2.java +++ b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/DicBuilder2.java @@ -17,6 +17,7 @@ package com.worksap.nlp.sudachi.dictionary.build; import com.worksap.nlp.sudachi.dictionary.Blocks; +import com.worksap.nlp.sudachi.dictionary.Description; import com.worksap.nlp.sudachi.dictionary.DictionaryAccess; import java.io.IOException; @@ -41,6 +42,7 @@ private static class Base> { protected final ConnectionMatrix connection = new ConnectionMatrix(); protected Progress progress = Progress.NOOP; protected RawLexicon lexicon = new RawLexicon(); + protected final Description description = new Description(); @SuppressWarnings("unchecked") private T self() { @@ -120,7 +122,7 @@ public DicBuilder2.System matrix(Path path) throws IOException { public static final class User extends Base { private User(DictionaryAccess system) { pos.preloadFrom(system.getGrammar()); - + description.setSignature(""); } } From 3c4754a77048ac5af9819773a6eff0834ca60e95 Mon Sep 17 00:00:00 2001 From: Arseny Tolmachev Date: Thu, 15 Sep 2022 09:26:48 +0900 Subject: [PATCH 16/94] clean up BufReaderTest --- .../com/worksap/nlp/sudachi/Settings.java | 1 - .../nlp/sudachi/dictionary/Description.java | 6 ++-- .../sudachi/dictionary/build/BufWriter.java | 2 +- .../nlp/sudachi/dictionary/BufReaderTest.kt | 34 ++++++------------- 4 files changed, 14 insertions(+), 29 deletions(-) diff --git a/src/main/java/com/worksap/nlp/sudachi/Settings.java b/src/main/java/com/worksap/nlp/sudachi/Settings.java index 0ec45c90..94c6b014 100644 --- a/src/main/java/com/worksap/nlp/sudachi/Settings.java +++ b/src/main/java/com/worksap/nlp/sudachi/Settings.java @@ -146,7 +146,6 @@ public Settings read(URL resource) throws IOException { /** * Read a settings from a JSON string. - *

* * @param path * will add additional {@link PathAnchor} to this path if not diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/Description.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/Description.java index a18bf6e7..b0e07c95 100644 --- a/src/main/java/com/worksap/nlp/sudachi/dictionary/Description.java +++ b/src/main/java/com/worksap/nlp/sudachi/dictionary/Description.java @@ -37,7 +37,7 @@ public class Description { private String signature = defaultSignature(creationTime); - private String reference; + private String reference; private List blocks = new ArrayList<>(); @@ -98,7 +98,7 @@ public void save(SeekableByteChannel channel) throws IOException { writer.putStringUtf8(reference); int length = blocks.size(); writer.putVarint32(length); - for (Block b: blocks) { + for (Block b : blocks) { writer.putStringUtf8(b.name); writer.putVarint64(b.start); writer.putVarint64(b.size); @@ -111,7 +111,7 @@ public void save(SeekableByteChannel channel) throws IOException { channel.position(pos); } - public final static byte[] MAGIC_BYTES = "SudachiBinaryDic".getBytes(StandardCharsets.UTF_8); + private final static byte[] MAGIC_BYTES = "SudachiBinaryDic".getBytes(StandardCharsets.UTF_8); private static void checkMagic(ByteBuffer raw) { assert MAGIC_BYTES.length == 16; diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/BufWriter.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/BufWriter.java index 5186030e..f8688134 100644 --- a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/BufWriter.java +++ b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/BufWriter.java @@ -85,7 +85,7 @@ public BufWriter putInts(Ints value, int length) { } /** - * Put string which has length < Short.MAX_VALUE + * Put string which has length is shorter than Short.MAX_VALUE * * @param s * string to put in the buffer diff --git a/src/test/java/com/worksap/nlp/sudachi/dictionary/BufReaderTest.kt 
b/src/test/java/com/worksap/nlp/sudachi/dictionary/BufReaderTest.kt index 3cfc1a52..a6cfafae 100644 --- a/src/test/java/com/worksap/nlp/sudachi/dictionary/BufReaderTest.kt +++ b/src/test/java/com/worksap/nlp/sudachi/dictionary/BufReaderTest.kt @@ -21,39 +21,23 @@ import java.nio.ByteBuffer import kotlin.test.Test import kotlin.test.assertEquals -fun checkLong(x: Long) { +inline fun check( + crossinline fin: (BufWriter, T) -> Unit, + crossinline fout: (BufReader) -> T +): (T) -> Unit = { val bb = ByteBuffer.allocate(32) val w = BufWriter(bb) - w.putVarint64(x) + fin(w, it) bb.flip() val r = BufReader(bb) - val y = r.readVarint64() - assertEquals(x, y) -} - -fun checkInt(x: Int) { - val bb = ByteBuffer.allocate(32) - val w = BufWriter(bb) - w.putVarint32(x) - bb.flip() - val r = BufReader(bb) - val y = r.readVarint32() - assertEquals(x, y) -} - -fun checkUtf8String(s: String) { - val bb = ByteBuffer.allocate(32) - val w = BufWriter(bb) - w.putStringUtf8(s) - bb.flip() - val r = BufReader(bb) - val y = r.readUtf8String() - assertEquals(s, y) + val y = fout(r) + assertEquals(it, y) } class BufReaderTest { @Test fun varint64() { + val checkLong = check({ w, x -> w.putVarint64(x) }, { it.readVarint64() }) checkLong(0L.inv()) checkLong(0x0) checkLong(0x1) @@ -97,6 +81,7 @@ class BufReaderTest { @Test fun varint32() { + val checkInt = check({ w, x -> w.putVarint32(x) }, { it.readVarint32() }) checkInt(0.inv()) checkInt(0x0) checkInt(0x1) @@ -117,6 +102,7 @@ class BufReaderTest { @Test fun utf8String() { + val checkUtf8String = check({ w, x -> w.putStringUtf8(x) }, { it.readUtf8String() }) checkUtf8String("") checkUtf8String("test") checkUtf8String("привет") From 75c2b6333a18aeddd0508935c7985bc131b96fb4 Mon Sep 17 00:00:00 2001 From: Arseny Tolmachev Date: Tue, 18 Oct 2022 20:10:08 +0900 Subject: [PATCH 17/94] dictionary new format wip --- docs/changes.md | 5 + .../nlp/sudachi/JapaneseDictionary.java | 2 +- .../nlp/sudachi/JapaneseTokenizer.java | 18 +- 
.../nlp/sudachi/JoinKatakanaOovPlugin.java | 7 +- .../nlp/sudachi/JoinNumericPlugin.java | 22 +- .../java/com/worksap/nlp/sudachi/Lattice.java | 2 +- .../com/worksap/nlp/sudachi/LatticeImpl.java | 27 +- .../worksap/nlp/sudachi/LatticeNodeImpl.java | 221 ++++++++++-- .../nlp/sudachi/MeCabOovProviderPlugin.java | 42 +-- .../com/worksap/nlp/sudachi/Morpheme.java | 4 +- .../com/worksap/nlp/sudachi/MorphemeImpl.java | 47 +-- .../com/worksap/nlp/sudachi/MorphemeList.java | 22 +- .../nlp/sudachi/OovProviderPlugin.java | 11 +- .../nlp/sudachi/PathRewritePlugin.java | 49 ++- .../com/worksap/nlp/sudachi/PosMatcher.java | 2 +- .../worksap/nlp/sudachi/RegexOovProvider.java | 20 +- .../nlp/sudachi/SimpleOovProviderPlugin.java | 21 +- .../java/com/worksap/nlp/sudachi/WordId.java | 21 ++ .../sudachi/dictionary/BinaryDictionary.java | 31 +- .../nlp/sudachi/dictionary/BufReader.java | 6 +- .../sudachi/dictionary/CompactedStrings.java | 23 ++ .../nlp/sudachi/dictionary/Connection.java | 13 + .../nlp/sudachi/dictionary/Description.java | 128 ++++++- .../sudachi/dictionary/DictionaryBuilder.java | 9 +- .../sudachi/dictionary/DictionaryPrinter.java | 194 ++++++++--- .../dictionary/DoubleArrayLexicon.java | 130 ++++--- .../nlp/sudachi/dictionary/GrammarImpl.java | 5 + .../worksap/nlp/sudachi/dictionary/Ints.java | 44 ++- .../nlp/sudachi/dictionary/Lexicon.java | 71 +--- .../nlp/sudachi/dictionary/LexiconSet.java | 42 +-- .../dictionary/UserDictionaryBuilder.java | 2 +- .../nlp/sudachi/dictionary/WordIdTable.java | 74 ++-- .../nlp/sudachi/dictionary/WordInfo.java | 189 ++++++----- .../nlp/sudachi/dictionary/WordInfoList.java | 85 +---- .../sudachi/dictionary/WordParameters.java | 51 +++ .../sudachi/dictionary/build/BlockLayout.java | 19 +- .../sudachi/dictionary/build/BufWriter.java | 4 +- .../dictionary/build/ChanneledBuffer.java | 1 + .../sudachi/dictionary/build/CsvLexicon.java | 61 +--- .../sudachi/dictionary/build/DicBuilder.java | 320 +++++++++++------- 
.../sudachi/dictionary/build/DicBuilder2.java | 148 -------- .../dictionary/build/InMemoryChannel.java | 7 +- .../sudachi/dictionary/build/POSTable.java | 1 + .../sudachi/dictionary/build/RawLexicon.java | 16 + .../sudachi/dictionary/build/WordLookup.java | 119 ------- .../sudachi/JoinKatakanaOovPluginTest.java | 16 +- .../nlp/sudachi/JoinNumericPluginTest.java | 80 ++--- .../sudachi/MeCabOovProviderPluginTest.java | 12 +- .../com/worksap/nlp/sudachi/TestDictionary.kt | 2 +- .../com/worksap/nlp/sudachi/TestLattice.java | 11 + .../nlp/sudachi/dictionary/BufReaderTest.kt | 13 + .../nlp/sudachi/dictionary/DescriptionTest.kt | 29 ++ .../dictionary/DictionaryBuilderTest.java | 21 +- .../sudachi/dictionary/DictionaryReader.java | 42 --- .../dictionary/DoubleArrayLexiconTest.java | 11 +- .../dictionary/UserDictionaryBuilderTest.java | 19 +- .../sudachi/dictionary/build/SystemDicTest.kt | 72 ++-- .../sudachi/dictionary/build/UserDicTest.kt | 19 +- .../java/com/worksap/nlp/sudachi/morphemes.kt | 27 ++ 59 files changed, 1510 insertions(+), 1200 deletions(-) create mode 100644 docs/changes.md create mode 100644 src/main/java/com/worksap/nlp/sudachi/dictionary/CompactedStrings.java create mode 100644 src/main/java/com/worksap/nlp/sudachi/dictionary/WordParameters.java delete mode 100644 src/main/java/com/worksap/nlp/sudachi/dictionary/build/DicBuilder2.java create mode 100644 src/test/java/com/worksap/nlp/sudachi/TestLattice.java create mode 100644 src/test/java/com/worksap/nlp/sudachi/dictionary/DescriptionTest.kt delete mode 100644 src/test/java/com/worksap/nlp/sudachi/dictionary/DictionaryReader.java create mode 100644 src/test/java/com/worksap/nlp/sudachi/morphemes.kt diff --git a/docs/changes.md b/docs/changes.md new file mode 100644 index 00000000..a2fb24ca --- /dev/null +++ b/docs/changes.md @@ -0,0 +1,5 @@ +# 0.8.0 + +### ABI-incompatible +* `Morpheme.partOfSpeech` returns `POS` object instead of `List` +* `Lexicon.getCost`, `Lexicon.getLeftId`, `Lexicon.getRightId` 
are replaced with `lexicon.parameters` which returns all three values packed, at once. \ No newline at end of file diff --git a/src/main/java/com/worksap/nlp/sudachi/JapaneseDictionary.java b/src/main/java/com/worksap/nlp/sudachi/JapaneseDictionary.java index 5b598a07..2da590d1 100644 --- a/src/main/java/com/worksap/nlp/sudachi/JapaneseDictionary.java +++ b/src/main/java/com/worksap/nlp/sudachi/JapaneseDictionary.java @@ -98,7 +98,7 @@ void addUserDictionary(BinaryDictionary dictionary) { DoubleArrayLexicon userLexicon = dictionary.getLexicon(); Tokenizer tokenizer = new JapaneseTokenizer(grammar, lexicon, inputTextPlugins, oovProviderPlugins, Collections.emptyList()); - userLexicon.calculateCost(tokenizer); + userLexicon.calculateDynamicCosts(tokenizer); lexicon.add(userLexicon, (short) grammar.getPartOfSpeechSize()); grammar.addPosList(dictionary.getGrammar()); diff --git a/src/main/java/com/worksap/nlp/sudachi/JapaneseTokenizer.java b/src/main/java/com/worksap/nlp/sudachi/JapaneseTokenizer.java index 201d59b5..d9909314 100644 --- a/src/main/java/com/worksap/nlp/sudachi/JapaneseTokenizer.java +++ b/src/main/java/com/worksap/nlp/sudachi/JapaneseTokenizer.java @@ -173,7 +173,7 @@ MorphemeList tokenizeSentence(Tokenizer.SplitMode mode, UTF8InputText input) { jsonBuilder.add("lattice", lattice.toJson()); } - List path = lattice.getBestPath(); + List path = lattice.getBestPath(); if (dumpOutput != null) { dumpOutput.println("=== Before rewriting:"); @@ -224,8 +224,7 @@ LatticeImpl buildLattice(UTF8InputText input) { int[] wordIds = wordLookup.getWordsIds(); for (int word = 0; word < numWords; ++word) { int wordId = wordIds[word]; - LatticeNodeImpl n = new LatticeNodeImpl(lexicon, lexicon.getLeftId(wordId), - lexicon.getRightId(wordId), lexicon.getCost(wordId), wordId); + LatticeNodeImpl n = new LatticeNodeImpl(lexicon, lexicon.parameters(wordId), wordId); lattice.insert(byteBoundary, end, n); unkNodes.add(n); wordMask = WordMask.addNth(wordMask, end - byteBoundary); 
@@ -266,16 +265,15 @@ private long provideOovs(OovProviderPlugin plugin, UTF8InputText input, ArrayLis return wordMask; } - private List splitPath(List path, SplitMode mode) { - List newPath = new ArrayList<>(); - for (LatticeNode node : path) { - LatticeNodeImpl nodeImpl = (LatticeNodeImpl) node; - nodeImpl.appendSplitsTo(newPath, mode); + private List splitPath(List path, SplitMode mode) { + List newPath = new ArrayList<>(); + for (LatticeNodeImpl node : path) { + node.appendSplitsTo(newPath, mode); } return newPath; } - void dumpPath(List path) { + void dumpPath(List path) { int i = 0; for (LatticeNode node : path) { dumpOutput.printf("%d: %s\n", i, node.toString()); @@ -283,7 +281,7 @@ void dumpPath(List path) { } } - JsonArrayBuilder pathToJson(List path, LatticeImpl lattice) { + JsonArrayBuilder pathToJson(List path, LatticeImpl lattice) { JsonArrayBuilder builder = Json.createArrayBuilder(); for (LatticeNode node : path) { builder.add(lattice.nodeToJson((LatticeNodeImpl) node)); diff --git a/src/main/java/com/worksap/nlp/sudachi/JoinKatakanaOovPlugin.java b/src/main/java/com/worksap/nlp/sudachi/JoinKatakanaOovPlugin.java index 88943475..d00fc3cd 100644 --- a/src/main/java/com/worksap/nlp/sudachi/JoinKatakanaOovPlugin.java +++ b/src/main/java/com/worksap/nlp/sudachi/JoinKatakanaOovPlugin.java @@ -47,6 +47,7 @@ class JoinKatakanaOovPlugin extends PathRewritePlugin { short oovPosId; int minLength; + private LatticeNodeImpl.OOVFactory factory; @Override public void setUp(Grammar grammar) { @@ -62,10 +63,12 @@ public void setUp(Grammar grammar) { if (minLength < 0) { throw new IllegalArgumentException("minLength is negative"); } + + factory = LatticeNodeImpl.oovFactory((short) -1, (short) -1, (short) -1, oovPosId); } @Override - public void rewrite(InputText text, List path, Lattice lattice) { + public void rewrite(InputText text, List path, Lattice lattice) { for (int i = 0; i < path.size(); i++) { LatticeNode node = path.get(i); if ((node.isOOV() || 
isShorter(minLength, text, node)) && isKatakanaNode(text, node)) { @@ -89,7 +92,7 @@ public void rewrite(InputText text, List path, Lattice lattice) { begin++; } if (end - begin > 1) { - concatenateOov(path, begin, end, oovPosId, lattice); + concatenateOov(path, begin, end, factory, lattice); i = begin + 1; } } diff --git a/src/main/java/com/worksap/nlp/sudachi/JoinNumericPlugin.java b/src/main/java/com/worksap/nlp/sudachi/JoinNumericPlugin.java index 1de96777..ad74ff43 100644 --- a/src/main/java/com/worksap/nlp/sudachi/JoinNumericPlugin.java +++ b/src/main/java/com/worksap/nlp/sudachi/JoinNumericPlugin.java @@ -16,12 +16,12 @@ package com.worksap.nlp.sudachi; -import java.util.Arrays; -import java.util.List; -import java.util.Set; - import com.worksap.nlp.sudachi.dictionary.CategoryType; import com.worksap.nlp.sudachi.dictionary.Grammar; +import com.worksap.nlp.sudachi.dictionary.POS; + +import java.util.List; +import java.util.Set; /** * A plugin for concatenation of the numerics. @@ -47,7 +47,7 @@ */ class JoinNumericPlugin extends PathRewritePlugin { - static final List NUMERIC_POS = Arrays.asList("名詞", "数詞", "*", "*", "*", "*"); + static final POS NUMERIC_POS = new POS("名詞", "数詞", "*", "*", "*", "*"); boolean enableNormalize; short numericPOSId; @@ -59,16 +59,16 @@ public void setUp(Grammar grammar) { } @Override - public void rewrite(InputText text, List path, Lattice lattice) { + public void rewrite(InputText text, List path, Lattice lattice) { int beginIndex = -1; boolean commaAsDigit = true; boolean periodAsDigit = true; NumericParser parser = new NumericParser(); for (int i = 0; i < path.size(); i++) { - LatticeNode node = path.get(i); + LatticeNodeImpl node = path.get(i); Set types = getCharCategoryTypes(text, node); - String s = node.getWordInfo().getNormalizedForm(); + String s = node.getNormalizedForm(); if (types.contains(CategoryType.NUMERIC) || types.contains(CategoryType.KANJINUMERIC) || (periodAsDigit && s.equals(".")) || (commaAsDigit && 
s.equals(","))) { @@ -99,7 +99,7 @@ public void rewrite(InputText text, List path, Lattice lattice) { concat(path, beginIndex, i, lattice, parser); i = beginIndex + 1; } else { - String ss = path.get(i - 1).getWordInfo().getNormalizedForm(); + String ss = path.get(i - 1).getNormalizedForm(); if ((parser.errorState == NumericParser.Error.COMMA && ss.equals(",")) || (parser.errorState == NumericParser.Error.POINT && ss.equals("."))) { concat(path, beginIndex, i - 1, lattice, parser); @@ -121,7 +121,7 @@ public void rewrite(InputText text, List path, Lattice lattice) { if (parser.done()) { concat(path, beginIndex, path.size(), lattice, parser); } else { - String ss = path.get(path.size() - 1).getWordInfo().getNormalizedForm(); + String ss = path.get(path.size() - 1).getNormalizedForm(); if ((parser.errorState == NumericParser.Error.COMMA && ss.equals(",")) || (parser.errorState == NumericParser.Error.POINT && ss.equals("."))) { concat(path, beginIndex, path.size() - 1, lattice, parser); @@ -130,7 +130,7 @@ public void rewrite(InputText text, List path, Lattice lattice) { } } - private void concat(List path, int begin, int end, Lattice lattice, NumericParser parser) { + private void concat(List path, int begin, int end, Lattice lattice, NumericParser parser) { if (path.get(begin).getWordInfo().getPOSId() != numericPOSId) return; if (enableNormalize) { diff --git a/src/main/java/com/worksap/nlp/sudachi/Lattice.java b/src/main/java/com/worksap/nlp/sudachi/Lattice.java index 09f3d0f1..479c3c65 100644 --- a/src/main/java/com/worksap/nlp/sudachi/Lattice.java +++ b/src/main/java/com/worksap/nlp/sudachi/Lattice.java @@ -72,7 +72,7 @@ public interface Lattice { * the index to after the last position in the input text * @return the node which start at {@code begin} and end at {@code end} */ - public Optional getMinimumNode(int begin, int end); + public LatticeNodeImpl getMinimumNode(int begin, int end); /** * Insert the node at the specified index. 
diff --git a/src/main/java/com/worksap/nlp/sudachi/LatticeImpl.java b/src/main/java/com/worksap/nlp/sudachi/LatticeImpl.java index 0c2da783..6eda4c0c 100644 --- a/src/main/java/com/worksap/nlp/sudachi/LatticeImpl.java +++ b/src/main/java/com/worksap/nlp/sudachi/LatticeImpl.java @@ -46,9 +46,9 @@ class LatticeImpl implements Lattice { endLists = new ArrayList<>(); LatticeNodeImpl bosNode = new LatticeNodeImpl(); + bosNode.bestPreviousNode = bosNode; short[] bosParams = grammar.getBOSParameter(); bosNode.setParameter(bosParams[0], bosParams[1], bosParams[2]); - bosNode.isConnectedToBOS = true; // endLists should not contain anything except ArrayLists // it is crucial to have monomorphic dispatch here ArrayList bos = new ArrayList<>(); @@ -94,9 +94,17 @@ public List getNodes(int begin, int end) { } @Override - public Optional getMinimumNode(int begin, int end) { - return endLists.get(end).stream().filter(n -> (n.getBegin() == begin)) - .min(Comparator.comparingInt(l -> l.cost)); + public LatticeNodeImpl getMinimumNode(int begin, int end) { + ArrayList ends = endLists.get(end); + LatticeNodeImpl result = null; + for (LatticeNodeImpl node: ends) { + if (node.begin == begin) { + if (result == null || result.totalCost >= node.cost) { + result = node; + } + } + } + return result; } @Override @@ -143,7 +151,7 @@ void connectNode(LatticeNodeImpl rNode) { // noinspection ForLoopReplaceableByForEach for (int i = 0; i < endNodes.size(); ++i) { LatticeNodeImpl lNode = endNodes.get(i); - if (!lNode.isConnectedToBOS) { + if (!lNode.isConnectedToBOS()) { continue; } @@ -157,7 +165,6 @@ void connectNode(LatticeNodeImpl rNode) { bestPrevNode = lNode; } } - rNode.isConnectedToBOS = (bestPrevNode != null); rNode.totalCost = minLeftCost + rNode.cost; rNode.bestPreviousNode = bestPrevNode; } @@ -166,11 +173,11 @@ void connectEosNode() { connectNode(eosNode); } - List getBestPath() { - if (!eosNode.isConnectedToBOS) { // EOS node + List getBestPath() { + if (!eosNode.isConnectedToBOS()) { 
// EOS node throw new IllegalStateException("EOS isn't connected to BOS"); } - ArrayList result = new ArrayList<>(); + ArrayList result = new ArrayList<>(); for (LatticeNodeImpl node = eosNode.bestPreviousNode; node != endLists.get(0) .get(0); node = node.bestPreviousNode) { result.add(node); @@ -180,7 +187,7 @@ List getBestPath() { } String getSurface(LatticeNodeImpl node) { - return (node.isDefined) ? node.getWordInfo().getSurface() : "(null)"; + return (node.isDefined) ? node.getBaseSurface() : "(null)"; } String getPos(LatticeNodeImpl node) { diff --git a/src/main/java/com/worksap/nlp/sudachi/LatticeNodeImpl.java b/src/main/java/com/worksap/nlp/sudachi/LatticeNodeImpl.java index 96b56109..c06b40ff 100644 --- a/src/main/java/com/worksap/nlp/sudachi/LatticeNodeImpl.java +++ b/src/main/java/com/worksap/nlp/sudachi/LatticeNodeImpl.java @@ -18,11 +18,12 @@ import com.worksap.nlp.sudachi.dictionary.Lexicon; import com.worksap.nlp.sudachi.dictionary.WordInfo; +import com.worksap.nlp.sudachi.dictionary.WordParameters; import java.util.List; +import java.util.Objects; public class LatticeNodeImpl implements LatticeNode { - int begin; int end; @@ -34,24 +35,22 @@ public class LatticeNodeImpl implements LatticeNode { int totalCost; LatticeNodeImpl bestPreviousNode; - boolean isConnectedToBOS; boolean isDefined; boolean isOOV; WordInfo extraWordInfo; - Lexicon lexicon; + // this is either Lexicon or StringsCache object + Object lexicon; - static final String NULL_SURFACE = "(null)"; private static final short ZERO = (short) 0; - static final WordInfo UNDEFINED_WORDINFO = new WordInfo(NULL_SURFACE, ZERO, (short) -1, NULL_SURFACE, NULL_SURFACE, - NULL_SURFACE); + static final WordInfo UNDEFINED_WORDINFO = new WordInfo(ZERO, ZERO); - LatticeNodeImpl(Lexicon lexicon, short leftId, short rightId, short cost, int wordId) { + LatticeNodeImpl(Lexicon lexicon, long params, int wordId) { this.lexicon = lexicon; - this.leftId = leftId; - this.rightId = rightId; - this.cost = cost; + 
this.leftId = WordParameters.leftId(params); + this.rightId = WordParameters.rightId(params); + this.cost = WordParameters.cost(params); this.wordId = wordId; this.isDefined = true; } @@ -67,6 +66,16 @@ public void setParameter(short leftId, short rightId, short cost) { this.cost = cost; } + private Lexicon lexicon() { + if (lexicon instanceof Lexicon) { + return (Lexicon) lexicon; + } else if (lexicon instanceof StringsCache) { + return ((StringsCache)lexicon).lexicon; + } else { + throw new IllegalStateException("lexicon was null probably"); + } + } + @Override public int getBegin() { return begin; @@ -85,7 +94,7 @@ public void setRange(int begin, int end) { @Override public boolean isOOV() { - return isOOV; + return WordId.isOov(wordId); } @Override @@ -101,7 +110,9 @@ public WordInfo getWordInfo() { if (extraWordInfo != null) { return extraWordInfo; } - return lexicon.getWordInfo(wordId); + WordInfo info = lexicon().getWordInfo(wordId); + extraWordInfo = info; + return info; } @Override @@ -128,17 +139,49 @@ public int getDictionaryId() { return WordId.dic(wordId); } + public boolean isConnectedToBOS() { + return bestPreviousNode != null; + } + + public String getBaseSurface() { + return strings().getSurface(this); + } + + public String getReading() { + return strings().getReading(this); + } + + public String getNormalizedForm() { + return strings().getNormalizedForm(this); + } + + public String getDictionaryForm() { + return strings().getDictionaryForm(this); + } + @Override public String toString() { - WordInfo wi = getWordInfo(); - String surface = wi.getSurface(); - short pos = wi.getPOSId(); + String surface = getBaseSurface(); + short pos = getWordInfo().getPOSId(); return String.format("%d %d %s(%d) %d %d %d %d", getBegin(), getEnd(), surface, wordId, pos, leftId, rightId, cost); } - /* internal */ void appendSplitsTo(List result, Tokenizer.SplitMode mode) { + private StringsCache strings() { + Object l = lexicon; + if (l instanceof Lexicon) { + 
StringsCache c = new StringsCache((Lexicon) l); + lexicon = c; + return c; + } else if (l instanceof StringsCache) { + return (StringsCache) l; + } else { + throw new IllegalStateException("lexicon is not valid, was " + l); + } + } + + /* internal */ void appendSplitsTo(List result, Tokenizer.SplitMode mode) { if (mode == Tokenizer.SplitMode.A) { appendSplitsTo(result, getWordInfo().getAunitSplit()); } else if (mode == Tokenizer.SplitMode.B) { @@ -148,7 +191,7 @@ public String toString() { } } - private void appendSplitsTo(List result, int[] splitsId) { + private void appendSplitsTo(List result, int[] splitsId) { if (splitsId.length == 0) { result.add(this); return; @@ -157,7 +200,7 @@ private void appendSplitsTo(List result, int[] splitsId) { if (wid == getWordId()) { result.add(this); } else { - LatticeNodeImpl node = new LatticeNodeImpl(lexicon, ZERO, ZERO, ZERO, wid); + LatticeNodeImpl node = new LatticeNodeImpl(lexicon(), 0L, wid); node.begin = begin; node.end = end; node.totalCost = totalCost; @@ -167,12 +210,152 @@ private void appendSplitsTo(List result, int[] splitsId) { } int offset = getBegin(); + Lexicon lex = lexicon(); for (int wid : splitsId) { - LatticeNodeImpl n = new LatticeNodeImpl(lexicon, ZERO, ZERO, ZERO, wid); + LatticeNodeImpl n = new LatticeNodeImpl(lex, 0L, wid); n.begin = offset; offset += n.getWordInfo().getLength(); n.end = offset; result.add(n); } } + + private static final class StringsCache { + private final Lexicon lexicon; + private String surface; + private String reading; + private String dictionaryForm; + private String normalizedForm; + + public StringsCache(Lexicon lexicon) { + this.lexicon = lexicon; + } + + public String getSurface(LatticeNodeImpl node) { + // benign data race pattern + // https://shipilev.net/blog/2016/close-encounters-of-jmm-kind/#wishful-benign-is-resilient + String s = surface; + if (s == null) { + WordInfo wi = node.getWordInfo(); + int surfacePtr = wi.getSurface(); + int dic = 
WordId.dic(node.getWordId()); + s = lexicon.string(dic, surfacePtr); + surface = s; + } + return s; + } + + public String getReading(LatticeNodeImpl node) { + String s = reading; + if (s == null) { + WordInfo wi = node.getWordInfo(); + int readingPtr = wi.getReadingForm(); + int dic = WordId.dic(node.getWordId()); + s = lexicon.string(dic, readingPtr); + reading = s; + } + return s; + } + + public String getDictionaryForm(LatticeNodeImpl node) { + String s = dictionaryForm; + if (s == null) { + WordInfo wi = node.getWordInfo(); + int dicEntryPtr = wi.getDictionaryForm(); + int dic = WordId.blendDic(dicEntryPtr, WordId.dic(node.wordId)); + int surface = lexicon.wordInfos(dic).surfacePtr(dicEntryPtr); + s = lexicon.string(dic, surface); + dictionaryForm = s; + } + return s; + } + + public String getNormalizedForm(LatticeNodeImpl node) { + String s = normalizedForm; + if (s == null) { + WordInfo wi = node.getWordInfo(); + int dicEntryPtr = wi.getNormalizedForm(); + int dic = WordId.blendDic(dicEntryPtr, WordId.dic(node.wordId)); + int surface = lexicon.wordInfos(dic).surfacePtr(dicEntryPtr); + s = lexicon.string(dic, surface); + normalizedForm = s; + } + return s; + } + } + + public static OOVFactory oovFactory(short leftId, short rightId, short cost, short posId) { + return new OOVFactory(leftId, rightId, cost, posId); + } + + public static LatticeNodeImpl makeOov(int begin, int end, short posId, String surface, String normalizedForm, + String dictionaryForm, String readingForm) { + StringsCache c = new StringsCache(null); + c.surface = surface; + c.normalizedForm = normalizedForm; + c.reading = readingForm; + c.dictionaryForm = dictionaryForm; + WordInfo wi = new WordInfo(Short.MIN_VALUE, posId); + LatticeNodeImpl node = new LatticeNodeImpl(); + node.extraWordInfo = wi; + node.lexicon = c; + node.begin = begin; + node.end = end; + return node; + } + + public static final class OOVFactory { + private final short leftId; + private final short rightId; + private final 
short cost; + private final short posId; + private final WordInfo wordInfo; + + private OOVFactory(short leftId, short rightId, short cost, short posId) { + this.rightId = rightId; + this.cost = cost; + this.leftId = leftId; + this.posId = posId; + this.wordInfo = new WordInfo(ZERO, posId); + } + + public LatticeNodeImpl make(int start, int end, InputText input) { + String s = input.getSubstring(start, end); + return make(start, end, s); + } + + public LatticeNodeImpl make(int start, int end, String text) { + LatticeNodeImpl i = new LatticeNodeImpl(); + i.begin = start; + i.end = end; + i.leftId = leftId; + i.rightId = rightId; + i.cost = cost; + i.wordId = WordId.oovWid(posId); + i.extraWordInfo = wordInfo; + StringsCache sc = new StringsCache(null); + sc.surface = text; + sc.reading = text; + sc.dictionaryForm = text; + sc.normalizedForm = text; + i.lexicon = sc; + return i; + } + + @Override + public boolean equals(Object o) { + if (this == o) + return true; + if (o == null || getClass() != o.getClass()) + return false; + OOVFactory that = (OOVFactory) o; + return leftId == that.leftId && rightId == that.rightId && cost == that.cost && posId == that.posId && Objects.equals( + wordInfo, that.wordInfo); + } + + @Override + public int hashCode() { + return Objects.hash(leftId, rightId, cost, posId, wordInfo); + } + } } diff --git a/src/main/java/com/worksap/nlp/sudachi/MeCabOovProviderPlugin.java b/src/main/java/com/worksap/nlp/sudachi/MeCabOovProviderPlugin.java index bfe23fcf..7b8dad6a 100644 --- a/src/main/java/com/worksap/nlp/sudachi/MeCabOovProviderPlugin.java +++ b/src/main/java/com/worksap/nlp/sudachi/MeCabOovProviderPlugin.java @@ -48,7 +48,7 @@ * * {@code charDef} is the file path of the definition of OOV insertion behavior. * {@code unkDef} is the file path of the definition of OOV informations. - * + *

* These files are compatible with MeCab. But the definitions of character * categories in {@code charDef} are ignored and this plugin uses the ones * {@code characterDefinitionFile} in the settings. @@ -62,15 +62,8 @@ static class CategoryInfo { int length; } - static class OOV { - short leftId; - short rightId; - short cost; - short posId; - } - Map categories = new EnumMap<>(CategoryType.class); - Map> oovList = new EnumMap<>(CategoryType.class); + Map> oovList = new EnumMap<>(CategoryType.class); @Override public void setUp(Grammar grammar) throws IOException { @@ -91,14 +84,13 @@ public int provideOOV(InputText inputText, int offset, long otherWords, List oovs = oovList.get(cinfo.type); + List oovs = oovList.get(cinfo.type); if (oovs == null) { continue; } if (cinfo.isGroup && (cinfo.isInvoke || otherWords == 0)) { - String s = inputText.getSubstring(offset, offset + length); - for (OOV oov : oovs) { - nodes.add(getOOVNode(s, oov, length)); + for (LatticeNodeImpl.OOVFactory oov : oovs) { + nodes.add(oov.make(offset, offset + length, inputText)); added += 1; } llength -= 1; @@ -109,9 +101,8 @@ public int provideOOV(InputText inputText, int offset, long otherWords, List llength) { break; } - String s = inputText.getSubstring(offset, offset + sublength); - for (OOV oov : oovs) { - nodes.add(getOOVNode(s, oov, sublength)); + for (LatticeNodeImpl.OOVFactory oov : oovs) { + nodes.add(oov.make(offset, offset + sublength, inputText)); added += 1; } } @@ -121,14 +112,6 @@ public int provideOOV(InputText inputText, int offset, long otherWords, List void readOOV(Config.Resource unkDef, Grammar grammar, String userPosMode) throw new IllegalArgumentException(cols[0] + " is undefined at line " + reader.getLineNumber()); } - OOV oov = new OOV(); - oov.leftId = Short.parseShort(cols[1]); - oov.rightId = Short.parseShort(cols[2]); - oov.cost = Short.parseShort(cols[3]); + short leftId = Short.parseShort(cols[1]); + short rightId = Short.parseShort(cols[2]); + short cost = 
Short.parseShort(cols[3]); POS pos = new POS(cols[4], cols[5], cols[6], cols[7], cols[8], cols[9]); - oov.posId = posIdOf(grammar, pos, userPosMode); - + short posId = posIdOf(grammar, pos, userPosMode); + LatticeNodeImpl.OOVFactory oov = LatticeNodeImpl.oovFactory(leftId, rightId, cost, posId); oovList.computeIfAbsent(type, t -> new ArrayList<>()).add(oov); } } diff --git a/src/main/java/com/worksap/nlp/sudachi/Morpheme.java b/src/main/java/com/worksap/nlp/sudachi/Morpheme.java index 696ee802..e9ac0d3c 100644 --- a/src/main/java/com/worksap/nlp/sudachi/Morpheme.java +++ b/src/main/java/com/worksap/nlp/sudachi/Morpheme.java @@ -16,6 +16,8 @@ package com.worksap.nlp.sudachi; +import com.worksap.nlp.sudachi.dictionary.POS; + import java.util.List; /** @@ -55,7 +57,7 @@ public interface Morpheme { * * @return the part of speech of the morpheme */ - public List partOfSpeech(); + public POS partOfSpeech(); /** * Returns the ID of part of speech of the morpheme. diff --git a/src/main/java/com/worksap/nlp/sudachi/MorphemeImpl.java b/src/main/java/com/worksap/nlp/sudachi/MorphemeImpl.java index 63a65247..fff1b40f 100644 --- a/src/main/java/com/worksap/nlp/sudachi/MorphemeImpl.java +++ b/src/main/java/com/worksap/nlp/sudachi/MorphemeImpl.java @@ -18,15 +18,15 @@ import java.util.List; +import com.worksap.nlp.sudachi.dictionary.POS; import com.worksap.nlp.sudachi.dictionary.WordInfo; class MorphemeImpl implements Morpheme { + private final MorphemeList list; + private final int index; + private LatticeNodeImpl node; - final MorphemeList list; - final int index; - WordInfo wordInfo; - - MorphemeImpl(MorphemeList list, int index) { + /*internal*/ MorphemeImpl(MorphemeList list, int index) { this.list = list; this.index = index; } @@ -47,7 +47,7 @@ public String surface() { } @Override - public List partOfSpeech() { + public POS partOfSpeech() { WordInfo wi = getWordInfo(); return list.grammar.getPartOfSpeechString(wi.getPOSId()); } @@ -60,20 +60,17 @@ public short 
partOfSpeechId() { @Override public String dictionaryForm() { - WordInfo wi = getWordInfo(); - return wi.getDictionaryForm(); + return node().getDictionaryForm(); } @Override public String normalizedForm() { - WordInfo wi = getWordInfo(); - return wi.getNormalizedForm(); + return node().getNormalizedForm(); } @Override public String readingForm() { - WordInfo wi = getWordInfo(); - return wi.getReadingForm(); + return node().getReading(); } @Override @@ -83,30 +80,36 @@ public List split(Tokenizer.SplitMode mode) { @Override public boolean isOOV() { - return list.isOOV(index); + return node().isOOV(); } @Override public int getWordId() { - return list.getWordId(index); + return node().getWordId(); } @Override public int getDictionaryId() { - return list.getDictionaryId(index); + return node().getDictionaryId(); } @Override public int[] getSynonymGroupIds() { WordInfo wi = getWordInfo(); - return wi.getSynonymGoupIds(); + return wi.getSynonymGroupIds(); } - WordInfo getWordInfo() { - if (wordInfo == null) { - wordInfo = list.getWordInfo(index); + private LatticeNodeImpl node() { + LatticeNodeImpl n = node; + if (n == null) { + n = list.node(index); + node = n; } - return wordInfo; + return n; + } + + WordInfo getWordInfo() { + return node().getWordInfo(); } @Override @@ -121,4 +124,8 @@ public String toString() { sb.append(")}"); return sb.toString(); } + + /*internal*/ boolean isCompatible(JapaneseDictionary dictionary) { + return dictionary.grammar == this.list.grammar; + } } diff --git a/src/main/java/com/worksap/nlp/sudachi/MorphemeList.java b/src/main/java/com/worksap/nlp/sudachi/MorphemeList.java index 1422040e..8fdd815c 100644 --- a/src/main/java/com/worksap/nlp/sudachi/MorphemeList.java +++ b/src/main/java/com/worksap/nlp/sudachi/MorphemeList.java @@ -29,7 +29,7 @@ public class MorphemeList extends AbstractList { final InputText inputText; final Grammar grammar; final Lexicon lexicon; - final List path; + final List path; final boolean allowEmptyMorpheme; 
final Tokenizer.SplitMode mode; @@ -37,7 +37,7 @@ public class MorphemeList extends AbstractList { public final static MorphemeList EMPTY = new MorphemeList(null, null, null, Collections.emptyList(), true, Tokenizer.SplitMode.C); - MorphemeList(InputText input, Grammar grammar, Lexicon lexicon, List path, boolean allowEmptyMorpheme, + MorphemeList(InputText input, Grammar grammar, Lexicon lexicon, List path, boolean allowEmptyMorpheme, Tokenizer.SplitMode mode) { this.inputText = input; this.grammar = grammar; @@ -90,8 +90,8 @@ WordInfo getWordInfo(int index) { } List split(Tokenizer.SplitMode mode, int index) { - List nodes = new ArrayList<>(); - LatticeNodeImpl node = (LatticeNodeImpl) path.get(index); + List nodes = new ArrayList<>(); + LatticeNodeImpl node = path.get(index); node.appendSplitsTo(nodes, mode); return new MorphemeList(inputText, grammar, lexicon, nodes, allowEmptyMorpheme, mode); } @@ -110,11 +110,10 @@ public MorphemeList split(Tokenizer.SplitMode mode) { return this; } - List nodes = new ArrayList<>(); + List nodes = new ArrayList<>(); - for (LatticeNode node : path) { - LatticeNodeImpl nodeImpl = (LatticeNodeImpl) node; - nodeImpl.appendSplitsTo(nodes, mode); + for (LatticeNodeImpl node : path) { + node.appendSplitsTo(nodes, mode); } return new MorphemeList(inputText, grammar, lexicon, nodes, allowEmptyMorpheme, mode); @@ -133,6 +132,11 @@ int getDictionaryId(int index) { } public int getInternalCost() { - return path.get(path.size() - 1).getPathCost() - path.get(0).getPathCost(); + List p = path; + return p.get(p.size() - 1).getPathCost() - p.get(0).getPathCost(); + } + + /* internal*/ LatticeNodeImpl node(int index) { + return path.get(index); } } diff --git a/src/main/java/com/worksap/nlp/sudachi/OovProviderPlugin.java b/src/main/java/com/worksap/nlp/sudachi/OovProviderPlugin.java index f4aa460c..35320078 100644 --- a/src/main/java/com/worksap/nlp/sudachi/OovProviderPlugin.java +++ 
b/src/main/java/com/worksap/nlp/sudachi/OovProviderPlugin.java @@ -62,8 +62,6 @@ public void setUp(Grammar grammar) throws IOException { /** * Provides the nodes of OOV morphemes. * - * To create the new node you can use {@link #createNode}. - * * @param inputText * the input text * @param offset @@ -93,14 +91,13 @@ int getOOV(UTF8InputText inputText, int offset, long otherWords, List path, Lattice lattice); + public abstract void rewrite(InputText text, List path, Lattice lattice); /** * Concatenate the sequence of nodes in the path. The sequence begins at the @@ -97,7 +96,7 @@ public void setUp(Grammar grammar) throws IOException { * length of the sequence, or {@code begin} equals or is greater * than {@code end} */ - public LatticeNode concatenate(List path, int begin, int end, Lattice lattice, String normalizedForm) { + public LatticeNode concatenate(List path, int begin, int end, Lattice lattice, String normalizedForm) { if (begin >= end) { throw new IndexOutOfBoundsException("begin >= end"); } @@ -105,27 +104,28 @@ public LatticeNode concatenate(List path, int begin, int end, Latti int e = path.get(end - 1).getEnd(); short posId = path.get(begin).getWordInfo().getPOSId(); StringBuilder surface = new StringBuilder(); - int length = 0; StringBuilder normalizedFormBuilder = new StringBuilder(); StringBuilder dictionaryForm = new StringBuilder(); StringBuilder readingForm = new StringBuilder(); for (int i = begin; i < end; i++) { WordInfo info = path.get(i).getWordInfo(); surface.append(info.getSurface()); - length += info.getLength(); if (normalizedForm == null) { normalizedFormBuilder.append(info.getNormalizedForm()); } dictionaryForm.append(info.getDictionaryForm()); readingForm.append(info.getReadingForm()); } - WordInfo wi = new WordInfo(surface.toString(), (short) length, posId, - (normalizedForm == null) ? 
normalizedFormBuilder.toString() : normalizedForm, dictionaryForm.toString(), - readingForm.toString()); - LatticeNode node = lattice.createNode(); - node.setRange(b, e); - node.setWordInfo(wi); + String s = surface.toString(); + LatticeNodeImpl node = LatticeNodeImpl.makeOov( + b, e, + posId, + s, + (normalizedForm == null) ? normalizedFormBuilder.toString() : normalizedForm, + dictionaryForm.toString(), + readingForm.toString() + ); replaceNode(path, begin, end, node); return node; } @@ -143,8 +143,8 @@ public LatticeNode concatenate(List path, int begin, int end, Latti * the beginning index * @param end * the ending index - * @param posId - * the POS ID of the concatenated node + * @param factory + * factory for creating an OOV lattice node * @param lattice * the lattice * @return the concatenated OOV node @@ -153,34 +153,27 @@ public LatticeNode concatenate(List path, int begin, int end, Latti * length of the sequence, or {@code begin} equals or is greater * than {@code end} */ - public LatticeNode concatenateOov(List path, int begin, int end, short posId, Lattice lattice) { + public LatticeNode concatenateOov(List path, int begin, int end, LatticeNodeImpl.OOVFactory factory, Lattice lattice) { if (begin >= end) { throw new IndexOutOfBoundsException("begin >= end"); } int b = path.get(begin).getBegin(); int e = path.get(end - 1).getEnd(); - Optional n = lattice.getMinimumNode(b, e); - if (n.isPresent()) { - LatticeNode node = n.get(); + LatticeNodeImpl node = lattice.getMinimumNode(b, e); + if (node != null) { replaceNode(path, begin, end, node); return node; } StringBuilder surface = new StringBuilder(); - int length = 0; for (int i = begin; i < end; i++) { - WordInfo info = path.get(i).getWordInfo(); - surface.append(info.getSurface()); - length += info.getLength(); + String s = path.get(i).getBaseSurface(); + surface.append(s); } - String s = surface.toString(); - WordInfo wi = new WordInfo(s, (short) length, posId, s, s, ""); - LatticeNode node = 
lattice.createNode(); - node.setRange(b, e); - node.setWordInfo(wi); - node.setOOV(); + String s = surface.toString(); + node = factory.make(b, e, s); replaceNode(path, begin, end, node); return node; } @@ -198,7 +191,7 @@ public Set getCharCategoryTypes(InputText text, LatticeNode node) return text.getCharCategoryTypes(node.getBegin(), node.getEnd()); } - private void replaceNode(List path, int begin, int end, LatticeNode node) { + private void replaceNode(List path, int begin, int end, LatticeNodeImpl node) { path.subList(begin, end).clear(); path.add(begin, node); } diff --git a/src/main/java/com/worksap/nlp/sudachi/PosMatcher.java b/src/main/java/com/worksap/nlp/sudachi/PosMatcher.java index f174dd4b..a6a18202 100644 --- a/src/main/java/com/worksap/nlp/sudachi/PosMatcher.java +++ b/src/main/java/com/worksap/nlp/sudachi/PosMatcher.java @@ -121,7 +121,7 @@ private void checkCompatibility(PosMatcher other) { */ @Override public boolean test(Morpheme morpheme) { - assert ((MorphemeImpl) morpheme).list.grammar == dictionary.grammar; + assert ((MorphemeImpl) morpheme).isCompatible(dictionary); return matching.get(morpheme.partOfSpeechId()); } diff --git a/src/main/java/com/worksap/nlp/sudachi/RegexOovProvider.java b/src/main/java/com/worksap/nlp/sudachi/RegexOovProvider.java index 6b6106e7..df654a0e 100644 --- a/src/main/java/com/worksap/nlp/sudachi/RegexOovProvider.java +++ b/src/main/java/com/worksap/nlp/sudachi/RegexOovProvider.java @@ -58,12 +58,9 @@ */ public class RegexOovProvider extends OovProviderPlugin { private Pattern pattern; - private short posId = -1; - private short cost = Short.MIN_VALUE; - private short leftId = Short.MIN_VALUE; - private short rightId = Short.MIN_VALUE; private int maxLength = 32; private boolean strictBoundaries = true; + private LatticeNodeImpl.OOVFactory factory; @Override public void setUp(Grammar grammar) throws IOException { @@ -74,16 +71,17 @@ public void setUp(Grammar grammar) throws IOException { } POS stringPos = new 
POS(oovPOS); String userPosMode = settings.getString(USER_POS, USER_POS_FORBID); - posId = posIdOf(grammar, stringPos, userPosMode); + short posId = posIdOf(grammar, stringPos, userPosMode); if (posId == -1) { throw new IllegalArgumentException("POS " + stringPos + " was not present in the dictionary"); } - cost = checkedShort(settings, "cost"); - leftId = checkedShort(settings, "leftId"); - rightId = checkedShort(settings, "rightId"); + short cost = checkedShort(settings, "cost"); + short leftId = checkedShort(settings, "leftId"); + short rightId = checkedShort(settings, "rightId"); pattern = checkPattern(settings.getString("regex")); maxLength = settings.getInt("maxLength", 32); strictBoundaries = isStrictContinuity(settings); + factory = LatticeNodeImpl.oovFactory(leftId, rightId, cost, posId); } @Override @@ -124,11 +122,7 @@ public int provideOOV(InputText inputText, int offset, long otherWords, List nodes) { if (otherWords == 0) { - LatticeNodeImpl node = createNode(); - node.setParameter(leftId, rightId, cost); int length = inputText.getWordCandidateLength(offset); - String s = inputText.getSubstring(offset, offset + length); - WordInfo info = new WordInfo(s, (short) length, oovPOSId, s, s, ""); - node.setWordInfo(info); + LatticeNodeImpl node = factory.make(offset, offset + length, inputText); nodes.add(node); return 1; } else { diff --git a/src/main/java/com/worksap/nlp/sudachi/WordId.java b/src/main/java/com/worksap/nlp/sudachi/WordId.java index 8cc8ffc2..e2d41543 100644 --- a/src/main/java/com/worksap/nlp/sudachi/WordId.java +++ b/src/main/java/com/worksap/nlp/sudachi/WordId.java @@ -20,6 +20,9 @@ public class WordId { private WordId() { } + public static final int ID_BOS = 0xffff_fff0; + public static final int ID_EOS = 0xffff_fff1; + /** * Internal word ids can't be larger than this number */ @@ -77,6 +80,11 @@ public static int word(int wordId) { return wordId & MAX_WORD_ID; } + public static int blendDic(int rawWordId, int actualDicId) { + int flag = 
dic(rawWordId); + return flag * actualDicId; + } + public static int dicIdMask(int dicId) { return dicId << 28; } @@ -84,4 +92,17 @@ public static int dicIdMask(int dicId) { public static int applyMask(int wordId, int dicIdMask) { return (wordId & MAX_WORD_ID) | dicIdMask; } + + public static boolean isOov(int wordId) { + // low 16 bits are OOV POS, top 4 are 1s + return (wordId & 0xffff_0000) == 0xf000_0000; + } + public static boolean isSpecial(int wordId) { + // top 5 bits should be filled + return (wordId & 0xf800_0000) == 0xf800_0000; + } + + public static int oovWid(short posId) { + return 0xf000_0000 | posId; + } } diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/BinaryDictionary.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/BinaryDictionary.java index 0f964816..bbd88ce1 100644 --- a/src/main/java/com/worksap/nlp/sudachi/dictionary/BinaryDictionary.java +++ b/src/main/java/com/worksap/nlp/sudachi/dictionary/BinaryDictionary.java @@ -19,6 +19,8 @@ import java.io.Closeable; import java.io.IOException; import java.nio.ByteBuffer; +import java.nio.file.Path; +import java.nio.file.Paths; import com.worksap.nlp.sudachi.Config; import com.worksap.nlp.sudachi.MMap; @@ -26,33 +28,24 @@ public class BinaryDictionary implements Closeable, DictionaryAccess { private final ByteBuffer bytes; - private final DictionaryHeader header; + private final Description header; private final GrammarImpl grammar; private final DoubleArrayLexicon lexicon; public BinaryDictionary(String fileName) throws IOException { - this(MMap.map(fileName)); + this(Paths.get(fileName)); + } + + public BinaryDictionary(Path filename) throws IOException { + this(MMap.map(filename)); } public BinaryDictionary(ByteBuffer dictionary) throws IOException { - int offset = 0; bytes = dictionary; - header = new DictionaryHeader(bytes, offset); - offset += header.storageSize(); - - long version = header.getVersion(); - if (DictionaryVersion.hasGrammar(version)) { - grammar = new 
GrammarImpl(bytes, offset); - offset += grammar.storageSize(); - } else if (header.isUserDictionary()) { - grammar = new GrammarImpl(); - } else { - MMap.unmap(bytes); - throw new IOException("invalid dictionary"); - } - - lexicon = new DoubleArrayLexicon(bytes, offset, DictionaryVersion.hasSynonymGroupIds(version)); + header = Description.load(dictionary); + grammar = GrammarImpl.load(bytes, header); + lexicon = DoubleArrayLexicon.load(bytes, header); } public static BinaryDictionary loadSystem(String fileName) throws IOException { @@ -94,7 +87,7 @@ public void close() throws IOException { MMap.unmap(bytes); } - public DictionaryHeader getDictionaryHeader() { + public Description getDictionaryHeader() { return header; } diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/BufReader.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/BufReader.java index dc1006d9..edabfb4e 100644 --- a/src/main/java/com/worksap/nlp/sudachi/dictionary/BufReader.java +++ b/src/main/java/com/worksap/nlp/sudachi/dictionary/BufReader.java @@ -77,7 +77,7 @@ private static long readVarLongSlowpath(long v0, ByteBuffer b) { } v0 |= (v7 & 0x7f) << 49; long v8 = b.get() & 0xff; - if (v8 < 0x80) { // only 6 bits are valid here, rest must be 0 + if (v8 < 0x80) { return (v8 << 56) | v0; } v0 |= (v8 & 0x7f) << 56; @@ -114,4 +114,8 @@ public String readUtf8String() { return new String(repr, StandardCharsets.UTF_8); } } + + public int remaining() { + return buffer.remaining(); + } } diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/CompactedStrings.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/CompactedStrings.java new file mode 100644 index 00000000..e7cd70ce --- /dev/null +++ b/src/main/java/com/worksap/nlp/sudachi/dictionary/CompactedStrings.java @@ -0,0 +1,23 @@ +package com.worksap.nlp.sudachi.dictionary; + +import java.nio.CharBuffer; + +public class CompactedStrings { + private final CharBuffer chars; + + public CompactedStrings(CharBuffer chars) { + 
this.chars = chars; + } + + public CharSequence sequence(int pointer) { + CharBuffer dup = chars.duplicate(); + StringPtr ptr = StringPtr.decode(pointer); + dup.position(ptr.getOffset()); + dup.limit(ptr.getOffset() + ptr.getLength()); + return dup; + } + + public String string(int pointer) { + return sequence(pointer).toString(); + } +} diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/Connection.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/Connection.java index 5a56542f..277db838 100644 --- a/src/main/java/com/worksap/nlp/sudachi/dictionary/Connection.java +++ b/src/main/java/com/worksap/nlp/sudachi/dictionary/Connection.java @@ -16,6 +16,8 @@ package com.worksap.nlp.sudachi.dictionary; +import java.nio.ByteBuffer; +import java.nio.ByteOrder; import java.nio.ShortBuffer; /** @@ -83,4 +85,15 @@ public void validate(int leftId) { throw new IllegalArgumentException(String.format("leftId < leftSize: (%d, %d)", leftId, leftSize)); } } + + public static Connection fromByteBufferV1(ByteBuffer raw) { + short numLeft = raw.getShort(); + short numRight = raw.getShort(); + ByteBuffer dup = raw.duplicate(); + dup.order(ByteOrder.LITTLE_ENDIAN); + dup.position(raw.position()); + dup.limit(raw.position() + numLeft * numRight * 2); + ShortBuffer data = dup.asShortBuffer(); + return new Connection(data, numLeft, numRight); + } } diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/Description.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/Description.java index b0e07c95..100abbc2 100644 --- a/src/main/java/com/worksap/nlp/sudachi/dictionary/Description.java +++ b/src/main/java/com/worksap/nlp/sudachi/dictionary/Description.java @@ -24,27 +24,102 @@ import java.nio.channels.SeekableByteChannel; import java.nio.charset.StandardCharsets; import java.time.Instant; +import java.time.LocalDateTime; +import java.time.ZoneId; import java.time.format.DateTimeFormatter; import java.util.ArrayList; import java.util.List; import java.util.Locale; 
import java.util.Random; +/** + * Description of the dictionary blocks, in-memory representation. + * Basically, an extended version of the dictionary header. + */ public class Description { private Instant creationTime = Instant.now(); + private String comment = ""; + private String signature = defaultSignature(creationTime); + private String reference = ""; + private List blocks = new ArrayList<>(); + private long flags; + private int numTotalEntries; + private int numIndexedEntries; - private String comment; + /** + * Return a slice of the full dictionary with the provided name + * @param full ByteBuffer which represents the whole dictionary loaded into memory + * @param part name of the required part + * @return slice of the ByteBuffer + * @throws IllegalArgumentException if the part with the provided name was not found + */ + public ByteBuffer slice(ByteBuffer full, String part) { + ByteBuffer slice = sliceOrNull(full, part); + if (slice == null) { + throw new IllegalArgumentException("Dictionary did not contain part with name=" + part); + } + return slice; + } - private String signature = defaultSignature(creationTime); + /** + * Return a slice of the full dictionary with the provided name + * @param full ByteBuffer which represents the whole dictionary loaded into memory + * @param part name of the required part + * @return slice of the ByteBuffer or null if not found + */ + public ByteBuffer sliceOrNull(ByteBuffer full, String part) { + for (Block b: blocks) { + if (b.name.equals(part)) { + int start = (int)b.start; + int end = (int)(b.start + b.size); + int position = full.position(); + int limit = full.limit(); + full.position(start); + full.limit(end); + ByteBuffer slice = full.slice(); + full.position(position); + full.limit(limit); + slice.order(ByteOrder.LITTLE_ENDIAN); + return slice; + } + } + return null; + } - private String reference; + public boolean isSystemDictionary() { + return reference.isEmpty(); + } - private List blocks = new 
ArrayList<>(); + public boolean isUserDictionary() { + return !reference.isEmpty(); + } + + public long getNumTotalEntries() { + return numTotalEntries; + } public static class Block { - private String name; - private long start; - private long size; + private final String name; + private final long start; + private final long size; + + public Block(String name, long start, long size) { + this.name = name; + this.start = start; + this.size = size; + } + + public String getName() { + return name; + } + + public long getStart() { + return start; + } + + public long getSize() { + return size; + } } public static Description load(SeekableByteChannel channel) throws IOException { @@ -72,15 +147,15 @@ private static Description loadV1(ByteBuffer raw) { Description desc = new Description(); BufReader reader = new BufReader(raw); desc.creationTime = Instant.ofEpochSecond(reader.readLong()); + desc.flags = reader.readLong(); desc.comment = reader.readUtf8String(); desc.signature = reader.readUtf8String(); desc.reference = reader.readUtf8String(); + desc.numIndexedEntries = reader.readVarint32(); + desc.numTotalEntries = reader.readVarint32(); int length = reader.readVarint32(); for (int i = 0; i < length; ++i) { - Block b = new Block(); - b.name = reader.readUtf8String(); - b.start = reader.readVarint64(); - b.size = reader.readVarint64(); + Block b = new Block(reader.readUtf8String(), reader.readVarint64(), reader.readVarint64()); desc.blocks.add(b); } @@ -89,13 +164,17 @@ private static Description loadV1(ByteBuffer raw) { public void save(SeekableByteChannel channel) throws IOException { ByteBuffer buff = ByteBuffer.allocate(4096); + buff.order(ByteOrder.LITTLE_ENDIAN); buff.put(MAGIC_BYTES); BufWriter writer = new BufWriter(buff); writer.putLong(1); // version - writer.putVarint64(creationTime.getEpochSecond()); + writer.putLong(creationTime.getEpochSecond()); + writer.putLong(flags); writer.putStringUtf8(comment); writer.putStringUtf8(signature); 
writer.putStringUtf8(reference); + writer.putVarint32(numIndexedEntries); + writer.putVarint32(numTotalEntries); int length = blocks.size(); writer.putVarint32(length); for (Block b : blocks) { @@ -106,7 +185,7 @@ public void save(SeekableByteChannel channel) throws IOException { long pos = channel.position(); channel.position(0); - buff.reset(); + buff.flip(); channel.write(buff); channel.position(pos); } @@ -135,18 +214,21 @@ private static void checkLegacyDictionaryFormat(ByteBuffer raw) { } private String defaultSignature(Instant date) { - DateTimeFormatter formatter = DateTimeFormatter.ofPattern("yyyyMMddHHmmss", Locale.ROOT); - return String.format("%s-%08x", formatter.format(date), new Random().nextLong()); + DateTimeFormatter formatter = DateTimeFormatter.ofPattern("yyyyMMddHHmmss", Locale.US); + return String.format("%s-%08x", formatter.format(LocalDateTime.ofInstant(date, ZoneId.systemDefault())), new Random().nextLong()); } public Instant getCreationTime() { return creationTime; } - public void setCreationTime(Instant creationTime) { + public void setCompilationTime(Instant creationTime) { this.creationTime = creationTime; } + @Deprecated + public String getDescription() { return getComment(); } + public String getComment() { return comment; } @@ -178,4 +260,18 @@ public List getBlocks() { public void setBlocks(List blocks) { this.blocks = blocks; } + + public void setRuntimeCosts(boolean val) { + long x = val ? 
1 : 0; + flags = (flags & ~0x1L) | x; + } + + public boolean isRuntimeCosts() { + return (flags & 0x1L) != 0; + } + + public void setNumberOfEntries(int indexed, int total) { + this.numIndexedEntries = indexed; + this.numTotalEntries = total; + } } diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/DictionaryBuilder.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/DictionaryBuilder.java index 551e3d47..fbfdda82 100644 --- a/src/main/java/com/worksap/nlp/sudachi/dictionary/DictionaryBuilder.java +++ b/src/main/java/com/worksap/nlp/sudachi/dictionary/DictionaryBuilder.java @@ -66,6 +66,7 @@ public static void main(String[] args) throws IOException { String description = ""; String outputPath = null; String matrixPath = null; + String signature = null; int i; for (i = 0; i < args.length; i++) { @@ -75,6 +76,8 @@ public static void main(String[] args) throws IOException { matrixPath = args[++i]; } else if (args[i].equals("-d") && i + 1 < args.length) { description = args[++i]; + } else if (args[i].equals("-s")) { + signature = args[++i]; } else if (args[i].equals("-h")) { printUsage(); return; @@ -90,9 +93,13 @@ public static void main(String[] args) throws IOException { List lexiconPaths = Arrays.asList(args).subList(i, args.length); - DicBuilder.System builder = DicBuilder.system().matrix(Paths.get(matrixPath)).description(description) + DicBuilder.System builder = DicBuilder.system().matrix(Paths.get(matrixPath)).comment(description) .progress(new Progress(20, new StderrProgress())); + if (signature != null) { + builder.signature(signature); + } + for (String lexiconPath : lexiconPaths) { builder = builder.lexicon(Paths.get(lexiconPath)); } diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/DictionaryPrinter.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/DictionaryPrinter.java index ae4871ca..172a8842 100644 --- a/src/main/java/com/worksap/nlp/sudachi/dictionary/DictionaryPrinter.java +++ 
b/src/main/java/com/worksap/nlp/sudachi/dictionary/DictionaryPrinter.java @@ -16,61 +16,179 @@ package com.worksap.nlp.sudachi.dictionary; +import com.worksap.nlp.sudachi.dictionary.build.RawLexiconReader.Column; + import java.io.IOException; import java.io.PrintStream; import java.util.ArrayList; import java.util.Arrays; +import java.util.Iterator; import java.util.List; import java.util.stream.Collectors; public class DictionaryPrinter { - private DictionaryPrinter() { + private final PrintStream output; + private final BinaryDictionary dic; + private final BinaryDictionary base; + + private final GrammarImpl grammar; + private final DoubleArrayLexicon lex; + private final Ints wordIds; + + private DictionaryPrinter(PrintStream output, BinaryDictionary dic, BinaryDictionary base) { + this.output = output; + this.dic = dic; + this.base = base; + + if (base != null) { + GrammarImpl grammar = base.getGrammar(); + grammar.addPosList(dic.getGrammar()); + this.grammar = grammar; + } else { + grammar = dic.getGrammar(); + } + + lex = dic.getLexicon(); + + // in order to output dictionary entries in in-dictionary order we need to sort them + // iterator over them will get them not in the sorted order, but grouped by surface (and sorted in groups) + Ints allIds = new Ints(lex.size()); + Iterator ids = lex.wordIds(); + while (ids.hasNext()) { + allIds.appendAll(ids.next()); + } + allIds.sort(); + wordIds = allIds; } - static void printDictionary(String filename, BinaryDictionary systemDict, PrintStream output) throws IOException { - GrammarImpl grammar = null; + void printHeader() { + // @formatter:off + printColumnHeaders(Column.Surface, Column.LeftId, Column.RightId, Column.Cost, Column.Pos1, Column.Pos2, + Column.Pos3, Column.Pos4, Column.Pos5, Column.Pos6, Column.ReadingForm, Column.DictionaryForm, + Column.NormalizedForm, Column.Mode, Column.SplitA, Column.SplitB, Column.SplitC, Column.WordStructure, + Column.SynonymGroups, Column.UserData); + // @formatter:on + } - 
try (BinaryDictionary dictionary = new BinaryDictionary(filename)) { - if (dictionary.getDictionaryHeader().isSystemDictionary()) { - grammar = dictionary.getGrammar(); - } else if (systemDict == null) { - throw new IllegalArgumentException("the system dictionary is not specified"); - } else { - grammar = systemDict.getGrammar(); - if (DictionaryVersion.hasGrammar(dictionary.getDictionaryHeader().getVersion())) { - grammar.addPosList(dictionary.getGrammar()); - } - } + void printColumnHeaders(Column... headers) { + for (Column c : headers) { + output.print(c.name()); + } + output.println(); + } - List posStrings = new ArrayList<>(); - for (short pid = 0; pid < grammar.getPartOfSpeechSize(); pid++) { - posStrings.add(String.join(",", grammar.getPartOfSpeechString(pid))); - } + void printEntry(int wordId) { + WordInfo info = lex.getWordInfo(wordId); + POS pos = grammar.getPartOfSpeechString(info.getPOSId()); + long params = lex.parameters(wordId); + short leftId = WordParameters.leftId(params); + short rightId = WordParameters.rightId(params); + short cost = WordParameters.cost(params); + String surface = lex.string(0, info.getSurface()); + String reading = lex.string(0, info.getReadingForm()); + field(surface); + field(leftId); + field(rightId); + field(cost); + field(pos.get(0)); + field(pos.get(1)); + field(pos.get(2)); + field(pos.get(3)); + field(pos.get(4)); + field(pos.get(5)); + field(reading); + entryPtr(info.getNormalizedForm(), ","); + entryPtr(info.getDictionaryForm(), ","); + output.print("\n"); + } + + void entryPtr(int wordId, String delimiter) { + WordInfo info = lex.getWordInfo(wordId); + POS pos = grammar.getPartOfSpeechString(info.getPOSId()); + String surface = lex.string(0, info.getSurface()); + String reading = lex.string(0, info.getReadingForm()); + ptrPart(surface, "-"); + ptrPart(pos.get(0), "-"); + ptrPart(pos.get(1), "-"); + ptrPart(pos.get(2), "-"); + ptrPart(pos.get(3), "-"); + ptrPart(pos.get(4), "-"); + ptrPart(pos.get(5), "-"); + 
ptrPart(reading, ""); + output.print(delimiter); + } + + void ptrPart(String part, String delimiter) { + output.print(part); + output.print(delimiter); + } - Lexicon lexicon = dictionary.getLexicon(); - for (int wordId = 0; wordId < lexicon.size(); wordId++) { - short leftId = lexicon.getLeftId(wordId); - short rightId = lexicon.getRightId(wordId); - short cost = lexicon.getCost(wordId); - WordInfo wordInfo = lexicon.getWordInfo(wordId); - - char unitType = getUnitType(wordInfo); - - output.println(String.format("%s,%d,%d,%d,%s,%s,%s,%s,%s,%c,%s,%s,%s", wordInfo.getSurface(), leftId, - rightId, cost, wordInfo.getSurface(), posStrings.get(wordInfo.getPOSId()), - wordInfo.getReadingForm(), wordInfo.getNormalizedForm(), - wordIdToString(wordInfo.getDictionaryFormWordId()), unitType, - splitToString(wordInfo.getAunitSplit()), splitToString(wordInfo.getBunitSplit()), - splitToString(wordInfo.getWordStructure()))); + void field(short value) { + output.print(value); + output.print(','); + } + + void field(String value) { + output.print(maybeQuoteField(value)); + output.print(','); + } + + private String maybeQuoteField(String value) { + boolean hasCommas = value.indexOf(',') != -1; + boolean hasQuotes = value.indexOf('"') != -1; + if (hasCommas || hasQuotes) { + return escape(value, hasQuotes); + } + return value; + } + + private String maybeQuoteRefPart(String value) { + if (value.indexOf(',') != -1 || value.indexOf('"') != -1 || value.indexOf('-') != -1 || value.indexOf( + '/') != -1) { + return fullEscape(value); + } + return value; + } + + private String escape(String value, boolean hasQuotes) { + if (hasQuotes) { + return fullEscape(value); + } + // only commas + return "\"" + value + "\""; + } + + private String fullEscape(String value) { + StringBuilder sb = new StringBuilder(value.length() + 10); + int len = value.length(); + for (int i = 0; i < len; ++i) { + char c = value.charAt(i); + if (c != '"' && c != '-' && c != ',' && c != '/') { + sb.append(c); + } else { 
+ sb.append("\\u{").append(Integer.toHexString(c)).append('}'); } } + return sb.toString(); } - static String wordIdToString(int wid) { - return (wid < 0) ? "*" : Integer.toString(wid); + private void printEntries() { + for (int i = 0; i < wordIds.length(); ++i) { + printEntry(wordIds.get(i)); + } } + static void printDictionary(String filename, BinaryDictionary systemDict, PrintStream output) throws IOException { + try (BinaryDictionary dictionary = new BinaryDictionary(filename)) { + DictionaryPrinter dp = new DictionaryPrinter(output, dictionary, systemDict); + dp.printHeader(); + dp.printEntries(); + } + } + + + static char getUnitType(WordInfo info) { if (info.getAunitSplit().length == 0) { return 'A'; @@ -104,11 +222,11 @@ static String splitToString(int[] split) { * *

* This tool requires the system dictionary when it dumps an user dictionary. - * + * * @param args - * the option and the input filename + * the option and the input filename * @throws IOException - * if IO + * if IO */ public static void main(String[] args) throws IOException { BinaryDictionary systemDict = null; diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/DoubleArrayLexicon.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/DoubleArrayLexicon.java index 121596e7..720c6d5e 100644 --- a/src/main/java/com/worksap/nlp/sudachi/dictionary/DoubleArrayLexicon.java +++ b/src/main/java/com/worksap/nlp/sudachi/dictionary/DoubleArrayLexicon.java @@ -16,7 +16,6 @@ package com.worksap.nlp.sudachi.dictionary; -import java.nio.Buffer; import java.nio.ByteBuffer; import java.nio.IntBuffer; import java.util.Iterator; @@ -26,30 +25,43 @@ import com.worksap.nlp.sudachi.Tokenizer; public class DoubleArrayLexicon implements Lexicon { - static final int USER_DICT_COST_PAR_MORPH = -20; - - private final WordIdTable wordIdTable; - private final WordParameterList wordParams; private final WordInfoList wordInfos; private final DoubleArray trie; + private final WordParameters parameters; + private final Description description; + private final WordIdTable wordIdTable; + private final CompactedStrings strings; - public DoubleArrayLexicon(ByteBuffer bytes, int offset, boolean hasSynonymGid) { - trie = new DoubleArray(); - int size = bytes.getInt(offset); - offset += 4; - ((Buffer) bytes).position(offset); // a kludge for Java 9 - IntBuffer array = bytes.asIntBuffer(); - trie.setArray(array, size); - offset += trie.totalSize(); - wordIdTable = new WordIdTable(bytes, offset); - offset += wordIdTable.storageSize(); + public DoubleArrayLexicon(Description description, WordIdTable wordIdTable, WordParameters wordParams, WordInfoList wordInfos, + DoubleArray trie, CompactedStrings strings) { + this.description = description; + this.wordIdTable = wordIdTable; + this.parameters 
= wordParams; + this.wordInfos = wordInfos; + this.trie = trie; + this.strings = strings; + } + + public static DoubleArrayLexicon load(ByteBuffer bytes, Description header) { + ByteBuffer trieBuf = header.slice(bytes, Blocks.TRIE_INDEX); + DoubleArray da = new DoubleArray(); + IntBuffer array = trieBuf.asIntBuffer(); + da.setArray(array, array.limit()); + + WordParameters parms; + if (header.isRuntimeCosts()) { + parms = WordParameters.readWrite(bytes, header); + } else { + parms = WordParameters.readOnly(bytes, header); + } - wordParams = new WordParameterList(bytes, offset); - offset += wordParams.storageSize(); + WordIdTable idTable = new WordIdTable(header.slice(bytes, Blocks.WORD_POINTERS)); + WordInfoList infos = new WordInfoList(header.slice(bytes, Blocks.ENTRIES)); + CompactedStrings strings = new CompactedStrings(header.slice(bytes, Blocks.STRINGS).asCharBuffer()); - wordInfos = new WordInfoList(bytes, offset, wordParams.size(), hasSynonymGid); + return new DoubleArrayLexicon(header, idTable, parms, infos, da, strings); } /** @@ -85,6 +97,11 @@ public WordIdTable getWordIdTable() { return wordIdTable; } + @Override + public long parameters(int wordId) { + return parameters.loadParams(wordId); + } + private class Itr implements Iterator { private final Iterator iterator; private Integer[] wordIds; @@ -118,56 +135,55 @@ public int[] next() { } @Override - public int getWordId(String headword, short posId, String readingForm) { - for (int wid = 0; wid < wordInfos.size(); wid++) { - WordInfo info = wordInfos.getWordInfo(wid); - if (info.getSurface().equals(headword) && info.getPOSId() == posId - && info.getReadingForm().equals(readingForm)) { - return wid; - } - } - return -1; - } - - @Override - public short getLeftId(int wordId) { - return wordParams.getLeftId(wordId); + public String string(int dic, int stringPtr) { + return strings.string(stringPtr); } @Override - public short getRightId(int wordId) { - return wordParams.getRightId(wordId); + public 
WordInfo getWordInfo(int wordId) { + return wordInfos.getWordInfo(wordId); } @Override - public short getCost(int wordId) { - return wordParams.getCost(wordId); + public int size() { + return (int) description.getNumTotalEntries(); } - @Override - public WordInfo getWordInfo(int wordId) { - return wordInfos.getWordInfo(wordId); + public Iterator wordIds() { + return wordIdTable.wordIds(); } - @Override - public int size() { - return wordParams.size(); + /** + * Returns true if the cost value is a normal value which can be used as is. + * Otherwise, it is a placeholder which needs to be recalculated + * based on the content of the dictionary. + * @param cost raw cost value + * @return true a normal cost value + */ + public static boolean isNormalCost(short cost) { + return cost != Short.MIN_VALUE; } - public void calculateCost(Tokenizer tokenizer) { - for (int wordId = 0; wordId < wordParams.size(); wordId++) { - if (getCost(wordId) != Short.MIN_VALUE) { - continue; + public void calculateDynamicCosts(Tokenizer tokenizer) { + Iterator outer = wordIdTable.wordIds(); + while (outer.hasNext()) { + Ints values = outer.next(); + for (int i = 0; i < values.length(); ++i) { + int wordId = values.get(i); + if (isNormalCost(WordParameters.cost(parameters(wordId)))) { + continue; + } + int surfPtr = wordInfos.surfacePtr(wordId); + String surface = strings.string(surfPtr); + MorphemeList ms = tokenizer.tokenize(surface); + int cost = ms.getInternalCost() + USER_DICT_COST_PAR_MORPH * ms.size(); + if (cost > Short.MAX_VALUE) { + cost = Short.MAX_VALUE; + } else if (cost < Short.MIN_VALUE) { + cost = Short.MIN_VALUE; + } + parameters.setCost(wordId, (short) cost); } - String surface = getWordInfo(wordId).getSurface(); - MorphemeList ms = tokenizer.tokenize(surface); - int cost = ms.getInternalCost() + USER_DICT_COST_PAR_MORPH * ms.size(); - if (cost > Short.MAX_VALUE) { - cost = Short.MAX_VALUE; - } else if (cost < Short.MIN_VALUE) { - cost = Short.MIN_VALUE; - } - 
wordParams.setCost(wordId, (short) cost); } } @@ -175,4 +191,8 @@ public void setDictionaryId(int id) { wordIdTable.setDictionaryId(id); } + @Override + public WordInfoList wordInfos(int dic) { + return wordInfos; + } } diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/GrammarImpl.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/GrammarImpl.java index f26caba3..93af0a47 100644 --- a/src/main/java/com/worksap/nlp/sudachi/dictionary/GrammarImpl.java +++ b/src/main/java/com/worksap/nlp/sudachi/dictionary/GrammarImpl.java @@ -70,6 +70,11 @@ public GrammarImpl() { originalPosSize = 0; } + public static GrammarImpl load(ByteBuffer binaryDic, Description header) { + Connection matrix = Connection.fromByteBufferV1(header.slice(binaryDic, Blocks.CONNECTION_MATRIX)); + return new GrammarImpl(); + } + public int storageSize() { return storageSize; } diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/Ints.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/Ints.java index 8c6123a7..9f498353 100644 --- a/src/main/java/com/worksap/nlp/sudachi/dictionary/Ints.java +++ b/src/main/java/com/worksap/nlp/sudachi/dictionary/Ints.java @@ -20,6 +20,10 @@ import java.util.Arrays; import java.util.StringJoiner; +/** + * Internal class for dealing with resizable integer arrays without boxing or double indirection. + * This class is not a part of Sudachi API and can be changed. 
+ */ public class Ints { private int[] data; private int length; @@ -60,11 +64,14 @@ public void clear() { length = 0; } - public void maybeResize(int additional) { + private int[] maybeResize(int additional) { int newSize = length + additional; - if (newSize > data.length) { - data = Arrays.copyOf(data, Math.max(newSize, length * 2)); + int[] d = data; + if (newSize > d.length) { + d = Arrays.copyOf(data, Math.max(newSize, length * 2)); + data = d; } + return d; } public static Ints wrap(int[] array, int size) { @@ -75,15 +82,24 @@ public static Ints wrap(int[] array) { return new Ints(array, array.length); } - private static final int[] EMPTY_ARRAY = new int[0]; + public static final int[] EMPTY_ARRAY = new int[0]; public static int[] readArray(ByteBuffer buffer, int len) { + if (len == 0) { + return EMPTY_ARRAY; + } + int position = buffer.position(); + buffer.position(position + len * 4); + return readArray(buffer, position, len); + } + + public static int[] readArray(ByteBuffer buffer, int offset, int len) { if (len == 0) { return EMPTY_ARRAY; } int[] result = new int[len]; for (int i = 0; i < len; ++i) { - result[i] = buffer.getInt(); + result[i] = buffer.getInt(offset + i * 4); } return result; } @@ -96,4 +112,22 @@ public String toString() { } return joiner.toString(); } + + public int[] prepare(int size) { + return maybeResize(length - size); + } + + public void appendAll(Ints other) { + int addedLength = other.length; + int[] write = maybeResize(addedLength); + int start = length; + if (addedLength >= 0) { + System.arraycopy(other.data, 0, write, start, addedLength); + } + length += addedLength; + } + + public void sort() { + Arrays.sort(data, 0, length); + } } diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/Lexicon.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/Lexicon.java index 33c8801f..2a7bccd2 100644 --- a/src/main/java/com/worksap/nlp/sudachi/dictionary/Lexicon.java +++ 
b/src/main/java/com/worksap/nlp/sudachi/dictionary/Lexicon.java @@ -27,77 +27,42 @@ public interface Lexicon { Iterator lookup(byte[] text, int offset); - int getWordId(String headword, short posId, String readingForm); - - /** - * Returns the left-ID of the morpheme specified by the word ID. - * - *

- * when the word ID is out of range, the behavior is undefined. - * - * @param wordId - * the word ID of the morpheme - * @return the left-ID of the morpheme - */ - short getLeftId(int wordId); - - /** - * Returns the right-ID of the morpheme specified by the word ID. - * - *

- * when the word ID is out of range, the behavior is undefined. - * - * @param wordId - * the word ID of the morpheme - * @return the right-ID of the morpheme. - */ - short getRightId(int wordId); - /** - * Returns the word occurrence cost of the morpheme specified by the word ID. - * - *

- * when the word ID is out of range, the behavior is undefined. - * - * @param wordId - * the word ID of the morpheme - * @return the word occurrence cost + * Return packed parameters for the morpheme with the given id. + * Parameters are leftId, rightId, cost packed in a single long value. + * @param wordId id of word to extract parameters + * @return long value of packed parameters */ - short getCost(int wordId); + long parameters(int wordId); /** - * Returns the informations of the morpheme specified by the word ID. + * Returns the on-disk information of the morpheme specified by the word ID. * *

* when the word ID is out of range, the behavior is undefined. * * @param wordId * the word ID of the morpheme - * @return the informations of the morpheme + * @return on-disk information for the morpheme with the given id * @see WordInfo */ WordInfo getWordInfo(int wordId); - /** - * Returns the ID of the dictionary containing the morpheme specified by the - * word ID. - * - * If the morpheme is in the system dictionary, it returns {@code 0}. - * - * @param wordId - * the word ID of the morpheme - * @return the dictionary ID - * @deprecated use {@link WordId#dic(int)} - */ - @Deprecated - default int getDictionaryId(int wordId) { - return WordId.dic(wordId); - } - /** * Returns the number of morphemes in the dictionary. * * @return the number of morphemes */ int size(); + + /** + * Get the string with the given packed string pointer from the dictionary + * @param dic dictionary id + * @param stringPtr packed string pointer + * @return String object value, copy of the in-memory representation + * @see WordId#dic(int) + */ + String string(int dic, int stringPtr); + + WordInfoList wordInfos(int dic); } diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/LexiconSet.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/LexiconSet.java index 22b6eb2c..c6a091f0 100644 --- a/src/main/java/com/worksap/nlp/sudachi/dictionary/LexiconSet.java +++ b/src/main/java/com/worksap/nlp/sudachi/dictionary/LexiconSet.java @@ -100,32 +100,6 @@ public int[] next() { } } - @Override - public int getWordId(String headword, short posId, String readingForm) { - for (int dictId = 1; dictId < lexicons.size(); dictId++) { - int wid = lexicons.get(dictId).getWordId(headword, posId, readingForm); - if (wid >= 0) { - return buildWordId(dictId, wid); - } - } - return lexicons.get(0).getWordId(headword, posId, readingForm); - } - - @Override - public short getLeftId(int wordId) { - return lexicons.get(WordId.dic(wordId)).getLeftId(getWordId(wordId)); - } - - @Override - public short 
getRightId(int wordId) { - return lexicons.get(WordId.dic(wordId)).getRightId(getWordId(wordId)); - } - - @Override - public short getCost(int wordId) { - return lexicons.get(WordId.dic(wordId)).getCost(getWordId(wordId)); - } - @Override public WordInfo getWordInfo(int wordId) { int dictionaryId = WordId.dic(wordId); @@ -176,4 +150,20 @@ public boolean isValid() { public void invalidate() { lexicons = null; } + + @Override + public long parameters(int wordId) { + int dic = WordId.dic(wordId); + return lexicons.get(dic).parameters(wordId); + } + + @Override + public String string(int dic, int stringPtr) { + return lexicons.get(dic).string(dic, stringPtr); + } + + @Override + public WordInfoList wordInfos(int dic) { + return lexicons.get(dic).wordInfos(dic); + } } diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/UserDictionaryBuilder.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/UserDictionaryBuilder.java index 05a2d70c..e1280163 100644 --- a/src/main/java/com/worksap/nlp/sudachi/dictionary/UserDictionaryBuilder.java +++ b/src/main/java/com/worksap/nlp/sudachi/dictionary/UserDictionaryBuilder.java @@ -88,7 +88,7 @@ public static void main(String[] args) throws IOException { List lexiconPaths = Arrays.asList(args).subList(i, args.length); try (BinaryDictionary system = new BinaryDictionary(sysDictPath)) { - DicBuilder.User builder = DicBuilder.user(system).description(description) + DicBuilder.User builder = DicBuilder.user(system).comment(description) .progress(new Progress(20, new DictionaryBuilder.StderrProgress())); for (String lexicon : lexiconPaths) { diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/WordIdTable.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/WordIdTable.java index b79eb17b..3c2778cd 100644 --- a/src/main/java/com/worksap/nlp/sudachi/dictionary/WordIdTable.java +++ b/src/main/java/com/worksap/nlp/sudachi/dictionary/WordIdTable.java @@ -19,29 +19,28 @@ import com.worksap.nlp.sudachi.WordId; import 
java.nio.ByteBuffer; +import java.util.Iterator; class WordIdTable { private final ByteBuffer bytes; - private final int size; - private final int offset; private int dicIdMask = 0; - WordIdTable(ByteBuffer bytes, int offset) { + WordIdTable(ByteBuffer bytes) { this.bytes = bytes; - size = bytes.getInt(offset); - this.offset = offset + 4; - } - - int storageSize() { - return 4 + size; } Integer[] get(int index) { - int length = Byte.toUnsignedInt(bytes.get(offset + index++)); + ByteBuffer dup = bytes.duplicate(); + dup.position(index); + BufReader reader = new BufReader(dup); + int length = reader.readVarint32(); Integer[] result = new Integer[length]; + int mask = dicIdMask; + int sum = 0; for (int i = 0; i < length; i++) { - result[i] = bytes.getInt(offset + index); - index += 4; + int v = reader.readVarint32(); + result[i] = WordId.applyMask(v + sum, mask); + sum += v; } return result; } @@ -56,21 +55,52 @@ Integer[] get(int index) { * @return number of read IDs */ int readWordIds(int index, WordLookup lookup) { - int offset = this.offset + index; - ByteBuffer bytes = this.bytes; - int length = Byte.toUnsignedInt(bytes.get(offset)); - offset += 1; + ByteBuffer dup = bytes.duplicate(); + dup.position(index); + BufReader reader = new BufReader(dup); + int length = reader.readVarint32(); int[] result = lookup.outputBuffer(length); - int dicIdMask = this.dicIdMask; - for (int i = 0; i < length; i++) { - int wordId = bytes.getInt(offset); - result[i] = WordId.applyMask(wordId, dicIdMask); - offset += 4; - } + readDeltaCompressed(result, length, this.dicIdMask, reader); return length; } + private static void readDeltaCompressed(int[] result, int count, int mask, BufReader reader) { + int sum = 0; + for (int i = 0; i < count; ++i) { + int v = reader.readVarint32(); + result[i] = WordId.applyMask(v + sum, mask); + sum += v; + } + } + void setDictionaryId(int id) { dicIdMask = WordId.dicIdMask(id); } + + /** + * Iterates over all valid word ids in the dictionary. 
+ * Iteration order is not the same as the original dictionary order, but dictionary ids, when sorted, form the correct order. + *
+ * The returned Ints object will be the same for each invocation of {@code next()}. + * @return iterator object + */ + public Iterator wordIds() { + return new Iterator() { + private final BufReader buf = new BufReader(bytes.duplicate()); + private final Ints ints = new Ints(16); + @Override + public boolean hasNext() { + return buf.remaining() > 0; + } + + @Override + public Ints next() { + BufReader r = buf; + int size = r.readVarint32(); + int[] data = ints.prepare(size); + readDeltaCompressed(data, size, dicIdMask, r); + return ints; + } + }; + } } diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/WordInfo.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/WordInfo.java index f945d64d..c703bd3a 100644 --- a/src/main/java/com/worksap/nlp/sudachi/dictionary/WordInfo.java +++ b/src/main/java/com/worksap/nlp/sudachi/dictionary/WordInfo.java @@ -21,73 +21,64 @@ import java.nio.ByteBuffer; /** - * Informations of the morpheme. + * Internal morpheme information. This class does not contain any strings. * *

- * This class has the informations which are not used in the graph calculation. + * This class holds morpheme data which is not used in the viterbi search. */ public class WordInfo { - - private final String surface; private final short headwordLength; private short posId; - private final int normalizedFormWordId; - private final String normalizedForm; - private final int dictionaryFormWordId; - private final String dictionaryForm; - private final String readingForm; + private final int surface; + private final int reading; + private final int normalizedForm; + private final int dictionaryForm; private final int[] aUnitSplit; private final int[] bUnitSplit; + private final int[] cUnitSplit; private final int[] wordStructure; private final int[] synonymGids; + private final String userData; - public WordInfo(String surface, short headwordLength, short posId, String normalizedForm, int dictionaryFormWordId, - String dictionaryForm, String readingForm, int[] aUnitSplit, int[] bUnitSplit, int[] wordStructure, - int[] synonymGids) { - this.surface = surface; + public WordInfo(short headwordLength, short posId, int surface, int reading, int normalizedForm, int dictionaryForm, + int[] aUnitSplit, int[] bUnitSplit, int[] cUnitSplit, int[] wordStructure, int[] synonymGids, + String userData) { this.headwordLength = headwordLength; this.posId = posId; - this.normalizedFormWordId = 0; + this.surface = surface; + this.reading = reading; this.normalizedForm = normalizedForm; - this.dictionaryFormWordId = dictionaryFormWordId; this.dictionaryForm = dictionaryForm; - this.readingForm = readingForm; this.aUnitSplit = aUnitSplit; this.bUnitSplit = bUnitSplit; + this.cUnitSplit = cUnitSplit; this.wordStructure = wordStructure; this.synonymGids = synonymGids; + this.userData = userData; } /** - * Allocates informations of morpheme not in the lexicons. + * Allocates morpheme information for ones not in the lexicon. + * For example, OOVs. 
* - * @param surface - * the text of the morpheme * @param headwordLength * the length of the morpheme * @param posId * the ID of the part-of-speech of the morpheme - * @param normalizedForm - * the normalized form of the morpheme - * @param dictionaryForm - * the dictionary form of the morpheme - * @param readingForm - * the reading form of the morpheme */ - public WordInfo(String surface, short headwordLength, short posId, String normalizedForm, String dictionaryForm, - String readingForm) { - this.surface = surface; + public WordInfo(short headwordLength, short posId) { this.headwordLength = headwordLength; this.posId = posId; - this.normalizedFormWordId = 0; - this.normalizedForm = normalizedForm; - this.dictionaryFormWordId = -1; - this.dictionaryForm = dictionaryForm; - this.readingForm = readingForm; - this.aUnitSplit = new int[0]; - this.bUnitSplit = new int[0]; - this.wordStructure = new int[0]; - this.synonymGids = new int[0]; + this.surface = 0; + this.normalizedForm = 0; + this.dictionaryForm = 0; + this.reading = 0; + this.aUnitSplit = Ints.EMPTY_ARRAY; + this.bUnitSplit = Ints.EMPTY_ARRAY; + this.cUnitSplit = Ints.EMPTY_ARRAY; + this.wordStructure = Ints.EMPTY_ARRAY; + this.synonymGids = Ints.EMPTY_ARRAY; + this.userData = ""; } /** @@ -95,7 +86,7 @@ public WordInfo(String surface, short headwordLength, short posId, String normal * * @return the text of the morpheme */ - public String getSurface() { + public int getSurface() { return surface; } @@ -115,7 +106,6 @@ public short getLength() { /** * Returns the part-of-speech ID of the morpheme. - * * The strings of part-of-speech name can be gotten with * {@link Grammar#getPartOfSpeechString}. * @@ -136,42 +126,33 @@ public void setPOSId(short posId) { } /** - * Returns the normalized form of the morpheme. + * Returns the entry id of the normalized form of the morpheme. 
* * @return the normalized form of the morpheme */ - public String getNormalizedForm() { + public int getNormalizedForm() { return normalizedForm; } /** * Returns the word ID of the dictionary form of the morpheme. - * * The information of the dictionary form can be gotten with * {@link Lexicon#getWordInfo} * * @return the word ID of the dictionary form of the morpheme */ - public int getDictionaryFormWordId() { - return dictionaryFormWordId; - } - - /** - * Returns the dictionary form of the morpheme. - * - * @return the dictionary form of the morpheme - */ - public String getDictionaryForm() { + public int getDictionaryForm() { return dictionaryForm; } /** - * Returns the reading form of the morpheme. + * Returns the raw string pointer to the reading form of the morpheme. * - * @return the reading form of the morpheme + * @return raw string pointer of the reading form + * @see StringPtr */ - public String getReadingForm() { - return readingForm; + public int getReadingForm() { + return reading; } /** @@ -210,7 +191,7 @@ public int[] getWordStructure() { */ @Deprecated public int[] getSynonymGoupIds() { - return synonymGids; + return getSynonymGroupIds(); } /** @@ -222,32 +203,78 @@ public int[] getSynonymGroupIds() { return synonymGids; } - public static WordInfo read(ByteBuffer buffer) { - short leftId = buffer.getShort(); - short rightId = buffer.getShort(); - short cost = buffer.getShort(); - short posId = buffer.getShort(); - int surfacePtr = buffer.getInt(); - int readingPtr = buffer.getInt(); - int normFormPtr = buffer.getInt(); - int dicFormPtr = buffer.getInt(); - short utf8Length = buffer.getShort(); - byte cSplitLen = buffer.get(); - byte bSplitLen = buffer.get(); - byte aSplitLen = buffer.get(); - byte wordStructureLen = buffer.get(); - byte synonymLen = buffer.get(); - byte userDataFlag = buffer.get(); - int[] cSplit = Ints.readArray(buffer, cSplitLen); - int[] bSplit = Ints.readArray(buffer, bSplitLen); - int[] aSplit = Ints.readArray(buffer, 
aSplitLen); - int[] wordStructure = Ints.readArray(buffer, wordStructureLen); - int[] synonyms = Ints.readArray(buffer, synonymLen); - - String userData = ""; + public String getUserData() { + return userData; + } + + public static int surfaceForm(ByteBuffer buffer, int pos) { + return buffer.getInt(pos + 8); + } + + public static int readingForm(ByteBuffer buffer, int pos) { + return buffer.getInt(pos + 12); + } + + + private WordInfo(ByteBuffer buffer, int pos) { + // short leftId = buffer.getShort(pos); + // short rightId = buffer.getShort(pos + 2); + // short cost = buffer.getShort(pos + 4); + // do not modify buffer metadata for better performance + posId = buffer.getShort(pos + 6); + surface = surfaceForm(buffer, pos); // +8 + reading = readingForm(buffer, pos); // +12 + normalizedForm = buffer.getInt(pos + 16); + dictionaryForm = buffer.getInt(pos + 20); + long rest = buffer.getLong(pos + 24); + headwordLength = (short) (rest & 0xffff); + rest >>>= 16; + if (rest == 0) { + cUnitSplit = Ints.EMPTY_ARRAY; + bUnitSplit = Ints.EMPTY_ARRAY; + aUnitSplit = Ints.EMPTY_ARRAY; + wordStructure = Ints.EMPTY_ARRAY; + synonymGids = Ints.EMPTY_ARRAY; + userData = ""; + return; + } + int cSplitLen = (int) ((rest) & 0xff); + int bSplitLen = (int) ((rest >>> 8) & 0xff); + int aSplitLen = (int) ((rest >>> 16) & 0xff); + int wordStructureLen = (int) ((rest >>> 24) & 0xff); + int synonymLen = (int) ((rest >>> 32) & 0xff); + int userDataFlag = (int) ((rest >>> 40) & 0xff); + int offset = pos + 32; + cUnitSplit = Ints.readArray(buffer, offset, cSplitLen); + offset += cSplitLen * 4; + if (bSplitLen == 0xff) { + bUnitSplit = cUnitSplit; + } else { + bUnitSplit = Ints.readArray(buffer, offset, bSplitLen); + offset += bSplitLen * 4; + } + if (aSplitLen == 0xff) { + aUnitSplit = bUnitSplit; + } else { + aUnitSplit = Ints.readArray(buffer, offset, aSplitLen); + offset += aSplitLen * 4; + } + if (wordStructureLen == 0xff) { + wordStructure = aUnitSplit; + offset += wordStructureLen * 4; 
+ } else { + wordStructure = Ints.readArray(buffer, offset, wordStructureLen); + } + synonymGids = Ints.readArray(buffer, offset, synonymLen); + if (userDataFlag != 0) { userData = StringUtil.readLengthPrefixed(buffer); + } else { + userData = ""; } - throw new IllegalArgumentException(); + } + + public static WordInfo read(ByteBuffer buffer, int pos) { + return new WordInfo(buffer, pos); } } diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/WordInfoList.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/WordInfoList.java index 4dc2315c..6d9b0826 100644 --- a/src/main/java/com/worksap/nlp/sudachi/dictionary/WordInfoList.java +++ b/src/main/java/com/worksap/nlp/sudachi/dictionary/WordInfoList.java @@ -16,92 +16,25 @@ package com.worksap.nlp.sudachi.dictionary; -import java.nio.Buffer; import java.nio.ByteBuffer; -class WordInfoList { - +public class WordInfoList { private final ByteBuffer bytes; - private final int offset; - private final int wordSize; - private final boolean hasSynonymGid; - WordInfoList(ByteBuffer bytes, int offset, int wordSize, boolean hasSysnoymGid) { + WordInfoList(ByteBuffer bytes) { this.bytes = bytes; - this.offset = offset; - this.wordSize = wordSize; - this.hasSynonymGid = hasSysnoymGid; - } - - WordInfo getWordInfo(int wordId) { - ByteBuffer buf = bytes.asReadOnlyBuffer(); - buf.order(bytes.order()); - ((Buffer) buf).position(wordIdToOffset(wordId)); // a kludge for Java 9 - - String surface = bufferToString(buf); - short headwordLength = (short) bufferToStringLength(buf); - short posId = buf.getShort(); - String normalizedForm = bufferToString(buf); - if (normalizedForm.isEmpty()) { - normalizedForm = surface; - } - int dictionaryFormWordId = buf.getInt(); - String readingForm = bufferToString(buf); - if (readingForm.isEmpty()) { - readingForm = surface; - } - int[] aUnitSplit = bufferToIntArray(buf); - int[] bUnitSplit = bufferToIntArray(buf); - int[] wordStructure = bufferToIntArray(buf); - - int[] synonymGids = new 
int[0]; - if (hasSynonymGid) { - synonymGids = bufferToIntArray(buf); - } - - String dictionaryForm = surface; - if (dictionaryFormWordId >= 0 && dictionaryFormWordId != wordId) { - WordInfo wi = getWordInfo(dictionaryFormWordId); - dictionaryForm = wi.getSurface(); - } - - return new WordInfo(surface, headwordLength, posId, normalizedForm, dictionaryFormWordId, dictionaryForm, - readingForm, aUnitSplit, bUnitSplit, wordStructure, synonymGids); - } - - int size() { - return wordSize; - } - - private int wordIdToOffset(int wordId) { - return bytes.getInt(offset + 4 * wordId); } - private int bufferToStringLength(ByteBuffer buffer) { - byte length = buffer.get(); - if (length < 0) { - int high = Byte.toUnsignedInt(length); - int low = Byte.toUnsignedInt(buffer.get()); - return ((high & 0x7F) << 8) | low; - } - return length; + public WordInfo getWordInfo(int wordId) { + int position = wordId * 8; + return WordInfo.read(bytes, position); } - private String bufferToString(ByteBuffer buffer) { - int length = bufferToStringLength(buffer); - char[] str = new char[length]; - for (int i = 0; i < length; i++) { - str[i] = buffer.getChar(); - } - return new String(str); + public int surfacePtr(int wordId) { + return WordInfo.surfaceForm(bytes, wordId * 8); } - private int[] bufferToIntArray(ByteBuffer buffer) { - int length = Byte.toUnsignedInt(buffer.get()); - int[] array = new int[length]; - for (int i = 0; i < length; i++) { - array[i] = buffer.getInt(); - } - return array; + public int readingPtr(int wordId) { + return WordInfo.readingForm(bytes, wordId * 8); } } diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/WordParameters.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/WordParameters.java new file mode 100644 index 00000000..6025ca04 --- /dev/null +++ b/src/main/java/com/worksap/nlp/sudachi/dictionary/WordParameters.java @@ -0,0 +1,51 @@ +package com.worksap.nlp.sudachi.dictionary; + +import java.nio.ByteBuffer; +import java.nio.ByteOrder; + 
+public class WordParameters { + private final ByteBuffer data; + + private WordParameters(ByteBuffer data) { + this.data = data; + } + + public long loadParams(int wordId) { + int addr = wordId * 8; + return data.getLong(addr); + } + + public void setCost(int wordId, short cost) { + int addr = wordId * 8 + 6; + data.putShort(addr, cost); + } + + public static WordParameters readOnly(ByteBuffer full, Description desc) { + ByteBuffer data = desc.slice(full, Blocks.ENTRIES); + data.order(ByteOrder.LITTLE_ENDIAN); + return new WordParameters(data); + } + + public static WordParameters readWrite(ByteBuffer full, Description desc) { + WordParameters ro = readOnly(full, desc); + ByteBuffer roBuf = ro.data; + int lim = roBuf.limit(); + ByteBuffer buf = ByteBuffer.allocate(lim); + buf.order(ByteOrder.LITTLE_ENDIAN); + roBuf.put(buf); + buf.position(0); + return new WordParameters(buf); + } + + public static short leftId(long packed) { + return (short) (packed & 0xffff); + } + + public static short rightId(long packed) { + return (short) ((packed >>> 16) & 0xffff); + } + + public static short cost(long packed) { + return (short) ((packed >>> 32) & 0xffff); + } +} diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/BlockLayout.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/BlockLayout.java index 63947e9f..07a79a6a 100644 --- a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/BlockLayout.java +++ b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/BlockLayout.java @@ -16,14 +16,18 @@ package com.worksap.nlp.sudachi.dictionary.build; +import com.worksap.nlp.sudachi.dictionary.Description; + import java.io.IOException; import java.nio.channels.SeekableByteChannel; import java.util.ArrayList; import java.util.List; +import java.util.StringJoiner; public class BlockLayout { private final SeekableByteChannel channel; private final Progress progress; + private final List info = new ArrayList<>(); public BlockLayout(SeekableByteChannel 
channel, Progress progress) throws IOException { this.channel = channel; @@ -42,7 +46,14 @@ public T block(String name, BlockHandler handler) throws IOException { return result; } - private final static List info = new ArrayList<>(); + public List blocks() { + List result = new ArrayList<>(); + for (BlockInfo b: info) { + Description.Block published = new Description.Block(b.name, b.start, b.end - b.start); + result.add(published); + } + return result; + } private static class BlockInfo { String name; @@ -54,5 +65,11 @@ public BlockInfo(String name, long start, long end) { this.start = start; this.end = end; } + + @Override + public String toString() { + return new StringJoiner(", ", BlockInfo.class.getSimpleName() + "[", "]").add("name='" + name + "'") + .add("start=" + start).add("end=" + end).toString(); + } } } diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/BufWriter.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/BufWriter.java index f8688134..117aef7a 100644 --- a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/BufWriter.java +++ b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/BufWriter.java @@ -35,7 +35,7 @@ public BufWriter putByte(byte val) { // Encode int as LEB128 public BufWriter putVarint32(int val) { - if ((val & 0xff) == 0) { + if ((val & ~0x7f) == 0) { putByte((byte) val); } else { putVarintSlow(val & 0xffff_ffffL); @@ -44,7 +44,7 @@ public BufWriter putVarint32(int val) { } public BufWriter putVarint64(long val) { - if ((val & 0xff) == 0) { + if ((val & ~0x7fL) == 0) { putByte((byte) val); } else { putVarintSlow(val); diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/ChanneledBuffer.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/ChanneledBuffer.java index e1cb497b..a51f5145 100644 --- a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/ChanneledBuffer.java +++ b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/ChanneledBuffer.java @@ -86,6 +86,7 @@ public 
BufWriter writer(int maxLength) throws IOException { } public void flush() throws IOException { + buffer.flip(); channel.write(buffer); buffer.clear(); } diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/CsvLexicon.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/CsvLexicon.java index ed9834ea..fbee88e1 100644 --- a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/CsvLexicon.java +++ b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/CsvLexicon.java @@ -38,7 +38,7 @@ public class CsvLexicon implements WriteDictionary { private final Parameters parameters = new Parameters(); private final POSTable posTable; private final List entries = new ArrayList<>(); - private WordIdResolver widResolver = new WordLookup.Noop(); + private WordIdResolver widResolver = null; public CsvLexicon(POSTable pos) { posTable = pos; @@ -99,19 +99,6 @@ RawWordEntry parseLine(List cols) { if (cols.get(14).equals("A") && (!entry.aUnitSplitString.equals("*") || !entry.bUnitSplitString.equals("*"))) { throw new IllegalArgumentException("invalid splitting"); } - - int[] synonymGids = new int[0]; - if (cols.size() > 18) { - synonymGids = parseSynonymGids(cols.get(18)); - } - - entry.wordInfo = new WordInfo(cols.get(4), // headword - (short) cols.get(0).getBytes(StandardCharsets.UTF_8).length, posId, cols.get(12), // normalizedForm - (cols.get(13).equals("*") ? 
-1 : Integer.parseInt(cols.get(13))), // dictionaryFormWordId - "", // dummy - cols.get(11), // readingForm - null, null, null, synonymGids); - return entry; } @@ -191,53 +178,7 @@ int parseId(String text) { @Override public void writeTo(ModelOutput output) throws IOException { - // write number of entries - ByteBuffer buf = ByteBuffer.allocate(4); - buf.order(ByteOrder.LITTLE_ENDIAN); - buf.putInt(entries.size()); - buf.flip(); - output.write(buf); - - parameters.writeTo(output); - - int offsetsSize = 4 * entries.size(); - DicBuffer offsets = new DicBuffer(offsetsSize); - long offsetsPosition = output.position(); - // make a hole for - output.position(offsetsPosition + offsetsSize); - - output.withPart("word entries", () -> { - DicBuffer buffer = new DicBuffer(128 * 1024); - int offset = (int) output.position(); - int numEntries = entries.size(); - for (int i = 0; i < numEntries; ++i) { - RawWordEntry entry = entries.get(i); - if (buffer.wontFit(16 * 1024)) { - offset += buffer.consume(output::write); - } - offsets.putInt(offset + buffer.position()); - - WordInfo wi = entry.wordInfo; - buffer.put(wi.getSurface()); - buffer.putLength(wi.getLength()); - buffer.putShort(wi.getPOSId()); - buffer.putEmptyIfEqual(wi.getNormalizedForm(), wi.getSurface()); - buffer.putInt(wi.getDictionaryFormWordId()); - buffer.putEmptyIfEqual(wi.getReadingForm(), wi.getSurface()); - buffer.putInts(parseSplitInfo(entry.aUnitSplitString)); - buffer.putInts(parseSplitInfo(entry.bUnitSplitString)); - buffer.putInts(parseSplitInfo(entry.wordStructureString)); - buffer.putInts(wi.getSynonymGroupIds()); - output.progress(i, numEntries); - } - - buffer.consume(output::write); - }); - long pos = output.position(); - output.position(offsetsPosition); - output.withPart("WordInfo offsets", () -> offsets.consume(output::write)); - output.position(pos); } public int addEntry(RawWordEntry e) { diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/DicBuilder.java 
b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/DicBuilder.java index ec8c1342..2444f0c0 100644 --- a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/DicBuilder.java +++ b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/DicBuilder.java @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021 Works Applications Co., Ltd. + * Copyright (c) 2022 Works Applications Co., Ltd. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -16,191 +16,253 @@ package com.worksap.nlp.sudachi.dictionary.build; -import com.worksap.nlp.sudachi.dictionary.*; +import com.worksap.nlp.sudachi.dictionary.Blocks; +import com.worksap.nlp.sudachi.dictionary.Description; +import com.worksap.nlp.sudachi.dictionary.DictionaryAccess; import java.io.IOException; import java.io.InputStream; -import java.io.InputStreamReader; import java.net.URL; import java.net.URLConnection; -import java.nio.ByteBuffer; import java.nio.channels.SeekableByteChannel; -import java.nio.charset.StandardCharsets; import java.nio.file.Files; import java.nio.file.Path; -import java.util.ArrayList; -import java.util.List; +import java.nio.file.Paths; +import java.nio.file.StandardOpenOption; +import java.time.Instant; +import java.util.Objects; +import static java.lang.System.nanoTime; + +/** + * Fluid API for building a binary dictionary from a CSV file. + * See documentation for the format of the CSV dictionary. 
+ */ public class DicBuilder { private DicBuilder() { - /* instantiations are forbidden */ - } - - public static SystemNoMatrix system() { - return new SystemNoMatrix(new System()); + // no instances } - public static User user(DictionaryAccess system) { - return new User(system); - } - - public static abstract class Base> { + private static class Base> { protected final POSTable pos = new POSTable(); protected final ConnectionMatrix connection = new ConnectionMatrix(); - protected final Index index = new Index(); - protected String description = ""; - protected long version; - protected long creationTime = java.lang.System.currentTimeMillis(); - private final List inputs = new ArrayList<>(); - private Progress progress; - - protected WordIdResolver resolver() { - return new WordLookup.Csv(lexicon); - } + protected Progress progress = Progress.NOOP; + protected RawLexicon lexicon = new RawLexicon(); + protected final Description description = new Description(); @SuppressWarnings("unchecked") private T self() { return (T) this; } - protected final CsvLexicon lexicon = new CsvLexicon(pos); - - public BuildStats build(SeekableByteChannel result) throws IOException { - lexicon.setResolver(resolver()); - ModelOutput output = new ModelOutput(result); - if (progress != null) { - output.progressor(progress); + /** + * Import words from the csv lexicon into the binary dictionary compiler. + * + * @param name name of input file + * @param input factory for the InputStream with the lexicon content. May be called several times. + * @param size total size of the file in bytes. Used for reporting progress and can be not very precise. 
+ * @return current object + * @throws IOException when IO fails + */ + public T lexicon(String name, IOSupplier input, long size) throws IOException { + progress.startBlock(name, nanoTime(), Progress.Kind.INPUT); + try (InputStream is = input.get()) { + InputStream stream = new TrackingInputStream(is); + lexicon.read(name, stream, pos); } - DictionaryHeader header = new DictionaryHeader(version, creationTime, description); - - ByteBuffer headerBuffer = ByteBuffer.wrap(header.toByte()); - - output.write(headerBuffer); - pos.writeTo(output); - connection.writeTo(output); - index.writeTo(output); - lexicon.writeTo(output); - return new BuildStats(inputs, output.getParts()); + progress.endBlock(size, nanoTime()); + return self(); } - public T lexicon(URL data) throws IOException { - URLConnection conn = data.openConnection(); - try (InputStream is = conn.getInputStream()) { - long length = data.openConnection().getContentLengthLong(); - return lexiconImpl(data.getPath(), is, length); - } + /** + * Import words from the csv lexicon into the binary dictionary compiler. + * This method is for loading resources from classpath mostly, remote access is untested. + * + * @param url pointing to the + * @return current object + * @throws IOException when IO fails + * @see Class#getResource(String) + * @see ClassLoader#getResource(String) + */ + public T lexicon(URL url) throws IOException { + String name = url.getPath(); + URLConnection conn = url.openConnection(); + long size = conn.getContentLengthLong(); + return lexicon(name, conn::getInputStream, size); } + /** + * Import words from the csv lexicon into the binary dictionary compiler. 
+ * + * @param path csv file + * @return current object + * @throws IOException when IO fails + */ public T lexicon(Path path) throws IOException { - try (InputStream is = Files.newInputStream(path)) { - return lexiconImpl(path.getFileName().toString(), is, Files.size(path)); - } + String name = path.getFileName().toString(); + long size = Files.size(path); + return lexicon(name, () -> Files.newInputStream(path), size); } - public T lexicon(InputStream data) throws IOException { - return lexiconImpl("", data, data.available()); + /** + * Set the progress handler to the provided one + * @param progress handler + * @return current object + */ + public T progress(Progress progress) { + this.progress = Objects.requireNonNull(progress); + return self(); } - public T lexiconImpl(String name, InputStream data, long size) throws IOException { - long startTime = java.lang.System.nanoTime(); - if (progress != null) { - progress.startBlock(name, startTime, Progress.Kind.INPUT); - } - - TrackingInputStream tracker = new TrackingInputStream(data); - CSVParser parser = new CSVParser(new InputStreamReader(tracker, StandardCharsets.UTF_8)); - int line = 1; - while (true) { - List fields = parser.getNextRecord(); - if (fields == null) - break; - try { - RawWordEntry e = lexicon.parseLine(fields); - int wordId = lexicon.addEntry(e); - if (e.headword != null) { - index.add(e.headword, wordId); - } - line += 1; - } catch (Exception e) { - throw new InputFileException(line, fields.get(0), e); - } - if (progress != null) { - progress.progress(tracker.getPosition(), size); - } - } - - long time = java.lang.System.nanoTime() - startTime; - if (progress != null) { - progress.endBlock(line, time); - } - - inputs.add(new ModelOutput.Part(name, time, line)); - + /** + * Set the comment string in the binary dictionary + * @param comment provided string + * @return current object + */ + public T comment(String comment) { + description.setComment(Objects.requireNonNull(comment)); return self(); 
} - public T description(String description) { - this.description = description; + /** + * Set the dictionary compilation time + * @param instant time to set + * @return current object + */ + public T compilationTime(Instant instant) { + description.setCompilationTime(Objects.requireNonNull(instant)); return self(); } - public T progress(Progress progress) { - this.progress = progress; - return self(); + /** + * Compile the binary dictionary and write it to the proviced channel + * @param channel contents will be written here + * @throws IOException if io fails + */ + public void build(SeekableByteChannel channel) throws IOException { + BlockLayout layout = new BlockLayout(channel, progress); + if (connection.nonEmpty()) { + layout.block(Blocks.CONNECTION_MATRIX, connection::compile); + } + layout.block(Blocks.POS_TABLE, pos::compile); + lexicon.compile(pos, layout); + description.setBlocks(layout.blocks()); + description.setNumberOfEntries(lexicon.getIndexedEntries(), lexicon.getTotalEntries()); + description.setRuntimeCosts(lexicon.hasRuntimeCosts()); + description.save(channel); } } public static final class System extends Base { - public System() { - version = DictionaryVersion.SYSTEM_DICT_VERSION_2; + private System readMatrix(String name, IOSupplier input, long size) throws IOException { + progress.startBlock(name, nanoTime(), Progress.Kind.INPUT); + try (InputStream is = input.get()) { + InputStream stream = new ProgressInputStream(is, size, progress); + connection.readEntries(stream); + } + progress.endBlock(size, nanoTime()); + return this; } - private void readMatrix(InputStream matrix) throws IOException { - connection.readEntries(matrix); - lexicon.setLimits(connection.getNumLeft(), connection.getNumRight()); + /** + * Set the system dictionary signature to the provided string. + * By default, it is current timestamp and a random 8 hexadecimal characters. + * @param signature provided dictionary signature. Can not be empty. 
+ * @return current object + */ + public System signature(String signature) { + if (signature == null) { + throw new IllegalArgumentException("signature can not be null"); + } + if (signature.isEmpty()) { + throw new IllegalArgumentException("signature can not be empty"); + } + description.setSignature(signature); + return this; } } - public static final class User extends Base { - final DictionaryAccess dictionary; + /** + * Typestate pattern for system dictionary that does not have connection matrix added yet + */ + public static final class SystemNoMatrix { + private final System inner; - private User(DictionaryAccess dictionary) { - this.dictionary = dictionary; - this.version = DictionaryVersion.USER_DICT_VERSION_3; - Connection conn = dictionary.getGrammar().getConnection(); - lexicon.setLimits(conn.getLeftSize(), conn.getRightSize()); - connection.makeEmpty(); - pos.preloadFrom(dictionary.getGrammar()); + private SystemNoMatrix(DicBuilder.System inner) { + this.inner = inner; } - @Override - protected WordIdResolver resolver() { - return new WordLookup.Chain(new WordLookup.Prebuilt(dictionary.getLexicon()), new WordLookup.Csv(lexicon)); + /** + * Read connection matrix from MeCab matrix.def format text file. + * @param name name of the file + * @param data factory for the InputStream which contains the file. This can be called more than once. + * @param size total number of bytes for the file. This information will be only used for calculating progress. + * @return system dictionary builder + * @throws IOException if IO fails + */ + public DicBuilder.System matrix(String name, IOSupplier data, long size) throws IOException { + return inner.readMatrix(name, data, size); } - } - public static final class SystemNoMatrix { - private final System inner; - - private SystemNoMatrix(System inner) { - this.inner = inner; + /** + * Read connection matrix from MeCab matrix.def format text file. Classpath version. 
+ * @param data name of the file + * @return system dictionary builder + * @throws IOException if IO fails + */ + public DicBuilder.System matrix(URL data) throws IOException { + String name = data.getPath(); + URLConnection conn = data.openConnection(); + long size = conn.getContentLengthLong(); + return matrix(name, conn::getInputStream, size); } - public System matrix(InputStream data) throws IOException { - inner.readMatrix(data); - return inner; + /** + * Read connection matrix from MeCab matrix.def format text file. Filesystem version. + * @param path path to matrix.def format file + * @return system dictionary builder + * @throws IOException if IO fails + */ + public DicBuilder.System matrix(Path path) throws IOException { + String name = path.getFileName().toString(); + long size = Files.size(path); + return matrix(name, () -> Files.newInputStream(path), size); } + } - public System matrix(URL data) throws IOException { - try (InputStream is = data.openStream()) { - return matrix(is); - } + public static final class User extends Base { + private User(DictionaryAccess system) { + pos.preloadFrom(system.getGrammar()); + description.setSignature(""); } + } - public System matrix(Path path) throws IOException { - try (InputStream is = Files.newInputStream(path)) { - return matrix(is); - } + /** + * Create a new system dictionary compiler + * @return new dictionary compiler object + */ + public static SystemNoMatrix system() { + return new SystemNoMatrix(new System()); + } + + /** + * Create a new user dictionary compiler which will reference the provided user dictionary. 
+ * @param system referenced dictionary + * @return new dictionary compiler object + */ + public static User user(DictionaryAccess system) { + return new User(system); + } + + public static void main(String[] args) throws IOException { + Base b = new Base<>(); + Path input = Paths.get(args[0]); + b.lexicon(input); + Path output = Paths.get(args[1]); + Files.createDirectories(output.getParent()); + try (SeekableByteChannel chan = Files.newByteChannel(output, StandardOpenOption.WRITE, + StandardOpenOption.CREATE)) { + b.build(chan); } } } diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/DicBuilder2.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/DicBuilder2.java deleted file mode 100644 index e786c509..00000000 --- a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/DicBuilder2.java +++ /dev/null @@ -1,148 +0,0 @@ -/* - * Copyright (c) 2022 Works Applications Co., Ltd. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package com.worksap.nlp.sudachi.dictionary.build; - -import com.worksap.nlp.sudachi.dictionary.Blocks; -import com.worksap.nlp.sudachi.dictionary.Description; -import com.worksap.nlp.sudachi.dictionary.DictionaryAccess; - -import java.io.IOException; -import java.io.InputStream; -import java.net.URL; -import java.net.URLConnection; -import java.nio.channels.SeekableByteChannel; -import java.nio.file.Files; -import java.nio.file.Path; -import java.nio.file.Paths; -import java.nio.file.StandardOpenOption; - -import static java.lang.System.nanoTime; - -public class DicBuilder2 { - private DicBuilder2() { - // no instances - } - - private static class Base> { - protected final POSTable pos = new POSTable(); - protected final ConnectionMatrix connection = new ConnectionMatrix(); - protected Progress progress = Progress.NOOP; - protected RawLexicon lexicon = new RawLexicon(); - protected final Description description = new Description(); - - @SuppressWarnings("unchecked") - private T self() { - return (T) this; - } - - public T lexicon(String name, IOSupplier input, long size) throws IOException { - progress.startBlock(name, nanoTime(), Progress.Kind.INPUT); - try (InputStream is = input.get()) { - InputStream stream = new TrackingInputStream(is); - lexicon.read(name, stream, pos); - } - progress.endBlock(size, nanoTime()); - return self(); - } - - public T lexicon(URL url) throws IOException { - String name = url.getPath(); - URLConnection conn = url.openConnection(); - long size = conn.getContentLengthLong(); - return lexicon(name, conn::getInputStream, size); - } - - public T lexicon(Path path) throws IOException { - String name = path.getFileName().toString(); - long size = Files.size(path); - return lexicon(name, () -> Files.newInputStream(path), size); - } - - public void write(SeekableByteChannel channel) throws IOException { - BlockLayout layout = new BlockLayout(channel, progress); - if (connection.nonEmpty()) { - layout.block(Blocks.CONNECTION_MATRIX, 
connection::compile); - } - layout.block(Blocks.POS_TABLE, pos::compile); - lexicon.compile(pos, layout); - } - } - - public static final class System extends Base { - private System readMatrix(String name, IOSupplier input, long size) throws IOException { - progress.startBlock(name, nanoTime(), Progress.Kind.INPUT); - try (InputStream is = input.get()) { - InputStream stream = new ProgressInputStream(is, size, progress); - connection.readEntries(stream); - } - progress.endBlock(size, nanoTime()); - return this; - } - } - - public static final class SystemNoMatrix { - private final System inner; - - private SystemNoMatrix(DicBuilder2.System inner) { - this.inner = inner; - } - - public DicBuilder2.System matrix(String name, IOSupplier data, long size) throws IOException { - return inner.readMatrix(name, data, size); - } - - public DicBuilder2.System matrix(URL data) throws IOException { - String name = data.getPath(); - URLConnection conn = data.openConnection(); - long size = conn.getContentLengthLong(); - return matrix(name, conn::getInputStream, size); - } - - public DicBuilder2.System matrix(Path path) throws IOException { - String name = path.getFileName().toString(); - long size = Files.size(path); - return matrix(name, () -> Files.newInputStream(path), size); - } - } - - public static final class User extends Base { - private User(DictionaryAccess system) { - pos.preloadFrom(system.getGrammar()); - description.setSignature(""); - } - } - - public static SystemNoMatrix system() { - return new SystemNoMatrix(new System()); - } - - public static User user(DictionaryAccess system) { - return new User(system); - } - - public static void main(String[] args) throws IOException { - Base b = new Base<>(); - Path input = Paths.get(args[0]); - b.lexicon(input.getFileName().toString(), () -> Files.newInputStream(input), Files.size(input)); - Path output = Paths.get(args[1]); - Files.createDirectories(output.getParent()); - try (SeekableByteChannel chan = 
Files.newByteChannel(output, StandardOpenOption.WRITE, - StandardOpenOption.CREATE)) { - b.write(chan); - } - } -} diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/InMemoryChannel.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/InMemoryChannel.java index 2cef9ba3..1426e74b 100644 --- a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/InMemoryChannel.java +++ b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/InMemoryChannel.java @@ -45,9 +45,10 @@ public void reserve(int needed) { @Override public int read(ByteBuffer dst) throws IOException { - int position = buffer.position(); - buffer.put(dst); - int newPosition = buffer.position(); + ByteBuffer src = buffer; + int position = src.position(); + dst.put(src); + int newPosition = src.position(); return newPosition - position; } diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/POSTable.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/POSTable.java index 7edf0724..a6b2c44c 100644 --- a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/POSTable.java +++ b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/POSTable.java @@ -89,6 +89,7 @@ public Void compile(BlockOutput out) throws IOException { } p.progress(i, table.size()); } + cbuf.flush(); return null; }); diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/RawLexicon.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/RawLexicon.java index 64cc2f31..62801b34 100644 --- a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/RawLexicon.java +++ b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/RawLexicon.java @@ -18,6 +18,7 @@ import com.worksap.nlp.sudachi.dictionary.Blocks; import com.worksap.nlp.sudachi.dictionary.CSVParser; +import com.worksap.nlp.sudachi.dictionary.DoubleArrayLexicon; import java.io.IOException; import java.io.InputStream; @@ -38,6 +39,7 @@ public class RawLexicon { private boolean user; private long offset = INITIAL_OFFSET; + 
private boolean runtimeCosts = false; public void read(String name, InputStream data, POSTable posTable) throws IOException { read(name, new InputStreamReader(data, StandardCharsets.UTF_8), posTable); @@ -61,6 +63,7 @@ public void read(String name, Reader data, POSTable posTable) throws IOException } else { notIndexed.add(entry); } + this.runtimeCosts |= !DoubleArrayLexicon.isNormalCost(entry.cost); } this.offset = offset; } @@ -103,6 +106,7 @@ private Void writeEntries(POSTable pos, BlockOutput blockOutput) throws IOExcept ptr = layout.put(e); p.progress(i, size); } + buf.flush(); return null; }); } @@ -114,4 +118,16 @@ private Void writeStrings(BlockOutput blockOutput) throws IOException { return null; }); } + + public int getIndexedEntries() { + return this.entries.size() - this.notIndexed.size(); + } + + public int getTotalEntries() { + return this.entries.size(); + } + + public boolean hasRuntimeCosts() { + return this.runtimeCosts; + } } diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/WordLookup.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/WordLookup.java index 749eabba..f2be0dce 100644 --- a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/WordLookup.java +++ b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/WordLookup.java @@ -24,123 +24,4 @@ public class WordLookup { private WordLookup() { } - - public static class Noop implements WordIdResolver { - @Override - public int lookup(String headword, short posId, String reading) { - return -1; - } - - @Override - public void validate(int wordId) { - // noop validator always works - } - - @Override - public boolean isUser() { - return false; - } - } - - public static class Csv implements WordIdResolver { - private final CsvLexicon lexicon; - - public Csv(CsvLexicon lexicon) { - this.lexicon = lexicon; - } - - @Override - public int lookup(String headword, short posId, String reading) { - List entries = lexicon.getEntries(); - for (int i = 0; i < entries.size(); 
++i) { - RawWordEntry entry = entries.get(i); - if (entry.wordInfo.getSurface().equals(headword) && entry.wordInfo.getPOSId() == posId - && entry.wordInfo.getReadingForm().equals(reading)) { - return i; - } - } - return -1; - } - - @Override - public void validate(int wordId) { - if (wordId < 0) { - throw new IllegalArgumentException("wordId can't be negative, was " + wordId); - } - List entries = lexicon.getEntries(); - if (wordId >= entries.size()) { - throw new IllegalArgumentException(String - .format("wordId %d was larger than number of dictionary entries (%d)", wordId, entries.size())); - } - } - - @Override - public boolean isUser() { - return false; - } - } - - public static class Prebuilt implements WordIdResolver { - private final Lexicon lexicon; - private final int prebuiltSize; - - public Prebuilt(Lexicon lexicon) { - this.lexicon = lexicon; - this.prebuiltSize = lexicon.size(); - } - - @Override - public int lookup(String headword, short posId, String reading) { - return lexicon.getWordId(headword, posId, reading); - } - - @Override - public void validate(int wordId) { - int word = WordId.word(wordId); - if (word > prebuiltSize) { - throw new IllegalArgumentException("WordId was larger than the number of dictionary entries"); - } - } - - @Override - public boolean isUser() { - return false; - } - } - - public static class Chain implements WordIdResolver { - private final WordIdResolver system; - private final WordIdResolver user; - - public Chain(WordIdResolver system, WordIdResolver user) { - this.system = system; - this.user = user; - } - - @Override - public int lookup(String headword, short posId, String reading) { - int wid = user.lookup(headword, posId, reading); - if (wid == -1) { - return system.lookup(headword, posId, reading); - } - return WordId.make(1, wid); - } - - @Override - public void validate(int wordId) { - int dic = WordId.dic(wordId); - if (dic == 0) { - system.validate(wordId); - } else if (dic == 1) { - 
user.validate(WordId.word(wordId)); - } else { - throw new IllegalArgumentException("dictionary id can be only 0 or 1 at the build time"); - } - } - - @Override - public boolean isUser() { - return true; - } - } } diff --git a/src/test/java/com/worksap/nlp/sudachi/JoinKatakanaOovPluginTest.java b/src/test/java/com/worksap/nlp/sudachi/JoinKatakanaOovPluginTest.java index a326983d..40828bf3 100644 --- a/src/test/java/com/worksap/nlp/sudachi/JoinKatakanaOovPluginTest.java +++ b/src/test/java/com/worksap/nlp/sudachi/JoinKatakanaOovPluginTest.java @@ -41,7 +41,7 @@ public void testKatakanaLength() { // アイ, アイウ in the dictionary plugin.minLength = 0; - List path = getPath("アイアイウ"); + List path = getPath("アイアイウ"); assertEquals(2, path.size()); plugin.minLength = 1; @@ -61,7 +61,7 @@ public void testKatakanaLength() { public void testPOS() { // アイアイウ is 名詞-固有名詞-地名-一般 in the dictionary plugin.minLength = 3; - List path = getPath("アイアイウ"); + List path = getPath("アイアイウ"); assertEquals(1, path.size()); assertFalse(path.get(0).isOOV()); // use the word in dictionary } @@ -69,32 +69,32 @@ public void testPOS() { @Test public void testStartWithMiddle() { plugin.minLength = 3; - List path = getPath("アイウアイアイウ"); + List path = getPath("アイウアイアイウ"); assertEquals(1, path.size()); } @Test public void testStartWithTail() { plugin.minLength = 3; - List path = getPath("アイウアイウアイ"); + List path = getPath("アイウアイウアイ"); assertEquals(1, path.size()); } @Test public void testWithNOOOVBOW() { plugin.minLength = 3; - List path = getPath("ァアイアイウ"); + List path = getPath("ァアイアイウ"); assertEquals(2, path.size()); - assertEquals("ァ", path.get(0).getWordInfo().getSurface()); + assertEquals("ァ", path.get(0).getBaseSurface()); path = getPath("アイウァアイウ"); assertEquals(1, path.size()); } - private List getPath(String text) { + private List getPath(String text) { UTF8InputText input = new UTF8InputTextBuilder(text, tokenizer.grammar).build(); LatticeImpl lattice = tokenizer.buildLattice(input); - List path = 
lattice.getBestPath(); + List path = lattice.getBestPath(); plugin.rewrite(input, path, lattice); lattice.clear(); return path; diff --git a/src/test/java/com/worksap/nlp/sudachi/JoinNumericPluginTest.java b/src/test/java/com/worksap/nlp/sudachi/JoinNumericPluginTest.java index d2721742..4ea9a7a5 100644 --- a/src/test/java/com/worksap/nlp/sudachi/JoinNumericPluginTest.java +++ b/src/test/java/com/worksap/nlp/sudachi/JoinNumericPluginTest.java @@ -42,22 +42,22 @@ public void setUp() throws IOException { @Test public void testDigit() { - List path = getPath("123円20銭"); + List path = getPath("123円20銭"); assertEquals(4, path.size()); - assertEquals("123", path.get(0).getWordInfo().getSurface()); - assertEquals("20", path.get(2).getWordInfo().getSurface()); + assertEquals("123", path.get(0).getBaseSurface()); + assertEquals("20", path.get(2).getBaseSurface()); path = getPath("080-121"); assertEquals(3, path.size()); - assertEquals("080", path.get(0).getWordInfo().getSurface()); - assertEquals("121", path.get(2).getWordInfo().getSurface()); + assertEquals("080", path.get(0).getBaseSurface()); + assertEquals("121", path.get(2).getBaseSurface()); } @Test public void testKanjiNumeric() { - List path = getPath("一二三万二千円"); + List path = getPath("一二三万二千円"); assertEquals(2, path.size()); - assertEquals("一二三万二千", path.get(0).getWordInfo().getSurface()); + assertEquals("一二三万二千", path.get(0).getBaseSurface()); path = getPath("二百百"); assertEquals(3, path.size()); @@ -66,107 +66,107 @@ public void testKanjiNumeric() { @Test public void testNormalize() { plugin.enableNormalize = true; - List path = getPath("一二三万二千円"); + List path = getPath("一二三万二千円"); assertEquals(2, path.size()); - assertEquals("1232000", path.get(0).getWordInfo().getNormalizedForm()); + assertEquals("1232000", path.get(0).getBaseSurface()); } @Test public void testNormalizeWithNotNumeric() { plugin.enableNormalize = true; - List path = getPath("六三四"); + List path = getPath("六三四"); assertEquals(1, path.size()); - 
assertEquals("六三四", path.get(0).getWordInfo().getNormalizedForm()); + assertEquals("六三四", path.get(0).getNormalizedForm()); } @Test public void testPoint() { plugin.enableNormalize = true; - List path = getPath("1.002"); + List path = getPath("1.002"); assertEquals(1, path.size()); - assertEquals("1.002", path.get(0).getWordInfo().getNormalizedForm()); + assertEquals("1.002", path.get(0).getNormalizedForm()); path = getPath(".002"); assertEquals(2, path.size()); - assertEquals(".", path.get(0).getWordInfo().getNormalizedForm()); - assertEquals("002", path.get(1).getWordInfo().getNormalizedForm()); + assertEquals(".", path.get(0).getNormalizedForm()); + assertEquals("002", path.get(1).getNormalizedForm()); path = getPath("22."); assertEquals(2, path.size()); - assertEquals("22", path.get(0).getWordInfo().getNormalizedForm()); - assertEquals(".", path.get(1).getWordInfo().getNormalizedForm()); + assertEquals("22", path.get(0).getNormalizedForm()); + assertEquals(".", path.get(1).getNormalizedForm()); path = getPath("22.節"); assertEquals(3, path.size()); - assertEquals("22", path.get(0).getWordInfo().getNormalizedForm()); - assertEquals(".", path.get(1).getWordInfo().getNormalizedForm()); + assertEquals("22", path.get(0).getNormalizedForm()); + assertEquals(".", path.get(1).getNormalizedForm()); path = getPath(".c"); assertEquals(2, path.size()); - assertEquals(".", path.get(0).getWordInfo().getNormalizedForm()); + assertEquals(".", path.get(0).getNormalizedForm()); path = getPath("1.20.3"); assertEquals(5, path.size()); - assertEquals("20", path.get(2).getWordInfo().getNormalizedForm()); + assertEquals("20", path.get(2).getNormalizedForm()); path = getPath("652..."); assertEquals(4, path.size()); - assertEquals("652", path.get(0).getWordInfo().getNormalizedForm()); + assertEquals("652", path.get(0).getNormalizedForm()); } @Test public void testComma() { plugin.enableNormalize = true; - List path = getPath("2,000,000"); + List path = getPath("2,000,000"); 
assertEquals(1, path.size()); - assertEquals("2000000", path.get(0).getWordInfo().getNormalizedForm()); + assertEquals("2000000", path.get(0).getNormalizedForm()); path = getPath("2,00,000,000円"); assertEquals(8, path.size()); - assertEquals("2", path.get(0).getWordInfo().getNormalizedForm()); - assertEquals(",", path.get(1).getWordInfo().getNormalizedForm()); - assertEquals("00", path.get(2).getWordInfo().getNormalizedForm()); - assertEquals(",", path.get(3).getWordInfo().getNormalizedForm()); - assertEquals("000", path.get(4).getWordInfo().getNormalizedForm()); - assertEquals(",", path.get(5).getWordInfo().getNormalizedForm()); - assertEquals("000", path.get(6).getWordInfo().getNormalizedForm()); + assertEquals("2", path.get(0).getNormalizedForm()); + assertEquals(",", path.get(1).getNormalizedForm()); + assertEquals("00", path.get(2).getNormalizedForm()); + assertEquals(",", path.get(3).getNormalizedForm()); + assertEquals("000", path.get(4).getNormalizedForm()); + assertEquals(",", path.get(5).getNormalizedForm()); + assertEquals("000", path.get(6).getNormalizedForm()); path = getPath(","); assertEquals(1, path.size()); path = getPath("652,,,"); assertEquals(4, path.size()); - assertEquals("652", path.get(0).getWordInfo().getNormalizedForm()); + assertEquals("652", path.get(0).getNormalizedForm()); path = getPath("256,5.50389"); assertEquals(3, path.size()); - assertEquals("256", path.get(0).getWordInfo().getNormalizedForm()); - assertEquals("5.50389", path.get(2).getWordInfo().getNormalizedForm()); + assertEquals("256", path.get(0).getNormalizedForm()); + assertEquals("5.50389", path.get(2).getNormalizedForm()); path = getPath("256,550.389"); assertEquals(1, path.size()); - assertEquals("256550.389", path.get(0).getWordInfo().getNormalizedForm()); + assertEquals("256550.389", path.get(0).getNormalizedForm()); } @Test public void testSingleNode() { plugin.enableNormalize = false; - List path = getPath("猫三匹"); + List path = getPath("猫三匹"); assertEquals(3, 
path.size()); - assertEquals("三", path.get(1).getWordInfo().getNormalizedForm()); + assertEquals("三", path.get(1).getNormalizedForm()); plugin.enableNormalize = true; path = getPath("猫三匹"); assertEquals(3, path.size()); - assertEquals("3", path.get(1).getWordInfo().getNormalizedForm()); + assertEquals("3", path.get(1).getNormalizedForm()); } - private List getPath(String text) { + private List getPath(String text) { UTF8InputText input = new UTF8InputTextBuilder(text, tokenizer.grammar).build(); LatticeImpl lattice = tokenizer.buildLattice(input); - List path = lattice.getBestPath(); + List path = lattice.getBestPath(); plugin.rewrite(input, path, lattice); lattice.clear(); return path; diff --git a/src/test/java/com/worksap/nlp/sudachi/MeCabOovProviderPluginTest.java b/src/test/java/com/worksap/nlp/sudachi/MeCabOovProviderPluginTest.java index 363ac368..092fcb69 100644 --- a/src/test/java/com/worksap/nlp/sudachi/MeCabOovProviderPluginTest.java +++ b/src/test/java/com/worksap/nlp/sudachi/MeCabOovProviderPluginTest.java @@ -56,10 +56,8 @@ public List provideOOV(InputText inputText, int offset, boolean oth public void setUp() throws IOException { plugin = new TestPlugin(); - MeCabOovProviderPlugin.OOV oov1 = new MeCabOovProviderPlugin.OOV(); - oov1.posId = 1; - MeCabOovProviderPlugin.OOV oov2 = new MeCabOovProviderPlugin.OOV(); - oov2.posId = 2; + LatticeNodeImpl.OOVFactory oov1 = TestLattice.oovFactory(1); + LatticeNodeImpl.OOVFactory oov2 = TestLattice.oovFactory(2); plugin.oovList.put(CategoryType.KANJI, Collections.singletonList(oov1)); plugin.oovList.put(CategoryType.KANJINUMERIC, Arrays.asList(oov1, oov2)); @@ -406,10 +404,8 @@ public void readOOV() throws IOException { plugin.readOOV(oovConfig, new MockGrammar(), "forbid"); assertThat(plugin.oovList.size(), is(1)); assertThat(plugin.oovList.get(CategoryType.DEFAULT).size(), is(2)); - assertThat(plugin.oovList.get(CategoryType.DEFAULT).get(0).leftId, is((short) 1)); - 
assertThat(plugin.oovList.get(CategoryType.DEFAULT).get(0).rightId, is((short) 2)); - assertThat(plugin.oovList.get(CategoryType.DEFAULT).get(0).cost, is((short) 3)); - assertThat(plugin.oovList.get(CategoryType.DEFAULT).get(0).posId, is((short) 0)); + assertThat(plugin.oovList.get(CategoryType.DEFAULT).get(0), is(LatticeNodeImpl.oovFactory((short) 1, (short) 2, + (short) 3, (short) 0))); } @Test(expected = IllegalArgumentException.class) diff --git a/src/test/java/com/worksap/nlp/sudachi/TestDictionary.kt b/src/test/java/com/worksap/nlp/sudachi/TestDictionary.kt index 0eca2d0a..4ff41355 100644 --- a/src/test/java/com/worksap/nlp/sudachi/TestDictionary.kt +++ b/src/test/java/com/worksap/nlp/sudachi/TestDictionary.kt @@ -27,7 +27,7 @@ object TestDictionary { DicBuilder.system() .matrix(res("/dict/matrix.def")) .lexicon(res("/dict/lex.csv")) - .description("the system dictionary for the unit tests") + .comment("the system dictionary for the unit tests") .build(result) result } diff --git a/src/test/java/com/worksap/nlp/sudachi/TestLattice.java b/src/test/java/com/worksap/nlp/sudachi/TestLattice.java new file mode 100644 index 00000000..24a6bff3 --- /dev/null +++ b/src/test/java/com/worksap/nlp/sudachi/TestLattice.java @@ -0,0 +1,11 @@ +package com.worksap.nlp.sudachi; + +public class TestLattice { + public static LatticeNodeImpl.OOVFactory oovFactory(int leftId, int rightId, int cost, int posId) { + return LatticeNodeImpl.oovFactory((short) leftId, (short) rightId, (short) cost, (short) posId); + } + + public static LatticeNodeImpl.OOVFactory oovFactory(int posId) { + return oovFactory(0, 0, 0, posId); + } +} diff --git a/src/test/java/com/worksap/nlp/sudachi/dictionary/BufReaderTest.kt b/src/test/java/com/worksap/nlp/sudachi/dictionary/BufReaderTest.kt index a6cfafae..9e67daec 100644 --- a/src/test/java/com/worksap/nlp/sudachi/dictionary/BufReaderTest.kt +++ b/src/test/java/com/worksap/nlp/sudachi/dictionary/BufReaderTest.kt @@ -69,6 +69,7 @@ class BufReaderTest { 
checkLong(0xff_ffff_ffff_ffff) checkLong(0x4ff_ffff_ffff_ffff) checkLong(0xfff_ffff_ffff_ffff) + checkLong(0x1000_0000_0000_0000) checkLong(0x4fff_ffff_ffff_ffff) checkLong(0x5fff_ffff_ffff_ffff) checkLong(0x6fff_ffff_ffff_ffff) @@ -89,12 +90,16 @@ class BufReaderTest { checkInt(0xff) checkInt(0x4ff) checkInt(0xfff) + checkInt(0x1000) checkInt(0x4fff) checkInt(0xffff) + checkInt(0x1_0000) checkInt(0x4_ffff) checkInt(0xf_ffff) + checkInt(0x10_0000) checkInt(0x4f_ffff) checkInt(0xff_ffff) + checkInt(0x100_0000) checkInt(0x4ff_ffff) checkInt(0xfff_ffff) checkInt(0x4fff_ffff) @@ -111,4 +116,12 @@ class BufReaderTest { checkUtf8String("""👨‍👩‍👧‍👦""") checkUtf8String("""t東e京s💞t都""") } + + @Test + fun checkInts() { + val checkInt = check({ w, x -> w.putVarint32(x) }, { it.readVarint32() }) + for (i in 0..10000) { + checkInt(i) + } + } } diff --git a/src/test/java/com/worksap/nlp/sudachi/dictionary/DescriptionTest.kt b/src/test/java/com/worksap/nlp/sudachi/dictionary/DescriptionTest.kt new file mode 100644 index 00000000..4d8896fb --- /dev/null +++ b/src/test/java/com/worksap/nlp/sudachi/dictionary/DescriptionTest.kt @@ -0,0 +1,29 @@ +package com.worksap.nlp.sudachi.dictionary + +import com.worksap.nlp.sudachi.dictionary.build.InMemoryChannel +import kotlin.test.Test +import kotlin.test.assertEquals + +class DescriptionTest { + @Test + fun serialization() { + val d = Description() + d.blocks = listOf(Description.Block("test", 5, 15), Description.Block("test2", 30, 25)) + d.reference = "testref" + d.comment = "コメント" + val chan = InMemoryChannel(4096) + d.save(chan) + chan.position(0) + val d2 = Description.load(chan) + assertEquals(d.comment, d2.comment) + assertEquals(d.reference, d2.reference) + assertEquals(d.signature, d2.signature) + assertEquals(d.blocks.size, d2.blocks.size) + assertEquals(d.blocks[0].name, d2.blocks[0].name) + assertEquals(d.blocks[0].start, d2.blocks[0].start) + assertEquals(d.blocks[0].size, d2.blocks[0].size) + assertEquals(d.blocks[1].name, 
d2.blocks[1].name) + assertEquals(d.blocks[1].start, d2.blocks[1].start) + assertEquals(d.blocks[1].size, d2.blocks[1].size) + } +} \ No newline at end of file diff --git a/src/test/java/com/worksap/nlp/sudachi/dictionary/DictionaryBuilderTest.java b/src/test/java/com/worksap/nlp/sudachi/dictionary/DictionaryBuilderTest.java index 0aaa098f..0a45c598 100644 --- a/src/test/java/com/worksap/nlp/sudachi/dictionary/DictionaryBuilderTest.java +++ b/src/test/java/com/worksap/nlp/sudachi/dictionary/DictionaryBuilderTest.java @@ -58,9 +58,8 @@ public void commandLine() throws IOException { try (BinaryDictionary dictionary = new BinaryDictionary(outputFile.getPath())) { - DictionaryHeader header = dictionary.getDictionaryHeader(); - assertThat(header.getVersion(), is(DictionaryVersion.SYSTEM_DICT_VERSION_2)); - assertThat(header.getDescription(), is("test")); + Description header = dictionary.getDictionaryHeader(); + assertThat(header.getComment(), is("test")); Grammar grammar = dictionary.getGrammar(); assertThat(grammar.getPartOfSpeechSize(), is(2)); @@ -70,29 +69,31 @@ public void commandLine() throws IOException { Lexicon lexicon = dictionary.getLexicon(); assertThat(lexicon.size(), is(3)); + long params = lexicon.parameters(0); - assertThat(lexicon.getLeftId(0), is((short) 0)); - assertThat(lexicon.getCost(0), is((short) 0)); + assertThat(WordParameters.leftId(params), is((short) 0)); + assertThat(WordParameters.cost(params), is((short) 0)); WordInfo info = lexicon.getWordInfo(0); assertThat(info.getSurface(), is("東京都")); assertThat(info.getNormalizedForm(), is("東京都")); - assertThat(info.getDictionaryFormWordId(), is(-1)); + assertThat(info.getDictionaryForm(), is(-1)); assertThat(info.getReadingForm(), is("ヒガシキョウト")); assertThat(info.getPOSId(), is((short) 0)); assertThat(info.getAunitSplit(), is(new int[] { 1, 2 })); assertThat(info.getBunitSplit().length, is(0)); - assertThat(info.getSynonymGoupIds(), is(new int[] { 1, 2 })); + assertThat(info.getSynonymGroupIds(), 
is(new int[] { 1, 2 })); Iterator i = lexicon.lookup("東京都".getBytes(StandardCharsets.UTF_8), 0); assertTrue(i.hasNext()); assertThat(i.next(), is(new int[] { 0, "東京都".getBytes(StandardCharsets.UTF_8).length })); assertFalse(i.hasNext()); - assertThat(lexicon.getLeftId(1), is((short) -1)); - assertThat(lexicon.getCost(1), is((short) 0)); + params = lexicon.parameters(1); + assertThat(WordParameters.leftId(params), is((short) -1)); + assertThat(WordParameters.cost(params), is((short) 0)); info = lexicon.getWordInfo(1); assertThat(info.getSurface(), is("東")); assertThat(info.getNormalizedForm(), is("ひがし")); - assertThat(info.getDictionaryFormWordId(), is(-1)); + assertThat(info.getDictionaryForm(), is(-1)); assertThat(info.getReadingForm(), is("ヒガシ")); assertThat(info.getPOSId(), is((short) 1)); assertThat(info.getAunitSplit().length, is(0)); diff --git a/src/test/java/com/worksap/nlp/sudachi/dictionary/DictionaryReader.java b/src/test/java/com/worksap/nlp/sudachi/dictionary/DictionaryReader.java deleted file mode 100644 index 8b89aa4c..00000000 --- a/src/test/java/com/worksap/nlp/sudachi/dictionary/DictionaryReader.java +++ /dev/null @@ -1,42 +0,0 @@ -/* - * Copyright (c) 2021 Works Applications Co., Ltd. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package com.worksap.nlp.sudachi.dictionary; - -import java.io.IOException; -import java.io.InputStream; -import java.nio.ByteBuffer; -import java.nio.ByteOrder; -import java.util.ArrayList; - -class DictionaryReader { - - static ByteBuffer read(String filename) throws IOException { - InputStream input = DictionaryReader.class.getResourceAsStream(filename); - ArrayList buffer = new ArrayList<>(); - for (int c = input.read(); c >= 0; c = input.read()) { - buffer.add((byte) c); - } - ByteBuffer bytes = ByteBuffer.allocate(buffer.size()); - bytes.order(ByteOrder.LITTLE_ENDIAN); - for (Byte b : buffer) { - bytes.put(b); - } - bytes.rewind(); - - return bytes; - } -} diff --git a/src/test/java/com/worksap/nlp/sudachi/dictionary/DoubleArrayLexiconTest.java b/src/test/java/com/worksap/nlp/sudachi/dictionary/DoubleArrayLexiconTest.java index 151b513d..aac3071b 100644 --- a/src/test/java/com/worksap/nlp/sudachi/dictionary/DoubleArrayLexiconTest.java +++ b/src/test/java/com/worksap/nlp/sudachi/dictionary/DoubleArrayLexiconTest.java @@ -31,7 +31,7 @@ import org.junit.Test; public class DoubleArrayLexiconTest { - +/* static final int GRAMMAR_SIZE = 470; DoubleArrayLexicon lexicon; @@ -87,7 +87,7 @@ public void wordInfo() { assertEquals(3, wi.getLength()); assertEquals(0, wi.getPOSId()); assertEquals("た", wi.getNormalizedForm()); - assertEquals(-1, wi.getDictionaryFormWordId()); + assertEquals(-1, wi.getDictionaryForm()); assertEquals("た", wi.getDictionaryForm()); assertEquals("タ", wi.getReadingForm()); assertArrayEquals(new int[0], wi.getAunitSplit()); @@ -98,7 +98,7 @@ public void wordInfo() { wi = lexicon.getWordInfo(8); assertEquals("行っ", wi.getSurface()); assertEquals("行く", wi.getNormalizedForm()); - assertEquals(7, wi.getDictionaryFormWordId()); + assertEquals(7, wi.getDictionaryForm()); assertEquals("行く", wi.getDictionaryForm()); // 東京都 @@ -107,7 +107,7 @@ public void wordInfo() { assertArrayEquals(new int[] { 5, 9 }, wi.getAunitSplit()); assertArrayEquals(new 
int[0], wi.getBunitSplit()); assertArrayEquals(new int[] { 5, 9 }, wi.getWordStructure()); - assertArrayEquals(new int[0], wi.getSynonymGoupIds()); + assertArrayEquals(new int[0], wi.getSynonymGroupIds()); } @Test @@ -117,7 +117,7 @@ public void wordInfoWithLongWord() { assertEquals(300, wi.getSurface().length()); assertEquals(300, wi.getLength()); assertEquals(300, wi.getNormalizedForm().length()); - assertEquals(-1, wi.getDictionaryFormWordId()); + assertEquals(-1, wi.getDictionaryForm()); assertEquals(300, wi.getDictionaryForm().length()); assertEquals(570, wi.getReadingForm().length()); } @@ -134,4 +134,5 @@ static List iteratorToList(Iterator iterator) { } return result; } + */ } diff --git a/src/test/java/com/worksap/nlp/sudachi/dictionary/UserDictionaryBuilderTest.java b/src/test/java/com/worksap/nlp/sudachi/dictionary/UserDictionaryBuilderTest.java index 48519f9c..811fc860 100644 --- a/src/test/java/com/worksap/nlp/sudachi/dictionary/UserDictionaryBuilderTest.java +++ b/src/test/java/com/worksap/nlp/sudachi/dictionary/UserDictionaryBuilderTest.java @@ -62,19 +62,19 @@ public void commandLine() throws IOException { "test", inputFile.getPath() }); try (BinaryDictionary dictionary = new BinaryDictionary(outputFile.getPath())) { - DictionaryHeader header = dictionary.getDictionaryHeader(); - assertThat(header.getVersion(), is(DictionaryVersion.USER_DICT_VERSION_3)); - assertThat(header.getDescription(), is("test")); + Description header = dictionary.getDictionaryHeader(); + assertThat(header.getComment(), is("test")); Lexicon lexicon = dictionary.getLexicon(); assertThat(lexicon.size(), is(2)); - assertThat(lexicon.getLeftId(0), is((short) 0)); - assertThat(lexicon.getCost(0), is((short) 0)); + long param = lexicon.parameters(0); + assertThat(WordParameters.leftId(param), is((short) 0)); + assertThat(WordParameters.cost(param), is((short) 0)); WordInfo info = lexicon.getWordInfo(0); assertThat(info.getSurface(), is("東京都市")); assertThat(info.getNormalizedForm(), 
is("東京都市")); - assertThat(info.getDictionaryFormWordId(), is(-1)); + assertThat(info.getDictionaryForm(), is(-1)); assertThat(info.getReadingForm(), is("ヒガシキョウトシ")); assertThat(info.getPOSId(), is((short) 3)); assertThat(info.getAunitSplit(), is(new int[] { 4, 3, 1 | (1 << 28) })); @@ -85,12 +85,13 @@ public void commandLine() throws IOException { assertThat(i.next(), is(new int[] { 0, "東京都市".getBytes(StandardCharsets.UTF_8).length })); assertFalse(i.hasNext()); - assertThat(lexicon.getLeftId(1), is((short) -1)); - assertThat(lexicon.getCost(1), is((short) 0)); + param = lexicon.parameters(1); + assertThat(WordParameters.leftId(param), is((short) -1)); + assertThat(WordParameters.cost(param), is((short) 0)); info = lexicon.getWordInfo(1); assertThat(info.getSurface(), is("市")); assertThat(info.getNormalizedForm(), is("市")); - assertThat(info.getDictionaryFormWordId(), is(-1)); + assertThat(info.getDictionaryForm(), is(-1)); assertThat(info.getReadingForm(), is("シ")); assertThat(info.getPOSId(), is((short) 4)); assertThat(info.getAunitSplit().length, is(0)); diff --git a/src/test/java/com/worksap/nlp/sudachi/dictionary/build/SystemDicTest.kt b/src/test/java/com/worksap/nlp/sudachi/dictionary/build/SystemDicTest.kt index dfe5ec35..2650248c 100644 --- a/src/test/java/com/worksap/nlp/sudachi/dictionary/build/SystemDicTest.kt +++ b/src/test/java/com/worksap/nlp/sudachi/dictionary/build/SystemDicTest.kt @@ -18,9 +18,19 @@ package com.worksap.nlp.sudachi.dictionary.build import com.worksap.nlp.sudachi.dictionary.BinaryDictionary import com.worksap.nlp.sudachi.dictionary.POS +import com.worksap.nlp.sudachi.morpheme import com.worksap.nlp.sudachi.res +import com.worksap.nlp.sudachi.wordInfo import kotlin.test.* +fun DicBuilder.System.lexicon(s: String): DicBuilder.System { + return this.lexicon("test", {s.byteInputStream()}, s.length.toLong()) +} + +fun DicBuilder.User.lexicon(s: String): DicBuilder.User { + return this.lexicon("test", {s.byteInputStream()}, 
s.length.toLong()) +} + class SystemDicTest { @Test fun simple() { @@ -40,49 +50,48 @@ class SystemDicTest { val data = MemChannel() repeat(10) { bldr.lexicon(javaClass.getResource("one.csv")) } bldr - .lexicon("南,1,1,4675,南,名詞,普通名詞,一般,*,*,*,ミナミ,西,5,C,0/1,2/3,4/5,6/7".byteInputStream()) + .lexicon("南,1,1,4675,南,名詞,普通名詞,一般,*,*,*,ミナミ,西,5,C,0/1,2/3,4/5,6/7") .build(data) val dic = BinaryDictionary(data.buffer()) assertEquals(11, dic.lexicon.size()) assertEquals(POS("名詞", "普通名詞", "一般", "*", "*", "*"), dic.grammar.getPartOfSpeechString(0)) - val wi = dic.lexicon.getWordInfo(10) - assertEquals(wi.surface, "南") + val m = dic.morpheme(10) + val wi = m.wordInfo + assertEquals(m.surface(), "南") assertEquals(wi.length, 3) assertEquals(wi.posId, 0) - assertEquals(wi.dictionaryFormWordId, 5) - assertEquals(wi.dictionaryForm, "東") - assertEquals(wi.normalizedForm, "西") - assertEquals(wi.readingForm, "ミナミ") + assertEquals(m.dictionaryForm(), "南") + assertEquals(m.normalizedForm(), "西") + assertEquals(m.readingForm(), "ミナミ") assertContentEquals(wi.aunitSplit, intArrayOf(0, 1)) assertContentEquals(wi.bunitSplit, intArrayOf(2, 3)) assertContentEquals(wi.wordStructure, intArrayOf(4, 5)) - assertContentEquals(wi.synonymGoupIds, intArrayOf(6, 7)) + assertContentEquals(m.synonymGroupIds, intArrayOf(6, 7)) } @Test fun fieldsCompressed() { val bldr = DicBuilder.system().matrix(javaClass.getResource("test.matrix")) val data = MemChannel() - bldr.lexicon("南,1,1,4675,南,名詞,普通名詞,一般,*,*,*,南,南,*,C,*,*,*,*".byteInputStream()).build(data) + bldr.lexicon("南,1,1,4675,南,名詞,普通名詞,一般,*,*,*,南,南,*,C,*,*,*,*").build(data) val dic = BinaryDictionary(data.buffer()) assertEquals(1, dic.lexicon.size()) assertEquals(POS("名詞", "普通名詞", "一般", "*", "*", "*"), dic.grammar.getPartOfSpeechString(0)) - val wi = dic.lexicon.getWordInfo(0) - assertEquals(wi.surface, "南") - assertEquals(wi.dictionaryFormWordId, -1) - assertEquals(wi.dictionaryForm, "南") - assertEquals(wi.normalizedForm, "南") - assertEquals(wi.readingForm, 
"南") + val m = dic.morpheme(0) + assertEquals(m.surface(), "南") + assertEquals(m.dictionaryForm(), "南") + assertEquals(m.normalizedForm(), "南") + assertEquals(m.readingForm(), "南") } @Test fun failMatrixSizeValidation() { val bldr = DicBuilder.system().matrix(res("test.matrix")) assertFails { - bldr.lexicon("東,4,1,4675,東,名詞,普通名詞,一般,*,*,*,ヒガシ,東,*,A,*,*,*,*".byteInputStream()) + bldr.lexicon("東,4,1,4675,東,名詞,普通名詞,一般,*,*,*,ヒガシ,東,*,A,*,*,*,*") } assertFails { - bldr.lexicon("東,1,4,4675,東,名詞,普通名詞,一般,*,*,*,ヒガシ,東,*,A,*,*,*,*".byteInputStream()) + bldr.lexicon("東,1,4,4675,東,名詞,普通名詞,一般,*,*,*,ヒガシ,東,*,A,*,*,*,*") } } @@ -95,8 +104,7 @@ class SystemDicTest { """東京,1,1,2816,東京,名詞,固有名詞,地名,一般,*,*,トウキョウ,東京,*,A,*,*,*,* 東京都,2,2,5320,東京都,名詞,固有名詞,地名,一般,*,*,トウキョウト,東京都,*,B,0/2,*,0/2,* 都,2,2,2914,都,名詞,普通名詞,一般,*,*,*,ト,都,*,A,*,*,*,*""" - .trimIndent() - .byteInputStream()) + .trimIndent()) .build(data) val dic = BinaryDictionary(data.buffer()) assertEquals(3, dic.lexicon.size()) @@ -114,8 +122,7 @@ class SystemDicTest { """東京,1,1,2816,東京,名詞,固有名詞,地名,一般,*,*,トウキョウ,東京,*,A,*,*,*,* 東京都,2,2,5320,東京都,名詞,固有名詞,地名,一般,*,*,トウキョウト,東京都,*,B,"東京,名詞,固有名詞,地名,一般,*,*,トウキョウ/2",*,0/2,* 都,2,2,2914,都,名詞,普通名詞,一般,*,*,*,ト,都,*,A,*,*,*,*""" - .trimIndent() - .byteInputStream()) + .trimIndent()) .build(data) val dic = BinaryDictionary(data.buffer()) assertEquals(3, dic.lexicon.size()) @@ -133,8 +140,7 @@ class SystemDicTest { """東京,1,1,2816,東京,名詞,固有名詞,地名,一般,*,*,トウキョウ,東京,*,A,*,*,*,* 東京都,2,2,5320,東京都,名詞,固有名詞,地名,一般,*,*,トウキョウト,東京都,*,B,*,0/2,0/2,* 都,2,2,2914,都,名詞,普通名詞,一般,*,*,*,ト,都,*,A,*,*,*,*""" - .trimIndent() - .byteInputStream()) + .trimIndent()) .build(data) val dic = BinaryDictionary(data.buffer()) assertEquals(3, dic.lexicon.size()) @@ -152,8 +158,7 @@ class SystemDicTest { """東京,1,1,2816,東京,名詞,固有名詞,地名,一般,*,*,トウキョウ,東京,*,A,*,*,*,* 東京都,2,2,5320,東京都,名詞,固有名詞,地名,一般,*,*,トウキョウト,東京都,*,B,*,0/2,U0/U2,* 都,2,2,2914,都,名詞,普通名詞,一般,*,*,*,ト,都,*,A,*,*,*,*""" - .trimIndent() - .byteInputStream()) + .trimIndent()) .build(data) val dic = 
BinaryDictionary(data.buffer()) assertEquals(3, dic.lexicon.size()) @@ -165,7 +170,7 @@ class SystemDicTest { @Test fun failSplitBoundsCheck() { val bldr = DicBuilder.system().matrix(res("test.matrix")) - bldr.lexicon("""東京都,2,2,5320,東京都,名詞,固有名詞,地名,一般,*,*,トウキョウト,東京都,*,B,*,1,*,*""".byteInputStream()) + bldr.lexicon("""東京都,2,2,5320,東京都,名詞,固有名詞,地名,一般,*,*,トウキョウト,東京都,*,B,*,1,*,*""") assertFails { bldr.build(MemChannel()) } } @@ -173,7 +178,7 @@ class SystemDicTest { fun failInvalidNumberOfInlineRefFields() { val bldr = DicBuilder.system().matrix(res("test.matrix")) bldr.lexicon( - """東京都,2,2,5320,東京都,名詞,固有名詞,地名,一般,*,*,トウキョウト,東京都,*,B,*,"a,b,c,d,e",*,*""".byteInputStream()) + """東京都,2,2,5320,東京都,名詞,固有名詞,地名,一般,*,*,トウキョウト,東京都,*,B,*,"a,b,c,d,e",*,*""") assertFails { bldr.build(MemChannel()) } } @@ -183,8 +188,7 @@ class SystemDicTest { bldr.lexicon( """東京,1,1,2816,東京,名詞,固有名詞,地名,一般,*,*,トウキョウ,東京,*,A,*,*,*,* 東京都,2,2,5320,東京都,名詞,固有名詞,地名,一般,*,*,トウキョウト,東京都,*,B,*,"東京,名詞,固有名詞,地名,一般,*,*,a",*,*""" - .trimMargin() - .byteInputStream()) + .trimMargin()) assertFails { bldr.build(MemChannel()) } } @@ -197,7 +201,7 @@ class SystemDicTest { val read = "b".repeat(1024) + istr val norm = "c".repeat(1024) + istr bldr.lexicon( - "$surf,1,1,2816,$surf,名詞,固有名詞,地名,一般,*,*,$read,$norm,*,A,*,*,*,*".byteInputStream()) + "$surf,1,1,2816,$surf,名詞,固有名詞,地名,一般,*,*,$read,$norm,*,A,*,*,*,*") } val ch = MemChannel() bldr.build(ch) @@ -215,10 +219,10 @@ class SystemDicTest { assertContentEquals(intArrayOf(i, surfArray.size), iter.next()) assertFalse { iter.hasNext() } - val wi = dic.lexicon.getWordInfo(i) - assertEquals(wi.surface, surf) - assertEquals(wi.readingForm, read) - assertEquals(wi.normalizedForm, norm) + val wi = dic.morpheme(i) + assertEquals(wi.surface(), surf) + assertEquals(wi.readingForm(), read) + assertEquals(wi.normalizedForm(), norm) } } } diff --git a/src/test/java/com/worksap/nlp/sudachi/dictionary/build/UserDicTest.kt 
b/src/test/java/com/worksap/nlp/sudachi/dictionary/build/UserDicTest.kt index d2918077..8be4a3ef 100644 --- a/src/test/java/com/worksap/nlp/sudachi/dictionary/build/UserDicTest.kt +++ b/src/test/java/com/worksap/nlp/sudachi/dictionary/build/UserDicTest.kt @@ -38,7 +38,7 @@ class TestDic { } fun system(data: String): TestDic { - val bldr = DicBuilder.system().matrix(matrixUrl).lexicon(data.byteInputStream()) + val bldr = DicBuilder.system().matrix(matrixUrl).lexicon(data) val ch = MemChannel() bldr.build(ch) this.systemDic = BinaryDictionary(ch.buffer()) @@ -46,7 +46,7 @@ class TestDic { } fun user(data: String): TestDic { - val bldr = DicBuilder.user(systemDic).lexicon(data.byteInputStream()) + val bldr = DicBuilder.user(systemDic).lexicon(data) val ch = MemChannel() bldr.build(ch) this.userDics.add(BinaryDictionary(ch.buffer())) @@ -72,10 +72,10 @@ class UserDicTest { .load() val da = dic as DictionaryAccess - val wi = da.lexicon.getWordInfo(WordId.make(1, 0)) - assertEquals(dic.partOfSpeechSize, 2) - assertEquals(wi.surface, "東京都") - assertEquals(wi.readingForm, "トウキョウト") + assertEquals(dic.partOfSpeechSize, 2) + val wi = da.morpheme(WordId.make(1, 0)) + assertEquals(wi.surface(), "東京都") + assertEquals(wi.readingForm(), "トウキョウト") } @Test @@ -133,11 +133,10 @@ class UserDicTest { .load() val da = dic as DictionaryAccess - val wi = da.lexicon.getWordInfo(WordId.make(1, 0)) + val wi = da.morpheme(WordId.make(1, 0)) assertEquals(dic.partOfSpeechSize, 3) - assertEquals(wi.surface, "東京都") - assertEquals(wi.posId, 2) - assertEquals(da.grammar.getPartOfSpeechString(2), POS("a", "b", "c", "d", "e", "f")) + assertEquals(wi.surface(), "東京都") + assertEquals(wi.partOfSpeech(), "a,b,c,d,e,f".pos) } @Test diff --git a/src/test/java/com/worksap/nlp/sudachi/morphemes.kt b/src/test/java/com/worksap/nlp/sudachi/morphemes.kt new file mode 100644 index 00000000..6535d18e --- /dev/null +++ b/src/test/java/com/worksap/nlp/sudachi/morphemes.kt @@ -0,0 +1,27 @@ +package 
com.worksap.nlp.sudachi + +import com.worksap.nlp.sudachi.dictionary.DictionaryAccess +import com.worksap.nlp.sudachi.dictionary.Lexicon +import com.worksap.nlp.sudachi.dictionary.POS +import com.worksap.nlp.sudachi.dictionary.WordInfo + + +fun DictionaryAccess.morpheme(id: Int): Morpheme { + val node = LatticeNodeImpl(lexicon, 0, id) + + val l = MorphemeList( + UTF8InputTextBuilder(node.baseSurface, grammar).build(), + grammar, + lexicon, + listOf(node), + false, + Tokenizer.SplitMode.A + ) + return l[0] +} + +val Morpheme.wordInfo: WordInfo + get() = (this as MorphemeImpl).wordInfo + +val String.pos: POS + get() = POS(this.split(",")) \ No newline at end of file From 31496e2efef820cf40cbf027b13138118f5bb969 Mon Sep 17 00:00:00 2001 From: mh-northlander Date: Tue, 16 Jul 2024 18:00:08 +0900 Subject: [PATCH 18/94] spotless apply --- .../com/worksap/nlp/sudachi/LatticeImpl.java | 2 +- .../worksap/nlp/sudachi/LatticeNodeImpl.java | 6 +- .../com/worksap/nlp/sudachi/MorphemeImpl.java | 4 +- .../com/worksap/nlp/sudachi/MorphemeList.java | 6 +- .../nlp/sudachi/PathRewritePlugin.java | 17 +- .../java/com/worksap/nlp/sudachi/WordId.java | 1 + .../sudachi/dictionary/CompactedStrings.java | 16 ++ .../nlp/sudachi/dictionary/Description.java | 36 ++-- .../sudachi/dictionary/DictionaryPrinter.java | 16 +- .../dictionary/DoubleArrayLexicon.java | 13 +- .../worksap/nlp/sudachi/dictionary/Ints.java | 5 +- .../nlp/sudachi/dictionary/Lexicon.java | 15 +- .../nlp/sudachi/dictionary/WordIdTable.java | 11 +- .../nlp/sudachi/dictionary/WordInfo.java | 15 +- .../sudachi/dictionary/WordParameters.java | 16 ++ .../sudachi/dictionary/build/BlockLayout.java | 2 +- .../sudachi/dictionary/build/DicBuilder.java | 110 +++++++---- .../sudachi/MeCabOovProviderPluginTest.java | 4 +- .../com/worksap/nlp/sudachi/TestLattice.java | 16 ++ .../nlp/sudachi/dictionary/DescriptionTest.kt | 60 +++--- .../dictionary/DoubleArrayLexiconTest.java | 173 +++++++----------- 
.../sudachi/dictionary/build/SystemDicTest.kt | 37 ++-- .../sudachi/dictionary/build/UserDicTest.kt | 5 +- .../java/com/worksap/nlp/sudachi/morphemes.kt | 42 +++-- 24 files changed, 358 insertions(+), 270 deletions(-) diff --git a/src/main/java/com/worksap/nlp/sudachi/LatticeImpl.java b/src/main/java/com/worksap/nlp/sudachi/LatticeImpl.java index 6eda4c0c..13d5c786 100644 --- a/src/main/java/com/worksap/nlp/sudachi/LatticeImpl.java +++ b/src/main/java/com/worksap/nlp/sudachi/LatticeImpl.java @@ -97,7 +97,7 @@ public List getNodes(int begin, int end) { public LatticeNodeImpl getMinimumNode(int begin, int end) { ArrayList ends = endLists.get(end); LatticeNodeImpl result = null; - for (LatticeNodeImpl node: ends) { + for (LatticeNodeImpl node : ends) { if (node.begin == begin) { if (result == null || result.totalCost >= node.cost) { result = node; diff --git a/src/main/java/com/worksap/nlp/sudachi/LatticeNodeImpl.java b/src/main/java/com/worksap/nlp/sudachi/LatticeNodeImpl.java index c06b40ff..e4217d07 100644 --- a/src/main/java/com/worksap/nlp/sudachi/LatticeNodeImpl.java +++ b/src/main/java/com/worksap/nlp/sudachi/LatticeNodeImpl.java @@ -70,7 +70,7 @@ private Lexicon lexicon() { if (lexicon instanceof Lexicon) { return (Lexicon) lexicon; } else if (lexicon instanceof StringsCache) { - return ((StringsCache)lexicon).lexicon; + return ((StringsCache) lexicon).lexicon; } else { throw new IllegalStateException("lexicon was null probably"); } @@ -349,8 +349,8 @@ public boolean equals(Object o) { if (o == null || getClass() != o.getClass()) return false; OOVFactory that = (OOVFactory) o; - return leftId == that.leftId && rightId == that.rightId && cost == that.cost && posId == that.posId && Objects.equals( - wordInfo, that.wordInfo); + return leftId == that.leftId && rightId == that.rightId && cost == that.cost && posId == that.posId + && Objects.equals(wordInfo, that.wordInfo); } @Override diff --git a/src/main/java/com/worksap/nlp/sudachi/MorphemeImpl.java 
b/src/main/java/com/worksap/nlp/sudachi/MorphemeImpl.java index fff1b40f..1cbf1849 100644 --- a/src/main/java/com/worksap/nlp/sudachi/MorphemeImpl.java +++ b/src/main/java/com/worksap/nlp/sudachi/MorphemeImpl.java @@ -26,7 +26,7 @@ class MorphemeImpl implements Morpheme { private final int index; private LatticeNodeImpl node; - /*internal*/ MorphemeImpl(MorphemeList list, int index) { + /* internal */ MorphemeImpl(MorphemeList list, int index) { this.list = list; this.index = index; } @@ -125,7 +125,7 @@ public String toString() { return sb.toString(); } - /*internal*/ boolean isCompatible(JapaneseDictionary dictionary) { + /* internal */ boolean isCompatible(JapaneseDictionary dictionary) { return dictionary.grammar == this.list.grammar; } } diff --git a/src/main/java/com/worksap/nlp/sudachi/MorphemeList.java b/src/main/java/com/worksap/nlp/sudachi/MorphemeList.java index 8fdd815c..590da60a 100644 --- a/src/main/java/com/worksap/nlp/sudachi/MorphemeList.java +++ b/src/main/java/com/worksap/nlp/sudachi/MorphemeList.java @@ -37,8 +37,8 @@ public class MorphemeList extends AbstractList { public final static MorphemeList EMPTY = new MorphemeList(null, null, null, Collections.emptyList(), true, Tokenizer.SplitMode.C); - MorphemeList(InputText input, Grammar grammar, Lexicon lexicon, List path, boolean allowEmptyMorpheme, - Tokenizer.SplitMode mode) { + MorphemeList(InputText input, Grammar grammar, Lexicon lexicon, List path, + boolean allowEmptyMorpheme, Tokenizer.SplitMode mode) { this.inputText = input; this.grammar = grammar; this.lexicon = lexicon; @@ -136,7 +136,7 @@ public int getInternalCost() { return p.get(p.size() - 1).getPathCost() - p.get(0).getPathCost(); } - /* internal*/ LatticeNodeImpl node(int index) { + /* internal */ LatticeNodeImpl node(int index) { return path.get(index); } } diff --git a/src/main/java/com/worksap/nlp/sudachi/PathRewritePlugin.java b/src/main/java/com/worksap/nlp/sudachi/PathRewritePlugin.java index 891fcfe2..161b2889 100644 --- 
a/src/main/java/com/worksap/nlp/sudachi/PathRewritePlugin.java +++ b/src/main/java/com/worksap/nlp/sudachi/PathRewritePlugin.java @@ -96,7 +96,8 @@ public void setUp(Grammar grammar) throws IOException { * length of the sequence, or {@code begin} equals or is greater * than {@code end} */ - public LatticeNode concatenate(List path, int begin, int end, Lattice lattice, String normalizedForm) { + public LatticeNode concatenate(List path, int begin, int end, Lattice lattice, + String normalizedForm) { if (begin >= end) { throw new IndexOutOfBoundsException("begin >= end"); } @@ -118,14 +119,9 @@ public LatticeNode concatenate(List path, int begin, int end, L } String s = surface.toString(); - LatticeNodeImpl node = LatticeNodeImpl.makeOov( - b, e, - posId, - s, - (normalizedForm == null) ? normalizedFormBuilder.toString() : normalizedForm, - dictionaryForm.toString(), - readingForm.toString() - ); + LatticeNodeImpl node = LatticeNodeImpl.makeOov(b, e, posId, s, + (normalizedForm == null) ? 
normalizedFormBuilder.toString() : normalizedForm, dictionaryForm.toString(), + readingForm.toString()); replaceNode(path, begin, end, node); return node; } @@ -153,7 +149,8 @@ public LatticeNode concatenate(List path, int begin, int end, L * length of the sequence, or {@code begin} equals or is greater * than {@code end} */ - public LatticeNode concatenateOov(List path, int begin, int end, LatticeNodeImpl.OOVFactory factory, Lattice lattice) { + public LatticeNode concatenateOov(List path, int begin, int end, + LatticeNodeImpl.OOVFactory factory, Lattice lattice) { if (begin >= end) { throw new IndexOutOfBoundsException("begin >= end"); } diff --git a/src/main/java/com/worksap/nlp/sudachi/WordId.java b/src/main/java/com/worksap/nlp/sudachi/WordId.java index e2d41543..4dc40164 100644 --- a/src/main/java/com/worksap/nlp/sudachi/WordId.java +++ b/src/main/java/com/worksap/nlp/sudachi/WordId.java @@ -97,6 +97,7 @@ public static boolean isOov(int wordId) { // low 16 bits are OOV POS, top 4 are 1s return (wordId & 0xffff_0000) == 0xf000_0000; } + public static boolean isSpecial(int wordId) { // top 5 bits should be filled return (wordId & 0xf800_0000) == 0xf800_0000; diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/CompactedStrings.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/CompactedStrings.java index e7cd70ce..ddcaea43 100644 --- a/src/main/java/com/worksap/nlp/sudachi/dictionary/CompactedStrings.java +++ b/src/main/java/com/worksap/nlp/sudachi/dictionary/CompactedStrings.java @@ -1,3 +1,19 @@ +/* + * Copyright (c) 2024 Works Applications Co., Ltd. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + package com.worksap.nlp.sudachi.dictionary; import java.nio.CharBuffer; diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/Description.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/Description.java index 100abbc2..947b3eb5 100644 --- a/src/main/java/com/worksap/nlp/sudachi/dictionary/Description.java +++ b/src/main/java/com/worksap/nlp/sudachi/dictionary/Description.java @@ -33,8 +33,8 @@ import java.util.Random; /** - * Description of the dictionary blocks, in-memory representation. - * Basically, an extended version of the dictionary header. + * Description of the dictionary blocks, in-memory representation. Basically, an + * extended version of the dictionary header. 
*/ public class Description { private Instant creationTime = Instant.now(); @@ -48,10 +48,15 @@ public class Description { /** * Return a slice of the full dictionary with the provided name - * @param full ByteBuffer which represents the whole dictionary loaded into memory - * @param part name of the required part + * + * @param full + * ByteBuffer which represents the whole dictionary loaded into + * memory + * @param part + * name of the required part * @return slice of the ByteBuffer - * @throws IllegalArgumentException if the part with the provided name was not found + * @throws IllegalArgumentException + * if the part with the provided name was not found */ public ByteBuffer slice(ByteBuffer full, String part) { ByteBuffer slice = sliceOrNull(full, part); @@ -63,15 +68,19 @@ public ByteBuffer slice(ByteBuffer full, String part) { /** * Return a slice of the full dictionary with the provided name - * @param full ByteBuffer which represents the whole dictionary loaded into memory - * @param part name of the required part + * + * @param full + * ByteBuffer which represents the whole dictionary loaded into + * memory + * @param part + * name of the required part * @return slice of the ByteBuffer or null if not found */ public ByteBuffer sliceOrNull(ByteBuffer full, String part) { - for (Block b: blocks) { + for (Block b : blocks) { if (b.name.equals(part)) { - int start = (int)b.start; - int end = (int)(b.start + b.size); + int start = (int) b.start; + int end = (int) (b.start + b.size); int position = full.position(); int limit = full.limit(); full.position(start); @@ -215,7 +224,8 @@ private static void checkLegacyDictionaryFormat(ByteBuffer raw) { private String defaultSignature(Instant date) { DateTimeFormatter formatter = DateTimeFormatter.ofPattern("yyyyMMddHHmmss", Locale.US); - return String.format("%s-%08x", formatter.format(LocalDateTime.ofInstant(date, ZoneId.systemDefault())), new Random().nextLong()); + return String.format("%s-%08x", 
formatter.format(LocalDateTime.ofInstant(date, ZoneId.systemDefault())), + new Random().nextLong()); } public Instant getCreationTime() { @@ -227,7 +237,9 @@ public void setCompilationTime(Instant creationTime) { } @Deprecated - public String getDescription() { return getComment(); } + public String getDescription() { + return getComment(); + } public String getComment() { return comment; diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/DictionaryPrinter.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/DictionaryPrinter.java index 172a8842..5d7fdc60 100644 --- a/src/main/java/com/worksap/nlp/sudachi/dictionary/DictionaryPrinter.java +++ b/src/main/java/com/worksap/nlp/sudachi/dictionary/DictionaryPrinter.java @@ -51,8 +51,10 @@ private DictionaryPrinter(PrintStream output, BinaryDictionary dic, BinaryDictio lex = dic.getLexicon(); - // in order to output dictionary entries in in-dictionary order we need to sort them - // iterator over them will get them not in the sorted order, but grouped by surface (and sorted in groups) + // in order to output dictionary entries in in-dictionary order we need to sort + // them + // iterator over them will get them not in the sorted order, but grouped by + // surface (and sorted in groups) Ints allIds = new Ints(lex.size()); Iterator ids = lex.wordIds(); while (ids.hasNext()) { @@ -144,8 +146,8 @@ private String maybeQuoteField(String value) { } private String maybeQuoteRefPart(String value) { - if (value.indexOf(',') != -1 || value.indexOf('"') != -1 || value.indexOf('-') != -1 || value.indexOf( - '/') != -1) { + if (value.indexOf(',') != -1 || value.indexOf('"') != -1 || value.indexOf('-') != -1 + || value.indexOf('/') != -1) { return fullEscape(value); } return value; @@ -187,8 +189,6 @@ static void printDictionary(String filename, BinaryDictionary systemDict, PrintS } } - - static char getUnitType(WordInfo info) { if (info.getAunitSplit().length == 0) { return 'A'; @@ -224,9 +224,9 @@ static String 
splitToString(int[] split) { * This tool requires the system dictionary when it dumps an user dictionary. * * @param args - * the option and the input filename + * the option and the input filename * @throws IOException - * if IO + * if IO */ public static void main(String[] args) throws IOException { BinaryDictionary systemDict = null; diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/DoubleArrayLexicon.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/DoubleArrayLexicon.java index 720c6d5e..35e37ce2 100644 --- a/src/main/java/com/worksap/nlp/sudachi/dictionary/DoubleArrayLexicon.java +++ b/src/main/java/com/worksap/nlp/sudachi/dictionary/DoubleArrayLexicon.java @@ -33,9 +33,8 @@ public class DoubleArrayLexicon implements Lexicon { private final WordIdTable wordIdTable; private final CompactedStrings strings; - - public DoubleArrayLexicon(Description description, WordIdTable wordIdTable, WordParameters wordParams, WordInfoList wordInfos, - DoubleArray trie, CompactedStrings strings) { + public DoubleArrayLexicon(Description description, WordIdTable wordIdTable, WordParameters wordParams, + WordInfoList wordInfos, DoubleArray trie, CompactedStrings strings) { this.description = description; this.wordIdTable = wordIdTable; this.parameters = wordParams; @@ -155,9 +154,11 @@ public Iterator wordIds() { /** * Returns true if the cost value is a normal value which can be used as is. - * Otherwise, it is a placeholder which needs to be recalculated - * based on the content of the dictionary. - * @param cost raw cost value + * Otherwise, it is a placeholder which needs to be recalculated based on the + * content of the dictionary. 
+ * + * @param cost + * raw cost value * @return true a normal cost value */ public static boolean isNormalCost(short cost) { diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/Ints.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/Ints.java index 9f498353..f441c46a 100644 --- a/src/main/java/com/worksap/nlp/sudachi/dictionary/Ints.java +++ b/src/main/java/com/worksap/nlp/sudachi/dictionary/Ints.java @@ -21,8 +21,9 @@ import java.util.StringJoiner; /** - * Internal class for dealing with resizable integer arrays without boxing or double indirection. - * This class is not a part of Sudachi API and can be changed. + * Internal class for dealing with resizable integer arrays without boxing or + * double indirection. This class is not a part of Sudachi API and can be + * changed. */ public class Ints { private int[] data; diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/Lexicon.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/Lexicon.java index 2a7bccd2..edc60c52 100644 --- a/src/main/java/com/worksap/nlp/sudachi/dictionary/Lexicon.java +++ b/src/main/java/com/worksap/nlp/sudachi/dictionary/Lexicon.java @@ -28,9 +28,11 @@ public interface Lexicon { Iterator lookup(byte[] text, int offset); /** - * Return packed parameters for the morpheme with the given id. - * Parameters are leftId, rightId, cost packed in a single long value. - * @param wordId id of word to extract parameters + * Return packed parameters for the morpheme with the given id. Parameters are + * leftId, rightId, cost packed in a single long value. 
+ * + * @param wordId + * id of word to extract parameters * @return long value of packed parameters */ long parameters(int wordId); @@ -57,8 +59,11 @@ public interface Lexicon { /** * Get the string with the given packed string pointer from the dictionary - * @param dic dictionary id - * @param stringPtr packed string pointer + * + * @param dic + * dictionary id + * @param stringPtr + * packed string pointer * @return String object value, copy of the in-memory representation * @see WordId#dic(int) */ diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/WordIdTable.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/WordIdTable.java index 3c2778cd..e166755c 100644 --- a/src/main/java/com/worksap/nlp/sudachi/dictionary/WordIdTable.java +++ b/src/main/java/com/worksap/nlp/sudachi/dictionary/WordIdTable.java @@ -78,16 +78,19 @@ void setDictionaryId(int id) { } /** - * Iterates over all valid word ids in the dictionary. - * Iteration order is not the same as the original dictionary order, but dictionary ids, when sorted, form the correct order. - *
- * The returned Ints object will be the same for each invocation of {@code next()}. + * Iterates over all valid word ids in the dictionary. Iteration order is not + * the same as the original dictionary order, but dictionary ids, when sorted, + * form the correct order.
+ * The returned Ints object will be the same for each invocation of + * {@code next()}. + * * @return iterator object */ public Iterator wordIds() { return new Iterator() { private final BufReader buf = new BufReader(bytes.duplicate()); private final Ints ints = new Ints(16); + @Override public boolean hasNext() { return buf.remaining() > 0; diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/WordInfo.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/WordInfo.java index c703bd3a..42beabf0 100644 --- a/src/main/java/com/worksap/nlp/sudachi/dictionary/WordInfo.java +++ b/src/main/java/com/worksap/nlp/sudachi/dictionary/WordInfo.java @@ -58,8 +58,8 @@ public WordInfo(short headwordLength, short posId, int surface, int reading, int } /** - * Allocates morpheme information for ones not in the lexicon. - * For example, OOVs. + * Allocates morpheme information for ones not in the lexicon. For example, + * OOVs. * * @param headwordLength * the length of the morpheme @@ -105,9 +105,8 @@ public short getLength() { } /** - * Returns the part-of-speech ID of the morpheme. - * The strings of part-of-speech name can be gotten with - * {@link Grammar#getPartOfSpeechString}. + * Returns the part-of-speech ID of the morpheme. The strings of part-of-speech + * name can be gotten with {@link Grammar#getPartOfSpeechString}. * * @return the POS ID */ @@ -135,9 +134,8 @@ public int getNormalizedForm() { } /** - * Returns the word ID of the dictionary form of the morpheme. - * The information of the dictionary form can be gotten with - * {@link Lexicon#getWordInfo} + * Returns the word ID of the dictionary form of the morpheme. 
The information + * of the dictionary form can be gotten with {@link Lexicon#getWordInfo} * * @return the word ID of the dictionary form of the morpheme */ @@ -215,7 +213,6 @@ public static int readingForm(ByteBuffer buffer, int pos) { return buffer.getInt(pos + 12); } - private WordInfo(ByteBuffer buffer, int pos) { // short leftId = buffer.getShort(pos); // short rightId = buffer.getShort(pos + 2); diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/WordParameters.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/WordParameters.java index 6025ca04..f092cd1e 100644 --- a/src/main/java/com/worksap/nlp/sudachi/dictionary/WordParameters.java +++ b/src/main/java/com/worksap/nlp/sudachi/dictionary/WordParameters.java @@ -1,3 +1,19 @@ +/* + * Copyright (c) 2024 Works Applications Co., Ltd. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + package com.worksap.nlp.sudachi.dictionary; import java.nio.ByteBuffer; diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/BlockLayout.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/BlockLayout.java index 07a79a6a..c754d52c 100644 --- a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/BlockLayout.java +++ b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/BlockLayout.java @@ -48,7 +48,7 @@ public T block(String name, BlockHandler handler) throws IOException { public List blocks() { List result = new ArrayList<>(); - for (BlockInfo b: info) { + for (BlockInfo b : info) { Description.Block published = new Description.Block(b.name, b.start, b.end - b.start); result.add(published); } diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/DicBuilder.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/DicBuilder.java index 2444f0c0..d5d00cb2 100644 --- a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/DicBuilder.java +++ b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/DicBuilder.java @@ -35,8 +35,8 @@ import static java.lang.System.nanoTime; /** - * Fluid API for building a binary dictionary from a CSV file. - * See documentation for the format of the CSV dictionary. + * Fluid API for building a binary dictionary from a CSV file. See documentation + * for the format of the CSV dictionary. */ public class DicBuilder { private DicBuilder() { @@ -58,11 +58,17 @@ private T self() { /** * Import words from the csv lexicon into the binary dictionary compiler. * - * @param name name of input file - * @param input factory for the InputStream with the lexicon content. May be called several times. - * @param size total size of the file in bytes. Used for reporting progress and can be not very precise. - * @return current object - * @throws IOException when IO fails + * @param name + * name of input file + * @param input + * factory for the InputStream with the lexicon content. 
May be + * called several times. + * @param size + * total size of the file in bytes. Used for reporting progress and + * can be not very precise. + * @return current object + * @throws IOException + * when IO fails */ public T lexicon(String name, IOSupplier input, long size) throws IOException { progress.startBlock(name, nanoTime(), Progress.Kind.INPUT); @@ -75,12 +81,15 @@ public T lexicon(String name, IOSupplier input, long size) throws I } /** - * Import words from the csv lexicon into the binary dictionary compiler. - * This method is for loading resources from classpath mostly, remote access is untested. + * Import words from the csv lexicon into the binary dictionary compiler. This + * method is for loading resources from classpath mostly, remote access is + * untested. * - * @param url pointing to the + * @param url + * pointing to the * @return current object - * @throws IOException when IO fails + * @throws IOException + * when IO fails * @see Class#getResource(String) * @see ClassLoader#getResource(String) */ @@ -94,9 +103,11 @@ public T lexicon(URL url) throws IOException { /** * Import words from the csv lexicon into the binary dictionary compiler. 
* - * @param path csv file + * @param path + * csv file * @return current object - * @throws IOException when IO fails + * @throws IOException + * when IO fails */ public T lexicon(Path path) throws IOException { String name = path.getFileName().toString(); @@ -106,7 +117,9 @@ public T lexicon(Path path) throws IOException { /** * Set the progress handler to the provided one - * @param progress handler + * + * @param progress + * handler * @return current object */ public T progress(Progress progress) { @@ -116,7 +129,9 @@ public T progress(Progress progress) { /** * Set the comment string in the binary dictionary - * @param comment provided string + * + * @param comment + * provided string * @return current object */ public T comment(String comment) { @@ -126,7 +141,9 @@ public T comment(String comment) { /** * Set the dictionary compilation time - * @param instant time to set + * + * @param instant + * time to set * @return current object */ public T compilationTime(Instant instant) { @@ -136,8 +153,11 @@ public T compilationTime(Instant instant) { /** * Compile the binary dictionary and write it to the proviced channel - * @param channel contents will be written here - * @throws IOException if io fails + * + * @param channel + * contents will be written here + * @throws IOException + * if io fails */ public void build(SeekableByteChannel channel) throws IOException { BlockLayout layout = new BlockLayout(channel, progress); @@ -165,9 +185,11 @@ private System readMatrix(String name, IOSupplier input, long size) } /** - * Set the system dictionary signature to the provided string. - * By default, it is current timestamp and a random 8 hexadecimal characters. - * @param signature provided dictionary signature. Can not be empty. + * Set the system dictionary signature to the provided string. By default, it is + * current timestamp and a random 8 hexadecimal characters. + * + * @param signature + * provided dictionary signature. Can not be empty. 
* @return current object */ public System signature(String signature) { @@ -183,7 +205,8 @@ public System signature(String signature) { } /** - * Typestate pattern for system dictionary that does not have connection matrix added yet + * Typestate pattern for system dictionary that does not have connection matrix + * added yet */ public static final class SystemNoMatrix { private final System inner; @@ -194,21 +217,32 @@ private SystemNoMatrix(DicBuilder.System inner) { /** * Read connection matrix from MeCab matrix.def format text file. - * @param name name of the file - * @param data factory for the InputStream which contains the file. This can be called more than once. - * @param size total number of bytes for the file. This information will be only used for calculating progress. + * + * @param name + * name of the file + * @param data + * factory for the InputStream which contains the file. This can be + * called more than once. + * @param size + * total number of bytes for the file. This information will be only + * used for calculating progress. * @return system dictionary builder - * @throws IOException if IO fails + * @throws IOException + * if IO fails */ public DicBuilder.System matrix(String name, IOSupplier data, long size) throws IOException { return inner.readMatrix(name, data, size); } /** - * Read connection matrix from MeCab matrix.def format text file. Classpath version. - * @param data name of the file + * Read connection matrix from MeCab matrix.def format text file. Classpath + * version. + * + * @param data + * name of the file * @return system dictionary builder - * @throws IOException if IO fails + * @throws IOException + * if IO fails */ public DicBuilder.System matrix(URL data) throws IOException { String name = data.getPath(); @@ -218,10 +252,14 @@ public DicBuilder.System matrix(URL data) throws IOException { } /** - * Read connection matrix from MeCab matrix.def format text file. Filesystem version. 
- * @param path path to matrix.def format file + * Read connection matrix from MeCab matrix.def format text file. Filesystem + * version. + * + * @param path + * path to matrix.def format file * @return system dictionary builder - * @throws IOException if IO fails + * @throws IOException + * if IO fails */ public DicBuilder.System matrix(Path path) throws IOException { String name = path.getFileName().toString(); @@ -239,6 +277,7 @@ private User(DictionaryAccess system) { /** * Create a new system dictionary compiler + * * @return new dictionary compiler object */ public static SystemNoMatrix system() { @@ -246,8 +285,11 @@ public static SystemNoMatrix system() { } /** - * Create a new user dictionary compiler which will reference the provided user dictionary. - * @param system referenced dictionary + * Create a new user dictionary compiler which will reference the provided user + * dictionary. + * + * @param system + * referenced dictionary * @return new dictionary compiler object */ public static User user(DictionaryAccess system) { diff --git a/src/test/java/com/worksap/nlp/sudachi/MeCabOovProviderPluginTest.java b/src/test/java/com/worksap/nlp/sudachi/MeCabOovProviderPluginTest.java index 092fcb69..b4eafb57 100644 --- a/src/test/java/com/worksap/nlp/sudachi/MeCabOovProviderPluginTest.java +++ b/src/test/java/com/worksap/nlp/sudachi/MeCabOovProviderPluginTest.java @@ -404,8 +404,8 @@ public void readOOV() throws IOException { plugin.readOOV(oovConfig, new MockGrammar(), "forbid"); assertThat(plugin.oovList.size(), is(1)); assertThat(plugin.oovList.get(CategoryType.DEFAULT).size(), is(2)); - assertThat(plugin.oovList.get(CategoryType.DEFAULT).get(0), is(LatticeNodeImpl.oovFactory((short) 1, (short) 2, - (short) 3, (short) 0))); + assertThat(plugin.oovList.get(CategoryType.DEFAULT).get(0), + is(LatticeNodeImpl.oovFactory((short) 1, (short) 2, (short) 3, (short) 0))); } @Test(expected = IllegalArgumentException.class) diff --git 
a/src/test/java/com/worksap/nlp/sudachi/TestLattice.java b/src/test/java/com/worksap/nlp/sudachi/TestLattice.java index 24a6bff3..98baf971 100644 --- a/src/test/java/com/worksap/nlp/sudachi/TestLattice.java +++ b/src/test/java/com/worksap/nlp/sudachi/TestLattice.java @@ -1,3 +1,19 @@ +/* + * Copyright (c) 2024 Works Applications Co., Ltd. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + package com.worksap.nlp.sudachi; public class TestLattice { diff --git a/src/test/java/com/worksap/nlp/sudachi/dictionary/DescriptionTest.kt b/src/test/java/com/worksap/nlp/sudachi/dictionary/DescriptionTest.kt index 4d8896fb..131fa907 100644 --- a/src/test/java/com/worksap/nlp/sudachi/dictionary/DescriptionTest.kt +++ b/src/test/java/com/worksap/nlp/sudachi/dictionary/DescriptionTest.kt @@ -1,3 +1,19 @@ +/* + * Copyright (c) 2024 Works Applications Co., Ltd. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + package com.worksap.nlp.sudachi.dictionary import com.worksap.nlp.sudachi.dictionary.build.InMemoryChannel @@ -5,25 +21,25 @@ import kotlin.test.Test import kotlin.test.assertEquals class DescriptionTest { - @Test - fun serialization() { - val d = Description() - d.blocks = listOf(Description.Block("test", 5, 15), Description.Block("test2", 30, 25)) - d.reference = "testref" - d.comment = "コメント" - val chan = InMemoryChannel(4096) - d.save(chan) - chan.position(0) - val d2 = Description.load(chan) - assertEquals(d.comment, d2.comment) - assertEquals(d.reference, d2.reference) - assertEquals(d.signature, d2.signature) - assertEquals(d.blocks.size, d2.blocks.size) - assertEquals(d.blocks[0].name, d2.blocks[0].name) - assertEquals(d.blocks[0].start, d2.blocks[0].start) - assertEquals(d.blocks[0].size, d2.blocks[0].size) - assertEquals(d.blocks[1].name, d2.blocks[1].name) - assertEquals(d.blocks[1].start, d2.blocks[1].start) - assertEquals(d.blocks[1].size, d2.blocks[1].size) - } -} \ No newline at end of file + @Test + fun serialization() { + val d = Description() + d.blocks = listOf(Description.Block("test", 5, 15), Description.Block("test2", 30, 25)) + d.reference = "testref" + d.comment = "コメント" + val chan = InMemoryChannel(4096) + d.save(chan) + chan.position(0) + val d2 = Description.load(chan) + assertEquals(d.comment, d2.comment) + assertEquals(d.reference, d2.reference) + assertEquals(d.signature, d2.signature) + assertEquals(d.blocks.size, d2.blocks.size) + assertEquals(d.blocks[0].name, d2.blocks[0].name) + assertEquals(d.blocks[0].start, d2.blocks[0].start) + assertEquals(d.blocks[0].size, d2.blocks[0].size) + assertEquals(d.blocks[1].name, d2.blocks[1].name) + assertEquals(d.blocks[1].start, d2.blocks[1].start) + assertEquals(d.blocks[1].size, d2.blocks[1].size) + } +} diff --git a/src/test/java/com/worksap/nlp/sudachi/dictionary/DoubleArrayLexiconTest.java b/src/test/java/com/worksap/nlp/sudachi/dictionary/DoubleArrayLexiconTest.java index 
aac3071b..1f4452ba 100644 --- a/src/test/java/com/worksap/nlp/sudachi/dictionary/DoubleArrayLexiconTest.java +++ b/src/test/java/com/worksap/nlp/sudachi/dictionary/DoubleArrayLexiconTest.java @@ -31,108 +31,73 @@ import org.junit.Test; public class DoubleArrayLexiconTest { -/* - static final int GRAMMAR_SIZE = 470; - - DoubleArrayLexicon lexicon; - - @Before - public void setUp() throws IOException { - ByteBuffer bytes = TestDictionary.INSTANCE.getSystemDictData().buffer(); - DictionaryHeader header = new DictionaryHeader(bytes, 0); - lexicon = new DoubleArrayLexicon(bytes, header.storageSize() + GRAMMAR_SIZE, true); - } - - @Test - public void lookup() { - List results = iteratorToList(lexicon.lookup("東京都".getBytes(StandardCharsets.UTF_8), 0)); - - assertEquals(3, results.size()); - assertArrayEquals(new int[] { 4, 3 }, results.get(0)); // 東 - assertArrayEquals(new int[] { 5, 6 }, results.get(1)); // 東京 - assertArrayEquals(new int[] { 6, 9 }, results.get(2)); // 東京都 - - results = iteratorToList(lexicon.lookup("東京都に".getBytes(StandardCharsets.UTF_8), 9)); - assertEquals(2, results.size()); - assertArrayEquals(new int[] { 1, 12 }, results.get(0)); // に(接続助詞) - assertArrayEquals(new int[] { 2, 12 }, results.get(1)); // に(格助詞) - - results = iteratorToList(lexicon.lookup("あれ".getBytes(StandardCharsets.UTF_8), 0)); - assertEquals(0, results.size()); - } - - @Test - public void parameters() { - // た - assertEquals(1, lexicon.getLeftId(0)); - assertEquals(1, lexicon.getRightId(0)); - assertEquals(8729, lexicon.getCost(0)); - - // 東京都 - assertEquals(6, lexicon.getLeftId(6)); - assertEquals(8, lexicon.getRightId(6)); - assertEquals(5320, lexicon.getCost(6)); - - // 都 - assertEquals(8, lexicon.getLeftId(9)); - assertEquals(8, lexicon.getRightId(9)); - assertEquals(2914, lexicon.getCost(9)); - } - - @Test - public void wordInfo() { - // た - WordInfo wi = lexicon.getWordInfo(0); - assertEquals("た", wi.getSurface()); - assertEquals(3, wi.getLength()); - assertEquals(0, 
wi.getPOSId()); - assertEquals("た", wi.getNormalizedForm()); - assertEquals(-1, wi.getDictionaryForm()); - assertEquals("た", wi.getDictionaryForm()); - assertEquals("タ", wi.getReadingForm()); - assertArrayEquals(new int[0], wi.getAunitSplit()); - assertArrayEquals(new int[0], wi.getBunitSplit()); - assertArrayEquals(new int[0], wi.getWordStructure()); - - // 行っ - wi = lexicon.getWordInfo(8); - assertEquals("行っ", wi.getSurface()); - assertEquals("行く", wi.getNormalizedForm()); - assertEquals(7, wi.getDictionaryForm()); - assertEquals("行く", wi.getDictionaryForm()); - - // 東京都 - wi = lexicon.getWordInfo(6); - assertEquals("東京都", wi.getSurface()); - assertArrayEquals(new int[] { 5, 9 }, wi.getAunitSplit()); - assertArrayEquals(new int[0], wi.getBunitSplit()); - assertArrayEquals(new int[] { 5, 9 }, wi.getWordStructure()); - assertArrayEquals(new int[0], wi.getSynonymGroupIds()); - } - - @Test - public void wordInfoWithLongWord() { - // 0123456789 * 30 - WordInfo wi = lexicon.getWordInfo(36); - assertEquals(300, wi.getSurface().length()); - assertEquals(300, wi.getLength()); - assertEquals(300, wi.getNormalizedForm().length()); - assertEquals(-1, wi.getDictionaryForm()); - assertEquals(300, wi.getDictionaryForm().length()); - assertEquals(570, wi.getReadingForm().length()); - } - - @Test - public void size() { - assertEquals(39, lexicon.size()); - } - - static List iteratorToList(Iterator iterator) { - List result = new ArrayList<>(); - while (iterator.hasNext()) { - result.add(iterator.next()); - } - return result; - } - */ + /* + * static final int GRAMMAR_SIZE = 470; + * + * DoubleArrayLexicon lexicon; + * + * @Before public void setUp() throws IOException { ByteBuffer bytes = + * TestDictionary.INSTANCE.getSystemDictData().buffer(); DictionaryHeader header + * = new DictionaryHeader(bytes, 0); lexicon = new DoubleArrayLexicon(bytes, + * header.storageSize() + GRAMMAR_SIZE, true); } + * + * @Test public void lookup() { List results = + * 
iteratorToList(lexicon.lookup("東京都".getBytes(StandardCharsets.UTF_8), 0)); + * + * assertEquals(3, results.size()); assertArrayEquals(new int[] { 4, 3 }, + * results.get(0)); // 東 assertArrayEquals(new int[] { 5, 6 }, results.get(1)); + * // 東京 assertArrayEquals(new int[] { 6, 9 }, results.get(2)); // 東京都 + * + * results = + * iteratorToList(lexicon.lookup("東京都に".getBytes(StandardCharsets.UTF_8), 9)); + * assertEquals(2, results.size()); assertArrayEquals(new int[] { 1, 12 }, + * results.get(0)); // に(接続助詞) assertArrayEquals(new int[] { 2, 12 }, + * results.get(1)); // に(格助詞) + * + * results = + * iteratorToList(lexicon.lookup("あれ".getBytes(StandardCharsets.UTF_8), 0)); + * assertEquals(0, results.size()); } + * + * @Test public void parameters() { // た assertEquals(1, lexicon.getLeftId(0)); + * assertEquals(1, lexicon.getRightId(0)); assertEquals(8729, + * lexicon.getCost(0)); + * + * // 東京都 assertEquals(6, lexicon.getLeftId(6)); assertEquals(8, + * lexicon.getRightId(6)); assertEquals(5320, lexicon.getCost(6)); + * + * // 都 assertEquals(8, lexicon.getLeftId(9)); assertEquals(8, + * lexicon.getRightId(9)); assertEquals(2914, lexicon.getCost(9)); } + * + * @Test public void wordInfo() { // た WordInfo wi = lexicon.getWordInfo(0); + * assertEquals("た", wi.getSurface()); assertEquals(3, wi.getLength()); + * assertEquals(0, wi.getPOSId()); assertEquals("た", wi.getNormalizedForm()); + * assertEquals(-1, wi.getDictionaryForm()); assertEquals("た", + * wi.getDictionaryForm()); assertEquals("タ", wi.getReadingForm()); + * assertArrayEquals(new int[0], wi.getAunitSplit()); assertArrayEquals(new + * int[0], wi.getBunitSplit()); assertArrayEquals(new int[0], + * wi.getWordStructure()); + * + * // 行っ wi = lexicon.getWordInfo(8); assertEquals("行っ", wi.getSurface()); + * assertEquals("行く", wi.getNormalizedForm()); assertEquals(7, + * wi.getDictionaryForm()); assertEquals("行く", wi.getDictionaryForm()); + * + * // 東京都 wi = lexicon.getWordInfo(6); assertEquals("東京都", 
wi.getSurface()); + * assertArrayEquals(new int[] { 5, 9 }, wi.getAunitSplit()); + * assertArrayEquals(new int[0], wi.getBunitSplit()); assertArrayEquals(new + * int[] { 5, 9 }, wi.getWordStructure()); assertArrayEquals(new int[0], + * wi.getSynonymGroupIds()); } + * + * @Test public void wordInfoWithLongWord() { // 0123456789 * 30 WordInfo wi = + * lexicon.getWordInfo(36); assertEquals(300, wi.getSurface().length()); + * assertEquals(300, wi.getLength()); assertEquals(300, + * wi.getNormalizedForm().length()); assertEquals(-1, wi.getDictionaryForm()); + * assertEquals(300, wi.getDictionaryForm().length()); assertEquals(570, + * wi.getReadingForm().length()); } + * + * @Test public void size() { assertEquals(39, lexicon.size()); } + * + * static List iteratorToList(Iterator iterator) { List result = + * new ArrayList<>(); while (iterator.hasNext()) { result.add(iterator.next()); + * } return result; } + */ } diff --git a/src/test/java/com/worksap/nlp/sudachi/dictionary/build/SystemDicTest.kt b/src/test/java/com/worksap/nlp/sudachi/dictionary/build/SystemDicTest.kt index 2650248c..a70fb39d 100644 --- a/src/test/java/com/worksap/nlp/sudachi/dictionary/build/SystemDicTest.kt +++ b/src/test/java/com/worksap/nlp/sudachi/dictionary/build/SystemDicTest.kt @@ -24,11 +24,11 @@ import com.worksap.nlp.sudachi.wordInfo import kotlin.test.* fun DicBuilder.System.lexicon(s: String): DicBuilder.System { - return this.lexicon("test", {s.byteInputStream()}, s.length.toLong()) + return this.lexicon("test", { s.byteInputStream() }, s.length.toLong()) } fun DicBuilder.User.lexicon(s: String): DicBuilder.User { - return this.lexicon("test", {s.byteInputStream()}, s.length.toLong()) + return this.lexicon("test", { s.byteInputStream() }, s.length.toLong()) } class SystemDicTest { @@ -49,9 +49,7 @@ class SystemDicTest { val bldr = DicBuilder.system().matrix(javaClass.getResource("test.matrix")) val data = MemChannel() repeat(10) { bldr.lexicon(javaClass.getResource("one.csv")) } - bldr - 
.lexicon("南,1,1,4675,南,名詞,普通名詞,一般,*,*,*,ミナミ,西,5,C,0/1,2/3,4/5,6/7") - .build(data) + bldr.lexicon("南,1,1,4675,南,名詞,普通名詞,一般,*,*,*,ミナミ,西,5,C,0/1,2/3,4/5,6/7").build(data) val dic = BinaryDictionary(data.buffer()) assertEquals(11, dic.lexicon.size()) assertEquals(POS("名詞", "普通名詞", "一般", "*", "*", "*"), dic.grammar.getPartOfSpeechString(0)) @@ -87,12 +85,8 @@ class SystemDicTest { @Test fun failMatrixSizeValidation() { val bldr = DicBuilder.system().matrix(res("test.matrix")) - assertFails { - bldr.lexicon("東,4,1,4675,東,名詞,普通名詞,一般,*,*,*,ヒガシ,東,*,A,*,*,*,*") - } - assertFails { - bldr.lexicon("東,1,4,4675,東,名詞,普通名詞,一般,*,*,*,ヒガシ,東,*,A,*,*,*,*") - } + assertFails { bldr.lexicon("東,4,1,4675,東,名詞,普通名詞,一般,*,*,*,ヒガシ,東,*,A,*,*,*,*") } + assertFails { bldr.lexicon("東,1,4,4675,東,名詞,普通名詞,一般,*,*,*,ヒガシ,東,*,A,*,*,*,*") } } @Test @@ -103,8 +97,7 @@ class SystemDicTest { .lexicon( """東京,1,1,2816,東京,名詞,固有名詞,地名,一般,*,*,トウキョウ,東京,*,A,*,*,*,* 東京都,2,2,5320,東京都,名詞,固有名詞,地名,一般,*,*,トウキョウト,東京都,*,B,0/2,*,0/2,* - 都,2,2,2914,都,名詞,普通名詞,一般,*,*,*,ト,都,*,A,*,*,*,*""" - .trimIndent()) + 都,2,2,2914,都,名詞,普通名詞,一般,*,*,*,ト,都,*,A,*,*,*,*""".trimIndent()) .build(data) val dic = BinaryDictionary(data.buffer()) assertEquals(3, dic.lexicon.size()) @@ -121,8 +114,7 @@ class SystemDicTest { .lexicon( """東京,1,1,2816,東京,名詞,固有名詞,地名,一般,*,*,トウキョウ,東京,*,A,*,*,*,* 東京都,2,2,5320,東京都,名詞,固有名詞,地名,一般,*,*,トウキョウト,東京都,*,B,"東京,名詞,固有名詞,地名,一般,*,*,トウキョウ/2",*,0/2,* - 都,2,2,2914,都,名詞,普通名詞,一般,*,*,*,ト,都,*,A,*,*,*,*""" - .trimIndent()) + 都,2,2,2914,都,名詞,普通名詞,一般,*,*,*,ト,都,*,A,*,*,*,*""".trimIndent()) .build(data) val dic = BinaryDictionary(data.buffer()) assertEquals(3, dic.lexicon.size()) @@ -139,8 +131,7 @@ class SystemDicTest { .lexicon( """東京,1,1,2816,東京,名詞,固有名詞,地名,一般,*,*,トウキョウ,東京,*,A,*,*,*,* 東京都,2,2,5320,東京都,名詞,固有名詞,地名,一般,*,*,トウキョウト,東京都,*,B,*,0/2,0/2,* - 都,2,2,2914,都,名詞,普通名詞,一般,*,*,*,ト,都,*,A,*,*,*,*""" - .trimIndent()) + 都,2,2,2914,都,名詞,普通名詞,一般,*,*,*,ト,都,*,A,*,*,*,*""".trimIndent()) .build(data) val dic = BinaryDictionary(data.buffer()) 
assertEquals(3, dic.lexicon.size()) @@ -157,8 +148,7 @@ class SystemDicTest { .lexicon( """東京,1,1,2816,東京,名詞,固有名詞,地名,一般,*,*,トウキョウ,東京,*,A,*,*,*,* 東京都,2,2,5320,東京都,名詞,固有名詞,地名,一般,*,*,トウキョウト,東京都,*,B,*,0/2,U0/U2,* - 都,2,2,2914,都,名詞,普通名詞,一般,*,*,*,ト,都,*,A,*,*,*,*""" - .trimIndent()) + 都,2,2,2914,都,名詞,普通名詞,一般,*,*,*,ト,都,*,A,*,*,*,*""".trimIndent()) .build(data) val dic = BinaryDictionary(data.buffer()) assertEquals(3, dic.lexicon.size()) @@ -177,8 +167,7 @@ class SystemDicTest { @Test fun failInvalidNumberOfInlineRefFields() { val bldr = DicBuilder.system().matrix(res("test.matrix")) - bldr.lexicon( - """東京都,2,2,5320,東京都,名詞,固有名詞,地名,一般,*,*,トウキョウト,東京都,*,B,*,"a,b,c,d,e",*,*""") + bldr.lexicon("""東京都,2,2,5320,東京都,名詞,固有名詞,地名,一般,*,*,トウキョウト,東京都,*,B,*,"a,b,c,d,e",*,*""") assertFails { bldr.build(MemChannel()) } } @@ -187,8 +176,7 @@ class SystemDicTest { val bldr = DicBuilder.system().matrix(res("test.matrix")) bldr.lexicon( """東京,1,1,2816,東京,名詞,固有名詞,地名,一般,*,*,トウキョウ,東京,*,A,*,*,*,* - 東京都,2,2,5320,東京都,名詞,固有名詞,地名,一般,*,*,トウキョウト,東京都,*,B,*,"東京,名詞,固有名詞,地名,一般,*,*,a",*,*""" - .trimMargin()) + 東京都,2,2,5320,東京都,名詞,固有名詞,地名,一般,*,*,トウキョウト,東京都,*,B,*,"東京,名詞,固有名詞,地名,一般,*,*,a",*,*""".trimMargin()) assertFails { bldr.build(MemChannel()) } } @@ -200,8 +188,7 @@ class SystemDicTest { val surf = "a".repeat(1024) + istr val read = "b".repeat(1024) + istr val norm = "c".repeat(1024) + istr - bldr.lexicon( - "$surf,1,1,2816,$surf,名詞,固有名詞,地名,一般,*,*,$read,$norm,*,A,*,*,*,*") + bldr.lexicon("$surf,1,1,2816,$surf,名詞,固有名詞,地名,一般,*,*,$read,$norm,*,A,*,*,*,*") } val ch = MemChannel() bldr.build(ch) diff --git a/src/test/java/com/worksap/nlp/sudachi/dictionary/build/UserDicTest.kt b/src/test/java/com/worksap/nlp/sudachi/dictionary/build/UserDicTest.kt index 8be4a3ef..8cb557c2 100644 --- a/src/test/java/com/worksap/nlp/sudachi/dictionary/build/UserDicTest.kt +++ b/src/test/java/com/worksap/nlp/sudachi/dictionary/build/UserDicTest.kt @@ -19,7 +19,6 @@ package com.worksap.nlp.sudachi.dictionary.build import 
com.worksap.nlp.sudachi.* import com.worksap.nlp.sudachi.dictionary.BinaryDictionary import com.worksap.nlp.sudachi.dictionary.DictionaryAccess -import com.worksap.nlp.sudachi.dictionary.POS import java.net.URL import kotlin.test.Test import kotlin.test.assertContentEquals @@ -72,8 +71,8 @@ class UserDicTest { .load() val da = dic as DictionaryAccess - assertEquals(dic.partOfSpeechSize, 2) - val wi = da.morpheme(WordId.make(1, 0)) + assertEquals(dic.partOfSpeechSize, 2) + val wi = da.morpheme(WordId.make(1, 0)) assertEquals(wi.surface(), "東京都") assertEquals(wi.readingForm(), "トウキョウト") } diff --git a/src/test/java/com/worksap/nlp/sudachi/morphemes.kt b/src/test/java/com/worksap/nlp/sudachi/morphemes.kt index 6535d18e..ac05f3c4 100644 --- a/src/test/java/com/worksap/nlp/sudachi/morphemes.kt +++ b/src/test/java/com/worksap/nlp/sudachi/morphemes.kt @@ -1,27 +1,41 @@ +/* + * Copyright (c) 2024 Works Applications Co., Ltd. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + package com.worksap.nlp.sudachi import com.worksap.nlp.sudachi.dictionary.DictionaryAccess -import com.worksap.nlp.sudachi.dictionary.Lexicon import com.worksap.nlp.sudachi.dictionary.POS import com.worksap.nlp.sudachi.dictionary.WordInfo - fun DictionaryAccess.morpheme(id: Int): Morpheme { - val node = LatticeNodeImpl(lexicon, 0, id) + val node = LatticeNodeImpl(lexicon, 0, id) - val l = MorphemeList( - UTF8InputTextBuilder(node.baseSurface, grammar).build(), - grammar, - lexicon, - listOf(node), - false, - Tokenizer.SplitMode.A - ) - return l[0] + val l = + MorphemeList( + UTF8InputTextBuilder(node.baseSurface, grammar).build(), + grammar, + lexicon, + listOf(node), + false, + Tokenizer.SplitMode.A) + return l[0] } val Morpheme.wordInfo: WordInfo - get() = (this as MorphemeImpl).wordInfo + get() = (this as MorphemeImpl).wordInfo val String.pos: POS - get() = POS(this.split(",")) \ No newline at end of file + get() = POS(this.split(",")) From e376920c443569ac266d8827e9da24d114accce3 Mon Sep 17 00:00:00 2001 From: mh-northlander Date: Wed, 17 Jul 2024 10:14:31 +0900 Subject: [PATCH 19/94] better build progress --- .../sudachi/dictionary/DictionaryBuilder.java | 11 ++++++----- .../sudachi/dictionary/build/BlockOutput.java | 2 +- .../nlp/sudachi/dictionary/build/DicBuilder.java | 16 ++++++++++++++-- .../sudachi/dictionary/build/ModelOutput.java | 4 ++-- .../nlp/sudachi/dictionary/build/Progress.java | 6 ++++-- 5 files changed, 27 insertions(+), 12 deletions(-) diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/DictionaryBuilder.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/DictionaryBuilder.java index fbfdda82..3613a46f 100644 --- a/src/main/java/com/worksap/nlp/sudachi/dictionary/DictionaryBuilder.java +++ b/src/main/java/com/worksap/nlp/sudachi/dictionary/DictionaryBuilder.java @@ -38,10 +38,11 @@ private DictionaryBuilder() { static void printUsage() { Console console = System.console(); - console.printf("usage: DictionaryBuilder -o 
file -m file [-d description] files...\n"); + console.printf("usage: DictionaryBuilder -o file -m file [-d description] [-s signature] files...\n"); console.printf("\t-o file\toutput to file\n"); console.printf("\t-m file\tmatrix file\n"); console.printf("\t-d description\tcomment\n"); + console.printf("\t-s signature\tsignature\n"); } /** @@ -93,8 +94,8 @@ public static void main(String[] args) throws IOException { List lexiconPaths = Arrays.asList(args).subList(i, args.length); - DicBuilder.System builder = DicBuilder.system().matrix(Paths.get(matrixPath)).comment(description) - .progress(new Progress(20, new StderrProgress())); + DicBuilder.System builder = DicBuilder.system().progress(new Progress(20, new StderrProgress())) + .matrix(Paths.get(matrixPath)).comment(description); if (signature != null) { builder.signature(signature); @@ -119,10 +120,10 @@ public void start(String name, Progress.Kind kind) { System.err.printf("%s\t", name); last = 0; switch (kind) { - case OUTPUT: + case BYTE: unit = "bytes"; break; - case INPUT: + case ENTRY: unit = "entries"; break; } diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/BlockOutput.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/BlockOutput.java index eb3c4e57..2fdd7fbd 100644 --- a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/BlockOutput.java +++ b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/BlockOutput.java @@ -41,7 +41,7 @@ public Progress getProgress() { public T measured(String name, IOFunction fun) throws IOException { Progress p = progress; long start = chan.position(); - p.startBlock(name, System.nanoTime(), Progress.Kind.OUTPUT); + p.startBlock(name, System.nanoTime(), Progress.Kind.BYTE); T result = fun.apply(p); long size = chan.position() - start; p.endBlock(size, System.nanoTime()); diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/DicBuilder.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/DicBuilder.java index 
d5d00cb2..904f1dac 100644 --- a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/DicBuilder.java +++ b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/DicBuilder.java @@ -71,7 +71,7 @@ private T self() { * when IO fails */ public T lexicon(String name, IOSupplier input, long size) throws IOException { - progress.startBlock(name, nanoTime(), Progress.Kind.INPUT); + progress.startBlock(name, nanoTime(), Progress.Kind.BYTE); try (InputStream is = input.get()) { InputStream stream = new TrackingInputStream(is); lexicon.read(name, stream, pos); @@ -175,7 +175,7 @@ public void build(SeekableByteChannel channel) throws IOException { public static final class System extends Base { private System readMatrix(String name, IOSupplier input, long size) throws IOException { - progress.startBlock(name, nanoTime(), Progress.Kind.INPUT); + progress.startBlock(name, nanoTime(), Progress.Kind.BYTE); try (InputStream is = input.get()) { InputStream stream = new ProgressInputStream(is, size, progress); connection.readEntries(stream); @@ -266,6 +266,18 @@ public DicBuilder.System matrix(Path path) throws IOException { long size = Files.size(path); return matrix(name, () -> Files.newInputStream(path), size); } + + /** + * Set the progress handler to the provided one + * + * @param progress + * handler + * @return current object + */ + public SystemNoMatrix progress(Progress progress) { + inner.progress(progress); + return this; + } } public static final class User extends Base { diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/ModelOutput.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/ModelOutput.java index 7c7e4dc5..1d215acd 100644 --- a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/ModelOutput.java +++ b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/ModelOutput.java @@ -113,7 +113,7 @@ public void withPart(String name, IORunnable inner) throws IOException { long pos = position(); long start = System.nanoTime(); if 
(progressor != null) { - progressor.startBlock(name, start, Progress.Kind.OUTPUT); + progressor.startBlock(name, start, Progress.Kind.BYTE); } inner.run(); long time = System.nanoTime() - start; @@ -127,7 +127,7 @@ public void withPart(String name, IORunnable inner) throws IOException { public void withSizedPart(String name, SizedRunnable inner) throws IOException { long start = System.nanoTime(); if (progressor != null) { - progressor.startBlock(name, start, Progress.Kind.OUTPUT); + progressor.startBlock(name, start, Progress.Kind.BYTE); } long size = inner.run(); long time = System.nanoTime() - start; diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/Progress.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/Progress.java index abeb91af..784d2719 100644 --- a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/Progress.java +++ b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/Progress.java @@ -23,6 +23,7 @@ public class Progress { private final int maxUpdates; private final Callback callback; private float currentProgress; + private long startTime; private long lastUpdate; public static final Progress NOOP = new Progress(1, progress -> { @@ -34,6 +35,7 @@ public Progress(int maxUpdates, Callback callback) { } public void startBlock(String name, long start, Kind kind) { + startTime = start; lastUpdate = start; callback.start(name, kind); currentProgress = step(); @@ -72,11 +74,11 @@ public void progress(long cur, long max) { } public void endBlock(long size, long time) { - callback.end(size, Duration.ofNanos(time)); + callback.end(size, Duration.ofNanos(time - startTime)); } public enum Kind { - INPUT, OUTPUT + BYTE, ENTRY } /** From 86306bad0e97a0cd788491aa49a328de63f2952e Mon Sep 17 00:00:00 2001 From: mh-northlander Date: Thu, 18 Jul 2024 09:32:45 +0900 Subject: [PATCH 20/94] load grammar --- .../nlp/sudachi/dictionary/BufReader.java | 26 +++++++++++++++++++ .../nlp/sudachi/dictionary/GrammarImpl.java | 24 
++++++++++++++++- 2 files changed, 49 insertions(+), 1 deletion(-) diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/BufReader.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/BufReader.java index edabfb4e..f55d2aba 100644 --- a/src/main/java/com/worksap/nlp/sudachi/dictionary/BufReader.java +++ b/src/main/java/com/worksap/nlp/sudachi/dictionary/BufReader.java @@ -30,6 +30,14 @@ long readLong() { return buffer.getLong(); } + short readShort() { + return buffer.getShort(); + } + + char readChar() { + return buffer.getChar(); + } + long readVarint64() { ByteBuffer b = buffer; int first = b.get() & 0xff; @@ -96,6 +104,24 @@ public int readVarint32() { return (int) l; } + public String readShortString() { + short length = readShort(); + + // remember buffer state + int originalLimit = buffer.limit(); + int stringLimit = buffer.position() + length * 2; + buffer.limit(stringLimit); + + // implementation: use the fact that CharBuffers are CharSequences + // and the fact that ByteBuffer can be used as CharBuffer + String result = buffer.asCharBuffer().toString(); + + // adjust state + buffer.position(stringLimit); + buffer.limit(originalLimit); + return result; + } + public String readUtf8String() { int length = readVarint32(); if (buffer.remaining() < length) { diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/GrammarImpl.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/GrammarImpl.java index 93af0a47..dc44cd06 100644 --- a/src/main/java/com/worksap/nlp/sudachi/dictionary/GrammarImpl.java +++ b/src/main/java/com/worksap/nlp/sudachi/dictionary/GrammarImpl.java @@ -70,9 +70,31 @@ public GrammarImpl() { originalPosSize = 0; } + public GrammarImpl(List posList, Connection matrix) { + bytes = ByteBuffer.allocate(0); + this.posList = posList; + this.matrix = matrix; + originalPosSize = (short) posList.size(); + } + public static GrammarImpl load(ByteBuffer binaryDic, Description header) { Connection matrix = 
Connection.fromByteBufferV1(header.slice(binaryDic, Blocks.CONNECTION_MATRIX)); - return new GrammarImpl(); + List posList = loadPosList(header.slice(binaryDic, Blocks.POS_TABLE)); + return new GrammarImpl(posList, matrix); + } + + private static List loadPosList(ByteBuffer bytes) { + BufReader reader = new BufReader(bytes); + int posSize = reader.readShort(); + List posList = new ArrayList<>(posSize); + for (int i = 0; i < posSize; i++) { + String[] pos = new String[POS_DEPTH]; + for (int j = 0; j < POS_DEPTH; j++) { + pos[j] = reader.readShortString(); + } + posList.add(new POS(pos)); + } + return posList; } public int storageSize() { From e3417cd2931c2bd81673abd303a41974fb6979bc Mon Sep 17 00:00:00 2001 From: mh-northlander Date: Thu, 18 Jul 2024 11:24:58 +0900 Subject: [PATCH 21/94] refactor: rename etc. --- .../nlp/sudachi/dictionary/Description.java | 8 +-- .../sudachi/dictionary/build/BufWriter.java | 50 ++++++++++++------- ...nneledBuffer.java => BufferedChannel.java} | 6 +-- .../nlp/sudachi/dictionary/build/Index.java | 2 +- .../sudachi/dictionary/build/POSTable.java | 2 +- .../sudachi/dictionary/build/RawLexicon.java | 2 +- .../dictionary/build/RawLexiconReader.java | 6 ++- .../dictionary/build/RawWordEntry.java | 6 +-- .../dictionary/build/WordEntryLayout.java | 6 +-- .../nlp/sudachi/dictionary/BufReaderTest.kt | 2 +- 10 files changed, 54 insertions(+), 36 deletions(-) rename src/main/java/com/worksap/nlp/sudachi/dictionary/build/{ChanneledBuffer.java => BufferedChannel.java} (95%) diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/Description.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/Description.java index 947b3eb5..ef513db5 100644 --- a/src/main/java/com/worksap/nlp/sudachi/dictionary/Description.java +++ b/src/main/java/com/worksap/nlp/sudachi/dictionary/Description.java @@ -179,15 +179,15 @@ public void save(SeekableByteChannel channel) throws IOException { writer.putLong(1); // version 
writer.putLong(creationTime.getEpochSecond()); writer.putLong(flags); - writer.putStringUtf8(comment); - writer.putStringUtf8(signature); - writer.putStringUtf8(reference); + writer.putUtf8String(comment); + writer.putUtf8String(signature); + writer.putUtf8String(reference); writer.putVarint32(numIndexedEntries); writer.putVarint32(numTotalEntries); int length = blocks.size(); writer.putVarint32(length); for (Block b : blocks) { - writer.putStringUtf8(b.name); + writer.putUtf8String(b.name); writer.putVarint64(b.start); writer.putVarint64(b.size); } diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/BufWriter.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/BufWriter.java index 117aef7a..e4082e6f 100644 --- a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/BufWriter.java +++ b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/BufWriter.java @@ -33,7 +33,28 @@ public BufWriter putByte(byte val) { return this; } - // Encode int as LEB128 + public BufWriter putShort(short val) { + buffer.putShort(val); + return this; + } + + public BufWriter putInt(int val) { + buffer.putInt(val); + return this; + } + + public BufWriter putLong(long x) { + buffer.putLong(x); + return this; + } + + /** + * Envode int as LEB128 + * + * @param val + * value to encode + * @return this + */ public BufWriter putVarint32(int val) { if ((val & ~0x7f) == 0) { putByte((byte) val); @@ -43,6 +64,13 @@ public BufWriter putVarint32(int val) { return this; } + /** + * Envode long as LEB128 + * + * @param val + * value to encode + * @return this + */ public BufWriter putVarint64(long val) { if ((val & ~0x7fL) == 0) { putByte((byte) val); @@ -61,16 +89,6 @@ private void putVarintSlow(long val) { putByte((byte) val); } - public BufWriter putShort(short val) { - buffer.putShort(val); - return this; - } - - public BufWriter putInt(int val) { - buffer.putInt(val); - return this; - } - public BufWriter putInts(Ints value, int length) { if (length <= 0) { return 
this; @@ -85,10 +103,10 @@ public BufWriter putInts(Ints value, int length) { } /** - * Put string which has length is shorter than Short.MAX_VALUE + * Encode string which has length is shorter than Short.MAX_VALUE * * @param s - * string to put in the buffer + * string to put in the buffer. Must be shorter than Short.MAX_VALUE. */ public void putShortString(String s) { int length = s.length(); @@ -99,14 +117,10 @@ public void putShortString(String s) { } } - public BufWriter putStringUtf8(String s) { + public BufWriter putUtf8String(String s) { byte[] bytes = s.getBytes(StandardCharsets.UTF_8); putVarint32(bytes.length); buffer.put(bytes); return this; } - - public void putLong(long x) { - buffer.putLong(x); - } } diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/ChanneledBuffer.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/BufferedChannel.java similarity index 95% rename from src/main/java/com/worksap/nlp/sudachi/dictionary/build/ChanneledBuffer.java rename to src/main/java/com/worksap/nlp/sudachi/dictionary/build/BufferedChannel.java index a51f5145..939cf839 100644 --- a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/ChanneledBuffer.java +++ b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/BufferedChannel.java @@ -22,19 +22,19 @@ import java.nio.CharBuffer; import java.nio.channels.WritableByteChannel; -public class ChanneledBuffer { +public class BufferedChannel { private final ByteBuffer buffer; private final WritableByteChannel channel; private int offset; - public ChanneledBuffer(WritableByteChannel channel, int size) { + public BufferedChannel(WritableByteChannel channel, int size) { this.channel = channel; this.buffer = ByteBuffer.allocate(size); buffer.order(ByteOrder.LITTLE_ENDIAN); } - public ChanneledBuffer(WritableByteChannel channel) { + public BufferedChannel(WritableByteChannel channel) { this(channel, 64 * 1024); } diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/Index.java 
b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/Index.java index 32bb85f6..28d6237d 100644 --- a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/Index.java +++ b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/Index.java @@ -104,7 +104,7 @@ private TrieData writeWordTable(BlockOutput out, List n int size = this.elements.size(); byte[][] keys = new byte[size][]; int[] values = new int[size]; - ChanneledBuffer buffer = new ChanneledBuffer(out.getChannel(), + BufferedChannel buffer = new BufferedChannel(out.getChannel(), Math.max((notIndexed.size() + 16) * 5, 64 * 1024)); out.measured("Word Id table", (p) -> { diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/POSTable.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/POSTable.java index a6b2c44c..3c6d2eed 100644 --- a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/POSTable.java +++ b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/POSTable.java @@ -78,7 +78,7 @@ public int ownedLength() { public Void compile(BlockOutput out) throws IOException { return out.measured("POS Table", (p) -> { - ChanneledBuffer cbuf = new ChanneledBuffer(out.getChannel()); + BufferedChannel cbuf = new BufferedChannel(out.getChannel()); cbuf.byteBuffer(2).putShort((short) table.size()); for (int i = 0; i < table.size(); ++i) { BufWriter writer = cbuf.writer(POS.MAX_BINARY_LENGTH); diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/RawLexicon.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/RawLexicon.java index 62801b34..d8e654f6 100644 --- a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/RawLexicon.java +++ b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/RawLexicon.java @@ -92,7 +92,7 @@ private Void writeEntries(POSTable pos, BlockOutput blockOutput) throws IOExcept List list = entries; Lookup2 lookup = new Lookup2(list); WordRef.Parser refParser = WordRef.parser(pos, !user, false); - ChanneledBuffer buf = new 
ChanneledBuffer(blockOutput.getChannel(), WordEntryLayout.MAX_LENGTH * 4); + BufferedChannel buf = new BufferedChannel(blockOutput.getChannel(), WordEntryLayout.MAX_LENGTH * 4); buf.position(INITIAL_OFFSET); WordEntryLayout layout = new WordEntryLayout(lookup, strings, refParser, buf); int size = list.size(); diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/RawLexiconReader.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/RawLexiconReader.java index de450165..fe40c7ee 100644 --- a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/RawLexiconReader.java +++ b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/RawLexiconReader.java @@ -55,11 +55,15 @@ public RawLexiconReader(CSVParser parser, POSTable pos, boolean user) throws IOE this.parser = parser; this.posTable = pos; resolveColumnLayout(); - refParser = WordRef.parser(pos, mapping == null || !user, mapping == null); + refParser = WordRef.parser(pos, isLegacyColumnLayout() || !user, isLegacyColumnLayout()); } private static final Pattern INTEGER_REGEX = Pattern.compile("^-?\\d+$"); + private boolean isLegacyColumnLayout() { + return mapping == null; + } + private void resolveColumnLayout() throws IOException { List record = parser.getNextRecord(); diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/RawWordEntry.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/RawWordEntry.java index 2ab7df12..742eb47c 100644 --- a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/RawWordEntry.java +++ b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/RawWordEntry.java @@ -49,11 +49,11 @@ private int countRefs(String data, String prev) { if (data == null || data.isEmpty() || "*".equals(data) || data.equals(prev)) { return 0; } - int nsplits = StringUtil.count(data, '/'); - if (nsplits >= CsvLexicon.ARRAY_MAX_LENGTH) { + int nsplits = StringUtil.count(data, '/') + 1; + if (nsplits > CsvLexicon.ARRAY_MAX_LENGTH) { throw new CsvFieldException("maximum 
number of splits were exceeded"); } - return nsplits + 1; + return nsplits; } /** diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/WordEntryLayout.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/WordEntryLayout.java index 6f3e4253..f2a39f68 100644 --- a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/WordEntryLayout.java +++ b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/WordEntryLayout.java @@ -28,7 +28,7 @@ public class WordEntryLayout { private final StringIndex index; private final WordRef.Parser wordRefParser; private final Lookup2 lookup; - private final ChanneledBuffer buffer; + private final BufferedChannel buffer; private final Ints aSplits = new Ints(16); private final Ints bSplits = new Ints(16); private final Ints cSplits = new Ints(16); @@ -36,10 +36,10 @@ public class WordEntryLayout { private final Ints synonymGroups = new Ints(16); public static final int MAX_LENGTH = 32 // basic size - + Byte.MAX_VALUE * 5 * 4 // splits and synonyms + + Byte.MAX_VALUE * 4 * 5 // splits and synonyms + (Short.MAX_VALUE + 1) * 2; // user data - public WordEntryLayout(Lookup2 resolver, StringIndex index, WordRef.Parser parser, ChanneledBuffer buffer) { + public WordEntryLayout(Lookup2 resolver, StringIndex index, WordRef.Parser parser, BufferedChannel buffer) { this.lookup = resolver; this.index = index; this.wordRefParser = parser; diff --git a/src/test/java/com/worksap/nlp/sudachi/dictionary/BufReaderTest.kt b/src/test/java/com/worksap/nlp/sudachi/dictionary/BufReaderTest.kt index 9e67daec..255c30c5 100644 --- a/src/test/java/com/worksap/nlp/sudachi/dictionary/BufReaderTest.kt +++ b/src/test/java/com/worksap/nlp/sudachi/dictionary/BufReaderTest.kt @@ -107,7 +107,7 @@ class BufReaderTest { @Test fun utf8String() { - val checkUtf8String = check({ w, x -> w.putStringUtf8(x) }, { it.readUtf8String() }) + val checkUtf8String = check({ w, x -> w.putUtf8String(x) }, { it.readUtf8String() }) checkUtf8String("") 
checkUtf8String("test") checkUtf8String("привет") From c549bc3b773618752530c590a5aa277b9ae2e4df Mon Sep 17 00:00:00 2001 From: mh-northlander Date: Thu, 18 Jul 2024 16:59:38 +0900 Subject: [PATCH 22/94] add constants for magic numbers --- .../nlp/sudachi/dictionary/WordInfoList.java | 18 ++++++++++++++---- .../nlp/sudachi/dictionary/WordParameters.java | 4 ++-- .../sudachi/dictionary/build/BlockLayout.java | 5 +++-- .../sudachi/dictionary/build/RawLexicon.java | 5 +++-- .../sudachi/dictionary/build/RawWordEntry.java | 5 +++-- .../dictionary/build/WordEntryLayout.java | 7 ++++--- 6 files changed, 29 insertions(+), 15 deletions(-) diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/WordInfoList.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/WordInfoList.java index 6d9b0826..0a24e62b 100644 --- a/src/main/java/com/worksap/nlp/sudachi/dictionary/WordInfoList.java +++ b/src/main/java/com/worksap/nlp/sudachi/dictionary/WordInfoList.java @@ -19,6 +19,17 @@ import java.nio.ByteBuffer; public class WordInfoList { + public static final int ALIGNMENT_BITS = 3; + public static final int OFFSET_ALIGNMENT = 1 << ALIGNMENT_BITS; + + public static int wordId2offset(int wordId) { + return wordId << ALIGNMENT_BITS; + } + + public static int offset2wordId(long offset) { + return (int) (offset >>> ALIGNMENT_BITS); + } + private final ByteBuffer bytes; WordInfoList(ByteBuffer bytes) { @@ -26,15 +37,14 @@ public class WordInfoList { } public WordInfo getWordInfo(int wordId) { - int position = wordId * 8; - return WordInfo.read(bytes, position); + return WordInfo.read(bytes, wordId2offset(wordId)); } public int surfacePtr(int wordId) { - return WordInfo.surfaceForm(bytes, wordId * 8); + return WordInfo.surfaceForm(bytes, wordId2offset(wordId)); } public int readingPtr(int wordId) { - return WordInfo.readingForm(bytes, wordId * 8); + return WordInfo.readingForm(bytes, wordId2offset(wordId)); } } diff --git 
a/src/main/java/com/worksap/nlp/sudachi/dictionary/WordParameters.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/WordParameters.java index f092cd1e..cfcbb6a9 100644 --- a/src/main/java/com/worksap/nlp/sudachi/dictionary/WordParameters.java +++ b/src/main/java/com/worksap/nlp/sudachi/dictionary/WordParameters.java @@ -27,12 +27,12 @@ private WordParameters(ByteBuffer data) { } public long loadParams(int wordId) { - int addr = wordId * 8; + int addr = WordInfoList.wordId2offset(wordId); return data.getLong(addr); } public void setCost(int wordId, short cost) { - int addr = wordId * 8 + 6; + int addr = WordInfoList.wordId2offset(wordId) + 6; data.putShort(addr, cost); } diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/BlockLayout.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/BlockLayout.java index c754d52c..69872b1d 100644 --- a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/BlockLayout.java +++ b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/BlockLayout.java @@ -25,6 +25,7 @@ import java.util.StringJoiner; public class BlockLayout { + private static final long BLOCK_SIZE = 4096; private final SeekableByteChannel channel; private final Progress progress; private final List info = new ArrayList<>(); @@ -32,7 +33,7 @@ public class BlockLayout { public BlockLayout(SeekableByteChannel channel, Progress progress) throws IOException { this.channel = channel; this.progress = progress; - channel.position(4096); + channel.position(BLOCK_SIZE); // keep first block for the description } public T block(String name, BlockHandler handler) throws IOException { @@ -40,7 +41,7 @@ public T block(String name, BlockHandler handler) throws IOException { long start = chan.position(); T result = handler.apply(new BlockOutput(chan, progress)); long end = chan.position(); - long newPosition = Align.align(end, 4096); + long newPosition = Align.align(end, BLOCK_SIZE); chan.position(newPosition); info.add(new BlockInfo(name, start, 
end)); return result; diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/RawLexicon.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/RawLexicon.java index d8e654f6..43aeace6 100644 --- a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/RawLexicon.java +++ b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/RawLexicon.java @@ -19,6 +19,7 @@ import com.worksap.nlp.sudachi.dictionary.Blocks; import com.worksap.nlp.sudachi.dictionary.CSVParser; import com.worksap.nlp.sudachi.dictionary.DoubleArrayLexicon; +import com.worksap.nlp.sudachi.dictionary.WordInfoList; import java.io.IOException; import java.io.InputStream; @@ -29,7 +30,7 @@ import java.util.List; public class RawLexicon { - private static final long MAX_OFFSET = Integer.MAX_VALUE * 8L; + private static final long MAX_OFFSET = (long) Integer.MAX_VALUE * WordInfoList.OFFSET_ALIGNMENT; private static final int INITIAL_OFFSET = 32; private final StringStorage strings = new StringStorage(); private final List entries = new ArrayList<>(); @@ -69,7 +70,7 @@ public void read(String name, Reader data, POSTable posTable) throws IOException } public static int pointer(long offset) { - return (int) (offset >>> 3); + return WordInfoList.offset2wordId(offset); } public void checkOffset(long offset) { diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/RawWordEntry.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/RawWordEntry.java index 742eb47c..41970ca4 100644 --- a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/RawWordEntry.java +++ b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/RawWordEntry.java @@ -19,6 +19,7 @@ import com.worksap.nlp.sudachi.StringUtil; import com.worksap.nlp.sudachi.dictionary.StringPtr; import com.worksap.nlp.sudachi.dictionary.WordInfo; +import com.worksap.nlp.sudachi.dictionary.WordInfoList; import java.util.List; import java.util.Objects; @@ -74,7 +75,7 @@ public int computeExpectedSize() { size += 2 + 
userData.length() * 2; } - size = Align.align(size, 8); + size = Align.align(size, WordInfoList.OFFSET_ALIGNMENT); return size; } @@ -140,7 +141,7 @@ public int addPhantomEntries(List list, Lookup2 lookup) { copy.mode = "A"; copy.posId = posId; RawWordEntry last = list.get(list.size() - 1); - copy.pointer = RawLexicon.pointer(last.pointer * 8L + last.computeExpectedSize()); + copy.pointer = RawLexicon.pointer(WordInfoList.wordId2offset(last.pointer) + last.computeExpectedSize()); list.add(copy); lookup.add(copy); return 1; diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/WordEntryLayout.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/WordEntryLayout.java index f2a39f68..eef68b8d 100644 --- a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/WordEntryLayout.java +++ b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/WordEntryLayout.java @@ -18,6 +18,7 @@ import com.worksap.nlp.sudachi.StringUtil; import com.worksap.nlp.sudachi.dictionary.Ints; +import com.worksap.nlp.sudachi.dictionary.WordInfoList; import java.io.IOException; import java.nio.ByteBuffer; @@ -36,8 +37,8 @@ public class WordEntryLayout { private final Ints synonymGroups = new Ints(16); public static final int MAX_LENGTH = 32 // basic size - + Byte.MAX_VALUE * 4 * 5 // splits and synonyms - + (Short.MAX_VALUE + 1) * 2; // user data + + Byte.MAX_VALUE * Integer.BYTES * 5 // splits and synonyms + + (Short.MAX_VALUE + 1) * Character.BYTES; // user data public WordEntryLayout(Lookup2 resolver, StringIndex index, WordRef.Parser parser, BufferedChannel buffer) { this.lookup = resolver; @@ -99,7 +100,7 @@ public int put(RawWordEntry entry) throws IOException { } } - int position = this.buffer.alignTo(8); + int position = this.buffer.alignTo(WordInfoList.OFFSET_ALIGNMENT); return RawLexicon.pointer(position); } From 93bca39f94f8d7509e69729f56cf39ca133717c3 Mon Sep 17 00:00:00 2001 From: mh-northlander Date: Fri, 19 Jul 2024 11:47:15 +0900 Subject: [PATCH 23/94] add 
docstrings --- .../nlp/sudachi/dictionary/Description.java | 1 + .../worksap/nlp/sudachi/dictionary/POS.java | 1 + .../nlp/sudachi/dictionary/WordInfo.java | 31 +++++++++- .../nlp/sudachi/dictionary/build/Align.java | 9 +++ .../dictionary/build/BlockHandler.java | 23 -------- .../sudachi/dictionary/build/BlockLayout.java | 26 +++++++++ .../sudachi/dictionary/build/BlockOutput.java | 16 ++++++ .../sudachi/dictionary/build/BufWriter.java | 10 +++- .../dictionary/build/BufferedChannel.java | 52 +++++++++++++++++ .../dictionary/build/ConnectionMatrix.java | 14 +++++ .../sudachi/dictionary/build/DicBuilder.java | 13 ++++- .../nlp/sudachi/dictionary/build/Index.java | 46 +++++++++++++-- .../nlp/sudachi/dictionary/build/Lookup2.java | 22 ++++++++ .../sudachi/dictionary/build/POSTable.java | 26 +++++++++ .../sudachi/dictionary/build/Progress.java | 37 +++++++++++- .../dictionary/build/ProgressInputStream.java | 5 ++ .../sudachi/dictionary/build/RawLexicon.java | 45 ++++++++++++++- .../dictionary/build/RawLexiconReader.java | 9 +++ .../dictionary/build/RawWordEntry.java | 19 ++++++- .../dictionary/build/ResizableBuffer.java | 5 ++ .../dictionary/build/StringStorage.java | 56 ++++++++++++++++++- .../sudachi/dictionary/build/Unescape.java | 3 + .../build/UnicodeBufferResizeable.java | 3 + .../dictionary/build/WordEntryLayout.java | 29 +++++++--- .../sudachi/dictionary/build/WordLayout.java | 14 +++-- .../nlp/sudachi/dictionary/build/WordRef.java | 14 ++++- 26 files changed, 477 insertions(+), 52 deletions(-) delete mode 100644 src/main/java/com/worksap/nlp/sudachi/dictionary/build/BlockHandler.java diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/Description.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/Description.java index ef513db5..5c3d76a1 100644 --- a/src/main/java/com/worksap/nlp/sudachi/dictionary/Description.java +++ b/src/main/java/com/worksap/nlp/sudachi/dictionary/Description.java @@ -192,6 +192,7 @@ public void save(SeekableByteChannel 
channel) throws IOException { writer.putVarint64(b.size); } + // write to the first block long pos = channel.position(); channel.position(0); buff.flip(); diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/POS.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/POS.java index 3ca8f32c..a891c4b4 100644 --- a/src/main/java/com/worksap/nlp/sudachi/dictionary/POS.java +++ b/src/main/java/com/worksap/nlp/sudachi/dictionary/POS.java @@ -29,6 +29,7 @@ public final class POS extends AbstractList { public final static int DEPTH = 6; public final static int MAX_COMPONENT_LENGTH = 127; + // each parts will be embedded in short-string format (length in Short + Chars). public static final int MAX_BINARY_LENGTH = DEPTH * (MAX_COMPONENT_LENGTH + 1) * 2; private final String[] elems; diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/WordInfo.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/WordInfo.java index 42beabf0..6b2b331d 100644 --- a/src/main/java/com/worksap/nlp/sudachi/dictionary/WordInfo.java +++ b/src/main/java/com/worksap/nlp/sudachi/dictionary/WordInfo.java @@ -201,14 +201,33 @@ public int[] getSynonymGroupIds() { return synonymGids; } + /** + * Returns the user data string. + * + * @return the user data string. + */ public String getUserData() { return userData; } + /** + * Read StringPtr to the surface form directly. + * + * @param buffer + * @param pos + * @return + */ public static int surfaceForm(ByteBuffer buffer, int pos) { return buffer.getInt(pos + 8); } + /** + * Read StringPtr to the reading form directly. + * + * @param buffer + * @param pos + * @return + */ public static int readingForm(ByteBuffer buffer, int pos) { return buffer.getInt(pos + 12); } @@ -258,19 +277,27 @@ private WordInfo(ByteBuffer buffer, int pos) { } if (wordStructureLen == 0xff) { wordStructure = aUnitSplit; - offset += wordStructureLen * 4; + offset += wordStructureLen * 4; // here? 
} else { wordStructure = Ints.readArray(buffer, offset, wordStructureLen); } synonymGids = Ints.readArray(buffer, offset, synonymLen); if (userDataFlag != 0) { - userData = StringUtil.readLengthPrefixed(buffer); + userData = StringUtil.readLengthPrefixed(buffer); // offset? } else { userData = ""; } } + /** + * read WordInfo from the buffer, staring from the provided position. + * + * @param buffer + * @param pos + * position to start reading. + * @return + */ public static WordInfo read(ByteBuffer buffer, int pos) { return new WordInfo(buffer, pos); } diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/Align.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/Align.java index 407bce59..fb2feae7 100644 --- a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/Align.java +++ b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/Align.java @@ -16,11 +16,20 @@ package com.worksap.nlp.sudachi.dictionary.build; +/** + * Utility class to align binary data boundary. + */ public class Align { private Align() { } + /** + * Check if given value is power of 2. + * + * @param value + * @return + */ public static boolean isPowerOf2(long value) { return (value & value - 1) == 0; } diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/BlockHandler.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/BlockHandler.java deleted file mode 100644 index 64b44aa1..00000000 --- a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/BlockHandler.java +++ /dev/null @@ -1,23 +0,0 @@ -/* - * Copyright (c) 2022 Works Applications Co., Ltd. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package com.worksap.nlp.sudachi.dictionary.build; - -import java.io.IOException; - -public interface BlockHandler { - T apply(BlockOutput output) throws IOException; -} diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/BlockLayout.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/BlockLayout.java index 69872b1d..eb790825 100644 --- a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/BlockLayout.java +++ b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/BlockLayout.java @@ -24,6 +24,10 @@ import java.util.List; import java.util.StringJoiner; +/** + * Output channel wrapper to write dictionary parts in block layout. Also + * provides access to the Progress. + */ public class BlockLayout { private static final long BLOCK_SIZE = 4096; private final SeekableByteChannel channel; @@ -36,6 +40,23 @@ public BlockLayout(SeekableByteChannel channel, Progress progress) throws IOExce channel.position(BLOCK_SIZE); // keep first block for the description } + /** Function that works with BlockOutput */ + public interface BlockHandler { + T apply(BlockOutput output) throws IOException; + } + + /** + * Let handler write data in block layout. + * + * @param + * return type of the handler. + * @param name + * the name for the block used as key in BlockInfo. + * @param handler + * handler that works on the channel and progress. + * @return result of the handler. 
+ * @throws IOException + */ public T block(String name, BlockHandler handler) throws IOException { SeekableByteChannel chan = channel; long start = chan.position(); @@ -47,6 +68,11 @@ public T block(String name, BlockHandler handler) throws IOException { return result; } + /** + * Returns the summary of block written. + * + * @return block information in the Description.Block format. + */ public List blocks() { List result = new ArrayList<>(); for (BlockInfo b : info) { diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/BlockOutput.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/BlockOutput.java index 2fdd7fbd..1c45b243 100644 --- a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/BlockOutput.java +++ b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/BlockOutput.java @@ -19,6 +19,9 @@ import java.io.IOException; import java.nio.channels.SeekableByteChannel; +/** + * Data class for BlockLayout.BlockHandler argument. + */ public class BlockOutput { private SeekableByteChannel chan; private Progress progress; @@ -38,6 +41,19 @@ public Progress getProgress() { return progress; } + /** + * Function decorator to measure output progress. + * + * @param + * return type of the fun + * @param name + * name for progress block. + * @param fun + * actual process to measure progress. Must take Progress as an only + * arg. 
+ * @return + * @throws IOException + */ public T measured(String name, IOFunction fun) throws IOException { Progress p = progress; long start = chan.position(); diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/BufWriter.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/BufWriter.java index e4082e6f..25db3cef 100644 --- a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/BufWriter.java +++ b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/BufWriter.java @@ -21,6 +21,7 @@ import java.nio.ByteBuffer; import java.nio.charset.StandardCharsets; +/** Basic data type writer */ public class BufWriter { private final ByteBuffer buffer; @@ -49,7 +50,7 @@ public BufWriter putLong(long x) { } /** - * Envode int as LEB128 + * Encode int as LEB128 * * @param val * value to encode @@ -117,6 +118,13 @@ public void putShortString(String s) { } } + /** + * Encode string in utf8 format, with length encoded in varint. + * + * @param s + * string to put in the buffer. + * @return this + */ public BufWriter putUtf8String(String s) { byte[] bytes = s.getBytes(StandardCharsets.UTF_8); putVarint32(bytes.length); diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/BufferedChannel.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/BufferedChannel.java index 939cf839..9f83e6eb 100644 --- a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/BufferedChannel.java +++ b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/BufferedChannel.java @@ -22,10 +22,12 @@ import java.nio.CharBuffer; import java.nio.channels.WritableByteChannel; +/** Output channel wrapper with buffer. */ public class BufferedChannel { private final ByteBuffer buffer; private final WritableByteChannel channel; + // data size written to the channel. 
private int offset; public BufferedChannel(WritableByteChannel channel, int size) { @@ -38,15 +40,38 @@ public BufferedChannel(WritableByteChannel channel) { this(channel, 64 * 1024); } + /** + * Put string to the buffer. + * + * @param data + * @throws IOException + */ public void put(String data) throws IOException { put(data, 0, data.length()); } + /** + * Put substring to the buffer. + * + * @param data + * @param start + * @param end + * @throws IOException + */ public void put(String data, int start, int end) throws IOException { CharBuffer chars = prepare(end - start); chars.put(data, start, end); } + /** + * Obtain CharBuffer of spcified char size, flushing if necessary. + * + * Moves buffer position assuming that caller will put requested size of chars. + * + * @param numChars + * @return + * @throws IOException + */ private CharBuffer prepare(int numChars) throws IOException { int remaining = buffer.remaining(); int byteLength = numChars * 2; @@ -64,6 +89,13 @@ private CharBuffer prepare(int numChars) throws IOException { return chars; } + /** + * Obtain ByteBuffer of specified size, flushing if necessary. + * + * @param maxLength + * @return + * @throws IOException + */ public ByteBuffer byteBuffer(int maxLength) throws IOException { ByteBuffer buf = buffer; int remaining = buf.remaining(); @@ -80,21 +112,40 @@ public ByteBuffer byteBuffer(int maxLength) throws IOException { return buf; } + /** + * Obtain BufWriter of specified size, flushing if necessary. + * + * @param maxLength + * @return + * @throws IOException + */ public BufWriter writer(int maxLength) throws IOException { ByteBuffer buf = byteBuffer(maxLength); return new BufWriter(buf); } + /** + * Flush internal buffer to the channel. + * + * @throws IOException + */ public void flush() throws IOException { buffer.flip(); channel.write(buffer); buffer.clear(); } + /** @return total size of data in the buffer + written to the channel. 
*/ public int offset() { return this.offset + buffer.position(); } + /** + * Align internal buffer position. + * + * @param alignment + * @return offset after alignment + */ public int alignTo(int alignment) { ByteBuffer buf = buffer; int pos = buf.position(); @@ -103,6 +154,7 @@ public int alignTo(int alignment) { return aligned + offset; } + /** Set position of internal buffer */ public void position(int newPosition) { buffer.position(newPosition); } diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/ConnectionMatrix.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/ConnectionMatrix.java index ec7a21a6..da9d6122 100644 --- a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/ConnectionMatrix.java +++ b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/ConnectionMatrix.java @@ -25,6 +25,9 @@ import java.nio.charset.StandardCharsets; import java.util.regex.Pattern; +/** + * Dictionary parts: left/right id connection cost matrix. + */ public class ConnectionMatrix implements WriteDictionary { private short numLeft; private short numRight; @@ -126,6 +129,7 @@ public long readEntries(InputStream data) throws IOException { return numLines; } + /** Clear this ConnectionMatrix */ public void makeEmpty() { ByteBuffer data = ByteBuffer.allocate(4); data.order(ByteOrder.LITTLE_ENDIAN); @@ -140,18 +144,28 @@ public void writeTo(ModelOutput output) throws IOException { output.write(compiled); } + /** @return number of left id */ public short getNumLeft() { return numLeft; } + /** @return number of right id */ public short getNumRight() { return numRight; } + /** @return if this is empty */ public boolean nonEmpty() { return numLeft > 0 || numRight > 0; } + /** + * Write connection matrix to the provided block output. 
+ * + * @param out + * @return + * @throws IOException + */ public Void compile(BlockOutput out) throws IOException { return out.measured("Connection Matrix", (p) -> { out.getChannel().write(compiled.duplicate()); diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/DicBuilder.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/DicBuilder.java index 904f1dac..89fca853 100644 --- a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/DicBuilder.java +++ b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/DicBuilder.java @@ -73,7 +73,7 @@ private T self() { public T lexicon(String name, IOSupplier input, long size) throws IOException { progress.startBlock(name, nanoTime(), Progress.Kind.BYTE); try (InputStream is = input.get()) { - InputStream stream = new TrackingInputStream(is); + InputStream stream = new ProgressInputStream(is, size, progress); lexicon.read(name, stream, pos); } progress.endBlock(size, nanoTime()); @@ -173,6 +173,11 @@ public void build(SeekableByteChannel channel) throws IOException { } } + /** + * System dictionary with connection matrix added. + * + * Instanciate via SystemNoMatrix. + */ public static final class System extends Base { private System readMatrix(String name, IOSupplier input, long size) throws IOException { progress.startBlock(name, nanoTime(), Progress.Kind.BYTE); @@ -280,6 +285,11 @@ public SystemNoMatrix progress(Progress progress) { } } + /** + * User dictionary. + * + * Requires system dictionary to load grammar from to initialize. + */ public static final class User extends Base { private User(DictionaryAccess system) { pos.preloadFrom(system.getGrammar()); @@ -308,6 +318,7 @@ public static User user(DictionaryAccess system) { return new User(system); } + /** entry point to test Base build with single lexicon (first arg). 
*/ public static void main(String[] args) throws IOException { Base b = new Base<>(); Path input = Paths.get(args[0]); diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/Index.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/Index.java index 28d6237d..09f7f16b 100644 --- a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/Index.java +++ b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/Index.java @@ -27,7 +27,14 @@ import java.util.*; /** - * Dictionary Parts: Trie index and entry offsets + * Dictionary Parts: Trie index and corresponding word id table. + * + * TRIE maps headwords to offset for WordIdTable. WordIdTable contains the list + * of word-ids of words which have the target headword. WordId here means offset + * in WordEntryTable (with last n bits dropped). + * + * WordIdTable also contins word-ids that are not indexed in TRIE, so that we + * can iterate over all word entries. */ public class Index implements WriteDictionary { private final SortedMap elements = new TreeMap<>((byte[] l, byte[] r) -> { @@ -43,6 +50,13 @@ public class Index implements WriteDictionary { private int count = 0; + /** + * Add a (headword, wordid) pair to the index + * + * @param key + * @param wordId + * @return + */ public int add(String key, int wordId) { byte[] bytes = key.getBytes(StandardCharsets.UTF_8); Ints entries = elements.computeIfAbsent(bytes, k -> new Ints(4)); @@ -95,6 +109,13 @@ public void writeTo(ModelOutput output) throws IOException { output.write(wordIdTable); } + /** + * Write word id table and trie to the provided block layout. 
+ * + * @param layout + * @param notIndexed + * @throws IOException + */ public void compile(BlockLayout layout, List notIndexed) throws IOException { TrieData data = layout.block(Blocks.WORD_POINTERS, (o) -> writeWordTable(o, notIndexed)); layout.block(Blocks.TRIE_INDEX, data::writeTrie); @@ -107,6 +128,9 @@ private TrieData writeWordTable(BlockOutput out, List n BufferedChannel buffer = new BufferedChannel(out.getChannel(), Math.max((notIndexed.size() + 16) * 5, 64 * 1024)); + int nis = notIndexed.size(); + int fullsize = size + nis; + out.measured("Word Id table", (p) -> { int i = 0; for (Map.Entry entry : this.elements.entrySet()) { @@ -124,10 +148,10 @@ private TrieData writeWordTable(BlockOutput out, List n buf.putVarint32(wid - prevWid); prevWid = wid; } - p.progress(i, size); + p.progress(i, fullsize); } - // write non-indexed entries for being able to iterate over all word entries - int nis = notIndexed.size(); + + // write non-indexed entries BufWriter buf = buffer.writer((nis + 1) * 5); buf.putVarint32(nis); int prevId = 0; @@ -135,17 +159,22 @@ private TrieData writeWordTable(BlockOutput out, List n int wid = e.pointer(); buf.putVarint32(wid - prevId); prevId = wid; + p.progress(++i, fullsize); } return null; }); - buffer.flush(); return new TrieData(keys, values); } + /** + * Subclass for trie construction. + */ private static class TrieData { + // headwords added to this index private final byte[][] keys; + // offsets to WordIdTable private final int[] values; public TrieData(byte[][] keys, int[] values) { @@ -153,6 +182,13 @@ public TrieData(byte[][] keys, int[] values) { this.values = values; } + /** + * Write trie to the provided block output. 
+ * + * @param block + * @return + * @throws IOException + */ public Void writeTrie(BlockOutput block) throws IOException { return block.measured("Trie Index", (p) -> { DoubleArray trie = new DoubleArray(); diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/Lookup2.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/Lookup2.java index 739209e7..9fa1532a 100644 --- a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/Lookup2.java +++ b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/Lookup2.java @@ -21,6 +21,9 @@ import java.util.List; import java.util.Map; +/** + * Utility to look up entries from the list. + */ public class Lookup2 { public interface Entry { int pointer(); @@ -41,16 +44,35 @@ public Lookup2(List entries) { } private final List entries; + // mapping to entries that have same surfaces private final Map> bySurface; + /** + * Lookup an entry by the list index. Make sure you know the order of entries in + * the list. + * + * @param index + * @return + */ public Entry byIndex(int index) { return entries.get(index); } + /** + * Lookup entries by the headword (surface). + * + * @param headword + * @return + */ public List byHeadword(String headword) { return bySurface.get(headword); } + /** + * Add an entry for headword search. + * + * @param e + */ public void add(Entry e) { bySurface.computeIfAbsent(e.headword(), x -> new ArrayList<>()).add(e); } diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/POSTable.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/POSTable.java index 3c6d2eed..0292e8f7 100644 --- a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/POSTable.java +++ b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/POSTable.java @@ -24,11 +24,20 @@ import java.util.HashMap; import java.util.List; +/** + * Dictionary parts: List of part-of-speeches. 
+ */ public class POSTable implements WriteDictionary { private final List table = new ArrayList<>(); private final HashMap lookup = new HashMap<>(); private int builtin = 0; + /** + * Returns the id of given POS, updating table if it's not in the list. + * + * @param s + * @return + */ short getId(POS s) { return lookup.computeIfAbsent(s, p -> { int next = table.size(); @@ -40,6 +49,12 @@ short getId(POS s) { }); } + /** + * Load pos table from the grammar (of the system dictionary). They are + * considered as built-in pos. + * + * @param grammar + */ public void preloadFrom(Grammar grammar) { int partOfSpeechSize = grammar.getPartOfSpeechSize(); for (short i = 0; i < partOfSpeechSize; ++i) { @@ -50,6 +65,7 @@ public void preloadFrom(Grammar grammar) { builtin += partOfSpeechSize; } + /** @return full POS list that contains builtin and newly added POSs */ List getList() { return table; } @@ -72,10 +88,20 @@ public void writeTo(ModelOutput output) throws IOException { }); } + /** + * @return number of non-builtin POSs. + */ public int ownedLength() { return table.size() - builtin; } + /** + * Write pos table to the provided block output. + * + * @param out + * @return + * @throws IOException + */ public Void compile(BlockOutput out) throws IOException { return out.measured("POS Table", (p) -> { BufferedChannel cbuf = new BufferedChannel(out.getChannel()); diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/Progress.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/Progress.java index 784d2719..d7ef6ed4 100644 --- a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/Progress.java +++ b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/Progress.java @@ -18,14 +18,22 @@ import java.time.Duration; +/** + * Handles progress of each build process. + */ public class Progress { + // minimum time delta for callback.progress call private final static long MS_100 = 100_000_000L; // 100ms in nanos + // resolution of progress step. 
private final int maxUpdates; private final Callback callback; private float currentProgress; + // records the nano time of startBlock call private long startTime; + // records the nano time of last callback.progress call private long lastUpdate; + /** Progress with no-operation. */ public static final Progress NOOP = new Progress(1, progress -> { }); @@ -34,6 +42,16 @@ public Progress(int maxUpdates, Callback callback) { this.callback = callback; } + /** + * declare the start of progress block + * + * @param name + * name of this block + * @param start + * nano time when the process starts + * @param kind + * what kind of data will be processed. + */ public void startBlock(String name, long start, Kind kind) { startTime = start; lastUpdate = start; @@ -73,10 +91,21 @@ public void progress(long cur, long max) { } } + /** + * declare the end of progress block + * + * @param size + * actual size of processed data. + * @param time + * nano time when the process ends. + */ public void endBlock(long size, long time) { callback.end(size, Duration.ofNanos(time - startTime)); } + /** + * What kind of data will be processed. + */ public enum Kind { BYTE, ENTRY } @@ -86,7 +115,7 @@ public enum Kind { */ public interface Callback { /** - * This function will be called for each step at the beginning + * This function will be called at the beginning of each block. 
* * @param name * step name @@ -102,6 +131,12 @@ default void start(String name, Kind kind) { */ void progress(float progress); + /** + * This function will be called at the end of each block + * + * @param size + * @param time + */ default void end(long size, Duration time) { } } diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/ProgressInputStream.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/ProgressInputStream.java index 37087327..a31f0560 100644 --- a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/ProgressInputStream.java +++ b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/ProgressInputStream.java @@ -19,6 +19,11 @@ import java.io.IOException; import java.io.InputStream; +/** + * Input stream with progress. + * + * Reading from this automaticaly calls Progress.progress. + */ public class ProgressInputStream extends InputStream { private final InputStream inner; private long position = 0; diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/RawLexicon.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/RawLexicon.java index 43aeace6..cf4545d4 100644 --- a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/RawLexicon.java +++ b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/RawLexicon.java @@ -29,23 +29,47 @@ import java.util.ArrayList; import java.util.List; +/** + * Dictionary part: Lexicon loaded from csv files. + */ public class RawLexicon { + // word id must be integer size. + // However, current implementation (ByteBuffer) cannot handle offset larger than + // Integer.MAX_VALUE. 
private static final long MAX_OFFSET = (long) Integer.MAX_VALUE * WordInfoList.OFFSET_ALIGNMENT; + // first private static final int INITIAL_OFFSET = 32; private final StringStorage strings = new StringStorage(); private final List entries = new ArrayList<>(); private final List notIndexed = new ArrayList<>(); private final Index index = new Index(); - private boolean user; + private boolean user = false; + // offset for next entry private long offset = INITIAL_OFFSET; private boolean runtimeCosts = false; + /** + * Read lexicon from InputStream. + * + * @param name + * @param data + * @param posTable + * @throws IOException + */ public void read(String name, InputStream data, POSTable posTable) throws IOException { read(name, new InputStreamReader(data, StandardCharsets.UTF_8), posTable); } + /** + * Read lexicon from Reader. + * + * @param name + * @param data + * @param posTable + * @throws IOException + */ public void read(String name, Reader data, POSTable posTable) throws IOException { CSVParser parser = new CSVParser(data); parser.setName(name); @@ -69,10 +93,17 @@ public void read(String name, Reader data, POSTable posTable) throws IOException this.offset = offset; } + /** + * Convert offset to pointer (word id) + * + * @param offset + * @return + */ public static int pointer(long offset) { return WordInfoList.offset2wordId(offset); } + /** check if the current offset is valid */ public void checkOffset(long offset) { if ((offset & 0x7) != 0) { throw new IllegalArgumentException("offset is not aligned, should not happen"); @@ -82,8 +113,16 @@ public void checkOffset(long offset) { } } + /** + * Write lexicon to the provided block layout. + * + * @param pos + * @param layout + * @throws IOException + */ public void compile(POSTable pos, BlockLayout layout) throws IOException { index.compile(layout, notIndexed); + // entry layout requires stringstorage to be compiled beforehand. 
layout.block(Blocks.STRINGS, this::writeStrings); layout.block(Blocks.ENTRIES, (p) -> writeEntries(pos, p)); } @@ -103,6 +142,7 @@ private Void writeEntries(POSTable pos, BlockOutput blockOutput) throws IOExcept if (e.pointer != ptr) { throw new IllegalStateException("expected entry pointer != actual pointer, i=" + i); } + // size may increases with phantom entry size += e.addPhantomEntries(list, lookup); ptr = layout.put(e); p.progress(i, size); @@ -120,14 +160,17 @@ private Void writeStrings(BlockOutput blockOutput) throws IOException { }); } + /** @return number of entries in the TRIE index */ public int getIndexedEntries() { return this.entries.size() - this.notIndexed.size(); } + /** @return number of all entries including non-indexed ones */ public int getTotalEntries() { return this.entries.size(); } + /** @return if lexicon has entries that need runtime cost caluculation */ public boolean hasRuntimeCosts() { return this.runtimeCosts; } diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/RawLexiconReader.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/RawLexiconReader.java index fe40c7ee..8ef3c705 100644 --- a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/RawLexiconReader.java +++ b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/RawLexiconReader.java @@ -26,6 +26,9 @@ import java.util.StringJoiner; import java.util.regex.Pattern; +/** + * Reader for the lexicon csv file. + */ public class RawLexiconReader { /** @@ -60,10 +63,12 @@ public RawLexiconReader(CSVParser parser, POSTable pos, boolean user) throws IOE private static final Pattern INTEGER_REGEX = Pattern.compile("^-?\\d+$"); + /** assume legacy column layout if header line is not present */ private boolean isLegacyColumnLayout() { return mapping == null; } + /** resolve header line and set to mapping if it exists. 
*/ private void resolveColumnLayout() throws IOException { List record = parser.getNextRecord(); @@ -101,6 +106,7 @@ private void resolveColumnLayout() throws IOException { this.mapping = mapping; } + /** parse specified column as string */ private String get(List data, Column column, boolean unescape) { int index = column.ordinal(); if (mapping != null) { @@ -122,6 +128,7 @@ private String get(List data, Column column, boolean unescape) { } } + /** parse specified column as short */ private short getShort(List data, Column column) { String value = get(data, column, false); try { @@ -132,6 +139,7 @@ private short getShort(List data, Column column) { } } + /** convert csv row to RawWordEntry */ private RawWordEntry convertEntry(List data) { RawWordEntry entry = new RawWordEntry(); entry.headword = get(data, Column.Surface, true); @@ -163,6 +171,7 @@ private RawWordEntry convertEntry(List data) { return entry; } + /** @return next entry parsed */ public RawWordEntry nextEntry() throws IOException { List record = cachedRecord; if (record != null) { diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/RawWordEntry.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/RawWordEntry.java index 41970ca4..45a3ed11 100644 --- a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/RawWordEntry.java +++ b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/RawWordEntry.java @@ -24,10 +24,13 @@ import java.util.List; import java.util.Objects; +/** + * Raw word info entry parsed from the lexicon csv. 
+ */ @SuppressWarnings("jol") public class RawWordEntry implements Lookup2.Entry { WordInfo wordInfo; - int pointer; + int pointer; // wordid, compressed offset of this entry in the lexicon.WordEntries String headword; String reading; WordRef normalizedForm; @@ -111,11 +114,18 @@ private void checkString(String value, String name) { } } + /** check if sudachi dictionary can handle this entry */ public void validate() { checkString(headword, "headword"); checkString(reading, "reading"); } + /** + * add necessary strings into the string storage. + * + * @param strings + * storage to publish strings. + */ public void publishStrings(StringStorage strings) { strings.add(headword); strings.add(reading); @@ -125,6 +135,13 @@ public void publishStrings(StringStorage strings) { } } + /** + * Add surface-only entry to access via normalized_form reference if necessary. + * + * @param list + * @param lookup + * @return 1 if phantom entry added, 0 otherwise + */ public int addPhantomEntries(List list, Lookup2 lookup) { if (normalizedForm instanceof WordRef.Headword) { WordRef.Headword ref = (WordRef.Headword) normalizedForm; diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/ResizableBuffer.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/ResizableBuffer.java index 38285dba..aa8f2e70 100644 --- a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/ResizableBuffer.java +++ b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/ResizableBuffer.java @@ -21,6 +21,7 @@ import java.nio.ByteOrder; import java.nio.channels.WritableByteChannel; +/** Resizable byte buffer wrapper. */ public class ResizableBuffer { private ByteBuffer buffer; @@ -30,6 +31,7 @@ public ResizableBuffer(int capacity) { buffer = buf; } + /** make sure the buffer has enough capacity for specified size. 
*/ public ByteBuffer prepare(int additional) { ByteBuffer buf = buffer; if (buf.remaining() >= additional) { @@ -39,6 +41,7 @@ public ByteBuffer prepare(int additional) { } } + /** make sure the buffer has enough capacity from offset for specified size. */ public ByteBuffer prepare(int offset, int size) { ByteBuffer buf = buffer; int capacity = buf.capacity(); @@ -63,10 +66,12 @@ private ByteBuffer grow(int additional) { return fresh; } + /** @return internal buffer */ public ByteBuffer getBuffer() { return buffer; } + /** write specified range of the buffer to the channel */ public void write(WritableByteChannel channel, int start, int end) throws IOException { ByteBuffer buf = buffer; int pos = buf.position(); diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/StringStorage.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/StringStorage.java index 7c1c50b7..ce7d97fa 100644 --- a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/StringStorage.java +++ b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/StringStorage.java @@ -29,19 +29,36 @@ import java.nio.file.StandardOpenOption; import java.util.*; +/** + * Dictionary parts: storage of strings used in the lexicons. + */ public class StringStorage implements StringIndex { + // strings required by lexicons private final HashMap strings = new HashMap<>(); private final HashMap candidates = new HashMap<>(); + // compacted strings layout private final WordLayout layout = new WordLayout(); + /** + * Add string to the storage. + * + * @param data + */ void add(String data) { strings.put(data, null); } + /** + * Compile added strings. should only call once after all strings are added and + * before use. 
+ * + * @param progress + */ void compile(Progress progress) { candidates.clear(); candidates.put("", new Item("", 0, 0)); List collect = new ArrayList<>(strings.keySet()); + // sort strings so that processing works correctly collect.sort(Comparator.comparingInt(String::length).reversed().thenComparing(String::compareTo)); int size = collect.size(); for (int i = 0; i < size; ++i) { @@ -54,9 +71,10 @@ void compile(Progress progress) { candidates.clear(); } + // layout string and returns Item private Item process(String str) { Item present = candidates.get(str); - if (present != null) { + if (present != null) { // this str is a substring of previous one. return present; } @@ -64,12 +82,13 @@ private Item process(String str) { int[] offsets = new int[length + 1]; int numOffsets = computeOffsets(str, offsets); - StringPtr ptr = layout.add(str, 0, length); + StringPtr ptr = layout.add(str); Item full = new Item(str, 0, length); full.root = full; full.ptr = ptr; candidates.put(str, full); + // handle substrings for (int i = 0; i < numOffsets; ++i) { int start = offsets[i]; for (int j = i + 1; j <= numOffsets; ++j) { @@ -77,7 +96,8 @@ private Item process(String str) { String sub = str.substring(start, end); // Create a possible substring only if // 1. It does not exist yet - // 2. Can form a valid pointer to it + // 2. Can form a valid pointer to it (string pointer requires aligned offset + // based on str length) if (!candidates.containsKey(sub) && ptr.isSubseqValid(start, end)) { Item item = new Item(str, start, end); item.root = full; @@ -89,6 +109,8 @@ private Item process(String str) { return full; } + // compute char offset for each codepoint in the str. + // @return number of code points. 
private int computeOffsets(String str, int[] offsets) { int count = 0; int len = str.length(); @@ -106,19 +128,33 @@ private int computeOffsets(String str, int[] offsets) { return count; } + /** @return StringPtr for the string */ public StringPtr resolve(String data) { Item item = strings.get(data); return item.root.ptr.subPtr(item.start, item.end); } + /** @return string hash map */ public HashMap getStrings() { return strings; } + /** + * Write compacted string storage to the provided channel + * + * @param channel + * @throws IOException + */ public void writeCompact(WritableByteChannel channel) throws IOException { layout.write(channel); } + /** + * legacy string compilation. only for comparison purpose. + * + * @param channel + * @throws IOException + */ public void writeLengthPrefixedCompact(SeekableByteChannel channel) throws IOException { DicBuffer buf = new DicBuffer(64 * 1024); for (Map.Entry item : strings.entrySet()) { @@ -132,11 +168,18 @@ public void writeLengthPrefixedCompact(SeekableByteChannel channel) throws IOExc buf.consume(channel::write); } + /** + * Data class of string and its pointer. + */ public static class Item { + // super-string that contains this string private final String data; + // substring range of this string in data private final int start; private final int end; + // root to get the pointer from private Item root; + // pointer to data in the storage private StringPtr ptr; public Item(String data, int start, int end) { @@ -162,6 +205,12 @@ public int getLength() { } } + /** + * Save strings in the lexicon csv (first arg) with legacy/compressed format + * with given name (second arg). + * + * Use this to compare output size of each format. 
+ */ public static void main(String[] args) throws IOException { StringStorage strings = new StringStorage(); try (BufferedReader reader = Files.newBufferedReader(Paths.get(args[0]))) { @@ -173,6 +222,7 @@ public static void main(String[] args) throws IOException { strings.add(record.get(11)); strings.add(record.get(12)); } + parser.close(); } strings.compile(null); diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/Unescape.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/Unescape.java index a120ad7e..1f157ac9 100644 --- a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/Unescape.java +++ b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/Unescape.java @@ -19,6 +19,9 @@ import java.util.regex.Matcher; import java.util.regex.Pattern; +/** + * Utility to un-escape string from csv. + */ public class Unescape { private static final Pattern unicodeLiteral = Pattern.compile("\\\\u(?:[0-9a-fA-F]{4}|\\{[0-9a-fA-F]+})"); diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/UnicodeBufferResizeable.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/UnicodeBufferResizeable.java index 6a44310c..cac2b388 100644 --- a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/UnicodeBufferResizeable.java +++ b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/UnicodeBufferResizeable.java @@ -23,6 +23,7 @@ import java.nio.channels.SeekableByteChannel; import java.nio.channels.WritableByteChannel; +/** Resizable byte buffer to store string */ public class UnicodeBufferResizeable { private ResizableBuffer buffer; @@ -34,6 +35,7 @@ public UnicodeBufferResizeable() { this(64 * 1024); } + /** put specified (char) range of the string to the buffer from offset */ public void put(int offset, String data, int start, int end) { CharBuffer chars = prepare(offset, end - start); chars.put(data, start, end); @@ -44,6 +46,7 @@ private CharBuffer prepare(int offset, int numChars) { return buf.asCharBuffer(); } + /** write 
specified (byte) range of the buffer to the channel */ public void write(WritableByteChannel channel, int start, int end) throws IOException { buffer.write(channel, start, end); } diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/WordEntryLayout.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/WordEntryLayout.java index eef68b8d..c6146cc9 100644 --- a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/WordEntryLayout.java +++ b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/WordEntryLayout.java @@ -25,6 +25,9 @@ import java.nio.ByteOrder; import java.util.List; +/** + * Output channel wrapper to write word entries. + */ public class WordEntryLayout { private final StringIndex index; private final WordRef.Parser wordRefParser; @@ -36,7 +39,7 @@ public class WordEntryLayout { private final Ints wordStructure = new Ints(16); private final Ints synonymGroups = new Ints(16); - public static final int MAX_LENGTH = 32 // basic size + public static final int MAX_LENGTH = 32 // minimum size + Byte.MAX_VALUE * Integer.BYTES * 5 // splits and synonyms + (Short.MAX_VALUE + 1) * Character.BYTES; // user data @@ -47,13 +50,22 @@ public WordEntryLayout(Lookup2 resolver, StringIndex index, WordRef.Parser parse this.buffer = buffer; } + /** + * Write word entry into output and returns next offset. 
+ * + * @param entry + * @return + * @throws IOException + */ public int put(RawWordEntry entry) throws IOException { BufWriter buf = this.buffer.writer(MAX_LENGTH); + buf.putShort(entry.leftId); buf.putShort(entry.rightId); buf.putShort(entry.cost); buf.putShort(entry.posId); - // 8 bytes + // 2*4 = 8 bytes + buf.putInt(index.resolve(entry.headword).encode()); // surfacePtr buf.putInt(index.resolve(entry.reading).encode()); // readingPtr int normFormPtr = 0; @@ -66,32 +78,31 @@ public int put(RawWordEntry entry) throws IOException { } buf.putInt(normFormPtr); // normalized entry buf.putInt(dicFormPtr); // dictionary form - // 8 + 16 = 24 bytes + // 8 + 4*4 = 24 bytes + // length can't be more than ~4k utf-16 code units so the cast is safe + short utf8Len = (short) StringUtil.countUtf8Bytes(entry.headword); byte cSplitLen = parseList(entry.cUnitSplitString, "", cSplits); byte bSplitLen = parseList(entry.bUnitSplitString, entry.cUnitSplitString, bSplits); byte aSplitLen = parseList(entry.aUnitSplitString, entry.bUnitSplitString, aSplits); byte wordStructureLen = parseList(entry.wordStructureString, entry.aUnitSplitString, wordStructure); byte synonymLen = parseIntList(entry.synonymGroups, synonymGroups); - - // length can't be more than ~4k utf-16 code units so the cast is safe - short utf8Len = (short) StringUtil.countUtf8Bytes(entry.headword); + int userDataLength = entry.userData.length(); buf.putShort(utf8Len); buf.putByte(cSplitLen); buf.putByte(bSplitLen); buf.putByte(aSplitLen); buf.putByte(wordStructureLen); buf.putByte(synonymLen); - int userDataLength = entry.userData.length(); buf.putByte(userDataLength == 0 ? 
(byte) 0 : (byte) 1); // 24 + 8 = 32 bytes + // putInts is no-op if length <= 0 buf.putInts(cSplits, cSplitLen); buf.putInts(bSplits, bSplitLen); buf.putInts(aSplits, aSplitLen); buf.putInts(wordStructure, wordStructureLen); buf.putInts(synonymGroups, synonymLen); - if (userDataLength != 0) { buf.putShort((short) userDataLength); String userData = entry.userData; @@ -104,6 +115,7 @@ public int put(RawWordEntry entry) throws IOException { return RawLexicon.pointer(position); } + /** parse int list, i.e. synonym group ids */ private byte parseIntList(String data, Ints result) { if (data == null || data.isEmpty() || "*".equals(data)) { result.clear(); @@ -120,6 +132,7 @@ private byte parseIntList(String data, Ints result) { return (byte) parts.length; } + /** parse word ref list, i.e. A/B/C split and word structure */ byte parseList(String data, String reference, Ints result) { if (data == null || data.isEmpty() || "*".equals(data)) { result.clear(); diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/WordLayout.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/WordLayout.java index 513fc92e..afcf312b 100644 --- a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/WordLayout.java +++ b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/WordLayout.java @@ -51,15 +51,12 @@ public class WordLayout { private int pointer; private int maxLength = -1; + /** Locates string and returns the pointer to that. */ public StringPtr add(String string) { - return add(string, 0, string.length()); - } - - public StringPtr add(String string, int start, int end) { int length = string.length(); int alignment = StringPtr.requiredAlignment(length); int offset = allocate(length, alignment); - buffer.put(offset, string, start, end); + buffer.put(offset, string, 0, string.length()); return StringPtr.checked(length, offset); } @@ -213,10 +210,17 @@ private int computeNewMaxLength(int index) { } } + /** + * Write layouted strings to the provided channel. 
+ * + * @param channel + * @throws IOException + */ public void write(WritableByteChannel channel) throws IOException { buffer.write(channel, 0, pointer * 2); } + /** Represents empty ranges where strings are not placed yet. */ public static class FreeSpace implements Comparable { int start; int length; diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/WordRef.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/WordRef.java index b8239118..8697bdf1 100644 --- a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/WordRef.java +++ b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/WordRef.java @@ -25,11 +25,14 @@ import java.util.regex.Pattern; /** - * Reference to a word in the CSV dictionary. + * Reference to a word in the lexicon csv. */ public abstract class WordRef { public abstract int resolve(Lookup2 resolver); + /** + * Reference written by line number of the lexicon csv file. + */ public static final class LineNo extends WordRef { private final int line; @@ -52,6 +55,9 @@ public String toString() { } } + /** + * Reference written by surface. + */ public static final class Headword extends WordRef { private final String headword; @@ -75,6 +81,9 @@ public String toString() { } } + /** + * Reference written by surface-pos-reading tuple. + */ public static final class Triple extends WordRef { private final String headword; private final short posId; @@ -117,10 +126,12 @@ public String toString() { private static final Pattern NUMERIC_RE = Pattern.compile("^U?\\d+$"); + /** Alias of WordRef.Parser constructor. */ public static Parser parser(POSTable posTable, boolean allowNumeric, boolean allowHeadword) { return new Parser(posTable, allowNumeric, allowHeadword); } + /** Parser to parse wordref from a string in the lexicon field. 
*/ public static class Parser { private final POSTable posTable; private final boolean allowNumeric; @@ -132,6 +143,7 @@ public Parser(POSTable posTable, boolean allowNumeric, boolean allowHeadword) { this.allowHeadword = allowHeadword; } + /** @return WordRef parsed from the text. */ public WordRef parse(String text) { if ("*".equals(text) || text == null || text.isEmpty()) { return null; From 42c620cf25a01d2bc9a6f71b3c46d2400ce4f266 Mon Sep 17 00:00:00 2001 From: mh-northlander Date: Fri, 19 Jul 2024 13:32:53 +0900 Subject: [PATCH 24/94] fix wordinfo parse (word structure ~) --- .../com/worksap/nlp/sudachi/StringUtil.java | 25 ++++++++++++++++--- .../nlp/sudachi/dictionary/WordInfo.java | 8 ++++-- 2 files changed, 28 insertions(+), 5 deletions(-) diff --git a/src/main/java/com/worksap/nlp/sudachi/StringUtil.java b/src/main/java/com/worksap/nlp/sudachi/StringUtil.java index da39bbf2..36e56530 100644 --- a/src/main/java/com/worksap/nlp/sudachi/StringUtil.java +++ b/src/main/java/com/worksap/nlp/sudachi/StringUtil.java @@ -96,16 +96,24 @@ public static int count(CharSequence sequence, int start, int end, char toFind) return count; } - public static String readLengthPrefixed(ByteBuffer buffer) { + /** + * Read string encoded by length in short + chars format + * (BufWriter.putShortString). 
+ * + * @param buffer + * @param offset + * @return string read + */ + public static String readLengthPrefixed(ByteBuffer buffer, int offset) { // implementation: use the fact that CharBuffers are CharSequences // and the fact that ByteBuffer can be used as CharBuffer // remember buffer state int limit = buffer.limit(); int position = buffer.position(); // read length - short length = buffer.getShort(position); + short length = buffer.getShort(offset); // compute new buffer state - int newPosition = position + 2; + int newPosition = offset + 2; buffer.position(newPosition); buffer.limit(newPosition + length * 2); // use CharBuffer API @@ -116,6 +124,17 @@ public static String readLengthPrefixed(ByteBuffer buffer) { return result; } + /** + * Read string encoded by length in short + chars format + * (BufWriter.putShortString). + * + * @param buffer + * @return string read + */ + public static String readLengthPrefixed(ByteBuffer buffer) { + return readLengthPrefixed(buffer, buffer.position()); + } + public static int countUtf8Bytes(CharSequence seq) { return countUtf8Bytes(seq, 0, seq.length()); } diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/WordInfo.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/WordInfo.java index 6b2b331d..29455095 100644 --- a/src/main/java/com/worksap/nlp/sudachi/dictionary/WordInfo.java +++ b/src/main/java/com/worksap/nlp/sudachi/dictionary/WordInfo.java @@ -238,10 +238,12 @@ private WordInfo(ByteBuffer buffer, int pos) { // short cost = buffer.getShort(pos + 4); // do not modify buffer metadata for better performance posId = buffer.getShort(pos + 6); + surface = surfaceForm(buffer, pos); // +8 reading = readingForm(buffer, pos); // +12 normalizedForm = buffer.getInt(pos + 16); dictionaryForm = buffer.getInt(pos + 20); + long rest = buffer.getLong(pos + 24); headwordLength = (short) (rest & 0xffff); rest >>>= 16; @@ -260,6 +262,7 @@ private WordInfo(ByteBuffer buffer, int pos) { int wordStructureLen = (int) ((rest >>> 
24) & 0xff); int synonymLen = (int) ((rest >>> 32) & 0xff); int userDataFlag = (int) ((rest >>> 40) & 0xff); + int offset = pos + 32; cUnitSplit = Ints.readArray(buffer, offset, cSplitLen); offset += cSplitLen * 4; @@ -277,14 +280,15 @@ private WordInfo(ByteBuffer buffer, int pos) { } if (wordStructureLen == 0xff) { wordStructure = aUnitSplit; - offset += wordStructureLen * 4; // here? } else { wordStructure = Ints.readArray(buffer, offset, wordStructureLen); + offset += wordStructureLen * 4; } synonymGids = Ints.readArray(buffer, offset, synonymLen); + offset += synonymLen * 4; if (userDataFlag != 0) { - userData = StringUtil.readLengthPrefixed(buffer); // offset? + userData = StringUtil.readLengthPrefixed(buffer, offset); } else { userData = ""; } From e1cdc0c7011d5197c88a75d1617675969e982754 Mon Sep 17 00:00:00 2001 From: mh-northlander Date: Mon, 22 Jul 2024 10:40:38 +0900 Subject: [PATCH 25/94] fix lexicon wordref parsing --- .../nlp/sudachi/dictionary/WordInfo.java | 1 + .../sudachi/dictionary/build/RawLexicon.java | 4 +-- .../dictionary/build/RawLexiconReader.java | 15 ++++++++--- .../dictionary/build/RawWordEntry.java | 1 - .../dictionary/build/WordEntryLayout.java | 26 +++++++++++++------ .../nlp/sudachi/dictionary/build/WordRef.java | 18 ++++++------- 6 files changed, 41 insertions(+), 24 deletions(-) diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/WordInfo.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/WordInfo.java index 29455095..05311a47 100644 --- a/src/main/java/com/worksap/nlp/sudachi/dictionary/WordInfo.java +++ b/src/main/java/com/worksap/nlp/sudachi/dictionary/WordInfo.java @@ -232,6 +232,7 @@ public static int readingForm(ByteBuffer buffer, int pos) { return buffer.getInt(pos + 12); } + // see dictionary.build.WordEntryLayout private WordInfo(ByteBuffer buffer, int pos) { // short leftId = buffer.getShort(pos); // short rightId = buffer.getShort(pos + 2); diff --git 
a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/RawLexicon.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/RawLexicon.java index cf4545d4..841bd1a6 100644 --- a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/RawLexicon.java +++ b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/RawLexicon.java @@ -37,7 +37,7 @@ public class RawLexicon { // However, current implementation (ByteBuffer) cannot handle offset larger than // Integer.MAX_VALUE. private static final long MAX_OFFSET = (long) Integer.MAX_VALUE * WordInfoList.OFFSET_ALIGNMENT; - // first + // put empty entry at the first private static final int INITIAL_OFFSET = 32; private final StringStorage strings = new StringStorage(); private final List entries = new ArrayList<>(); @@ -131,7 +131,7 @@ private Void writeEntries(POSTable pos, BlockOutput blockOutput) throws IOExcept return blockOutput.measured("Word Entries", (p) -> { List list = entries; Lookup2 lookup = new Lookup2(list); - WordRef.Parser refParser = WordRef.parser(pos, !user, false); + WordRef.Parser refParser = WordRef.parser(pos, !user, false, false); BufferedChannel buf = new BufferedChannel(blockOutput.getChannel(), WordEntryLayout.MAX_LENGTH * 4); buf.position(INITIAL_OFFSET); WordEntryLayout layout = new WordEntryLayout(lookup, strings, refParser, buf); diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/RawLexiconReader.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/RawLexiconReader.java index 8ef3c705..ee146b08 100644 --- a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/RawLexiconReader.java +++ b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/RawLexiconReader.java @@ -52,13 +52,20 @@ public enum Column { private int[] mapping; private final CSVParser parser; private final POSTable posTable; - private final WordRef.Parser refParser; + private final WordRef.Parser normRefParser; // for normalized form + private final WordRef.Parser dictRefParser; // for 
dictionary form public RawLexiconReader(CSVParser parser, POSTable pos, boolean user) throws IOException { this.parser = parser; this.posTable = pos; resolveColumnLayout(); - refParser = WordRef.parser(pos, isLegacyColumnLayout() || !user, isLegacyColumnLayout()); + if (isLegacyColumnLayout()) { + normRefParser = WordRef.parser(pos, false, true, false); + dictRefParser = WordRef.parser(pos, true, true, true); + } else { + normRefParser = WordRef.parser(pos, false, false, false); + dictRefParser = WordRef.parser(pos, !user, false, false); + } } private static final Pattern INTEGER_REGEX = Pattern.compile("^-?\\d+$"); @@ -148,8 +155,8 @@ private RawWordEntry convertEntry(List data) { entry.cost = getShort(data, Column.Cost); entry.reading = get(data, Column.ReadingForm, true); - entry.dictionaryForm = refParser.parse(get(data, Column.DictionaryForm, false)); - entry.normalizedForm = refParser.parse(get(data, Column.NormalizedForm, false)); + entry.normalizedForm = normRefParser.parse(get(data, Column.NormalizedForm, false)); + entry.dictionaryForm = dictRefParser.parse(get(data, Column.DictionaryForm, false)); POS pos = new POS( // comment for line break diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/RawWordEntry.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/RawWordEntry.java index 45a3ed11..16a74242 100644 --- a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/RawWordEntry.java +++ b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/RawWordEntry.java @@ -29,7 +29,6 @@ */ @SuppressWarnings("jol") public class RawWordEntry implements Lookup2.Entry { - WordInfo wordInfo; int pointer; // wordid, compressed offset of this entry in the lexicon.WordEntries String headword; String reading; diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/WordEntryLayout.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/WordEntryLayout.java index c6146cc9..19d02efe 100644 --- 
a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/WordEntryLayout.java +++ b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/WordEntryLayout.java @@ -68,11 +68,11 @@ public int put(RawWordEntry entry) throws IOException { buf.putInt(index.resolve(entry.headword).encode()); // surfacePtr buf.putInt(index.resolve(entry.reading).encode()); // readingPtr - int normFormPtr = 0; + int normFormPtr = entry.pointer; if (entry.normalizedForm != null) { normFormPtr = entry.normalizedForm.resolve(lookup); } - int dicFormPtr = 0; + int dicFormPtr = entry.pointer; if (entry.dictionaryForm != null) { dicFormPtr = entry.dictionaryForm.resolve(lookup); } @@ -82,10 +82,10 @@ public int put(RawWordEntry entry) throws IOException { // length can't be more than ~4k utf-16 code units so the cast is safe short utf8Len = (short) StringUtil.countUtf8Bytes(entry.headword); - byte cSplitLen = parseList(entry.cUnitSplitString, "", cSplits); - byte bSplitLen = parseList(entry.bUnitSplitString, entry.cUnitSplitString, bSplits); - byte aSplitLen = parseList(entry.aUnitSplitString, entry.bUnitSplitString, aSplits); - byte wordStructureLen = parseList(entry.wordStructureString, entry.aUnitSplitString, wordStructure); + byte cSplitLen = parseWordRefList(entry.cUnitSplitString, "", cSplits); + byte bSplitLen = parseWordRefList(entry.bUnitSplitString, entry.cUnitSplitString, bSplits); + byte aSplitLen = parseWordRefList(entry.aUnitSplitString, entry.bUnitSplitString, aSplits); + byte wordStructureLen = parseWordRefList(entry.wordStructureString, entry.aUnitSplitString, wordStructure); byte synonymLen = parseIntList(entry.synonymGroups, synonymGroups); int userDataLength = entry.userData.length(); buf.putShort(utf8Len); @@ -132,8 +132,17 @@ private byte parseIntList(String data, Ints result) { return (byte) parts.length; } - /** parse word ref list, i.e. A/B/C split and word structure */ - byte parseList(String data, String reference, Ints result) { + /** + * Parse word ref list, i.e. 
A/B/C split and word structure. + * + * If it is equivalent to the reference, return -1 without parsing. + * + * @param data + * @param reference + * @param result + * @return + */ + byte parseWordRefList(String data, String reference, Ints result) { if (data == null || data.isEmpty() || "*".equals(data)) { result.clear(); return 0; @@ -142,6 +151,7 @@ byte parseList(String data, String reference, Ints result) { result.clear(); return -1; } + String[] parts = data.split("/"); if (parts.length > Byte.MAX_VALUE) { throw new IllegalArgumentException("reference list contained more than 127 entries: " + data); diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/WordRef.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/WordRef.java index 8697bdf1..04d5289e 100644 --- a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/WordRef.java +++ b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/WordRef.java @@ -28,6 +28,7 @@ * Reference to a word in the lexicon csv. */ public abstract class WordRef { + /** resolve word ref into pointer (word id) using resolver. */ public abstract int resolve(Lookup2 resolver); /** @@ -127,8 +128,9 @@ public String toString() { private static final Pattern NUMERIC_RE = Pattern.compile("^U?\\d+$"); /** Alias of WordRef.Parser constructor. */ - public static Parser parser(POSTable posTable, boolean allowNumeric, boolean allowHeadword) { - return new Parser(posTable, allowNumeric, allowHeadword); + public static Parser parser(POSTable posTable, boolean allowNumeric, boolean allowHeadword, + boolean allowNullAsterisk) { + return new Parser(posTable, allowNumeric, allowHeadword, allowNullAsterisk); } /** Parser to parse wordref from a string in the lexicon field. 
*/ @@ -136,24 +138,22 @@ public static class Parser { private final POSTable posTable; private final boolean allowNumeric; private final boolean allowHeadword; + private final boolean allowNullAsterisk; - public Parser(POSTable posTable, boolean allowNumeric, boolean allowHeadword) { + public Parser(POSTable posTable, boolean allowNumeric, boolean allowHeadword, boolean allowNullAsterisk) { this.posTable = posTable; this.allowNumeric = allowNumeric; this.allowHeadword = allowHeadword; + this.allowNullAsterisk = allowNullAsterisk; } /** @return WordRef parsed from the text. */ public WordRef parse(String text) { - if ("*".equals(text) || text == null || text.isEmpty()) { + if (text == null || text.isEmpty() || (allowNullAsterisk && "*".equals(text))) { return null; } - if (NUMERIC_RE.matcher(text).matches()) { - if (!allowNumeric) { - throw new CsvFieldException( - String.format("invalid word reference: %s, numeric references are not supported", text)); - } + if (allowNumeric && NUMERIC_RE.matcher(text).matches()) { int offset = text.charAt(0) == 'U' ? 
1 : 0; int lineNum = Integer.parseInt(text.substring(offset)); return new LineNo(lineNum); From 9fa84a4497739055e7224dd21fc40b3ec1be58e4 Mon Sep 17 00:00:00 2001 From: mh-northlander Date: Tue, 23 Jul 2024 14:50:25 +0900 Subject: [PATCH 26/94] iterate over wordIds in the compiled dict --- .../sudachi/dictionary/DictionaryPrinter.java | 30 ++++++++++++++++++- .../dictionary/DoubleArrayLexicon.java | 2 +- .../worksap/nlp/sudachi/dictionary/Ints.java | 17 +++++++++-- .../nlp/sudachi/dictionary/Lexicon.java | 7 +++++ .../nlp/sudachi/dictionary/LexiconSet.java | 5 ++++ .../nlp/sudachi/dictionary/WordIdTable.java | 3 +- .../com/worksap/nlp/sudachi/ConfigTest.kt | 1 - 7 files changed, 59 insertions(+), 6 deletions(-) diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/DictionaryPrinter.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/DictionaryPrinter.java index 5d7fdc60..c58008f9 100644 --- a/src/main/java/com/worksap/nlp/sudachi/dictionary/DictionaryPrinter.java +++ b/src/main/java/com/worksap/nlp/sudachi/dictionary/DictionaryPrinter.java @@ -56,7 +56,7 @@ private DictionaryPrinter(PrintStream output, BinaryDictionary dic, BinaryDictio // iterator over them will get them not in the sorted order, but grouped by // surface (and sorted in groups) Ints allIds = new Ints(lex.size()); - Iterator ids = lex.wordIds(); + Iterator ids = lex.wordIds(0); while (ids.hasNext()) { allIds.appendAll(ids.next()); } @@ -64,6 +64,19 @@ private DictionaryPrinter(PrintStream output, BinaryDictionary dic, BinaryDictio wordIds = allIds; } + /** print information in the dictionary Description part */ + void printDescription(BinaryDictionary dic) { + PrintStream out = System.err; + Description desc = dic.getDictionaryHeader(); + out.printf("creation time: %s%n", desc.getCreationTime()); + out.printf("comment: %s%n", desc.getComment()); + out.printf("reference: %s%n", desc.getReference()); + for (Description.Block b : desc.getBlocks()) { + long start = b.getStart(); + 
out.printf("Block %s: %d - %d%n", b.getName(), start, start + b.getSize()); + } + } + void printHeader() { // @formatter:off printColumnHeaders(Column.Surface, Column.LeftId, Column.RightId, Column.Cost, Column.Pos1, Column.Pos2, @@ -74,7 +87,13 @@ void printHeader() { } void printColumnHeaders(Column... headers) { + boolean isFirst = true; for (Column c : headers) { + if (isFirst) { + isFirst = false; + } else { + output.print(","); + } output.print(c.name()); } output.println(); @@ -102,6 +121,14 @@ void printEntry(int wordId) { field(reading); entryPtr(info.getNormalizedForm(), ","); entryPtr(info.getDictionaryForm(), ","); + // TODO: + field(""); // mode + field(""); // C split + field(""); // B split + field(""); // A split + field(""); // Word structure + field(""); // sysnonym groups + field(""); // user data output.print("\n"); } @@ -184,6 +211,7 @@ private void printEntries() { static void printDictionary(String filename, BinaryDictionary systemDict, PrintStream output) throws IOException { try (BinaryDictionary dictionary = new BinaryDictionary(filename)) { DictionaryPrinter dp = new DictionaryPrinter(output, dictionary, systemDict); + dp.printDescription(dictionary); dp.printHeader(); dp.printEntries(); } diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/DoubleArrayLexicon.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/DoubleArrayLexicon.java index 35e37ce2..4f9731cf 100644 --- a/src/main/java/com/worksap/nlp/sudachi/dictionary/DoubleArrayLexicon.java +++ b/src/main/java/com/worksap/nlp/sudachi/dictionary/DoubleArrayLexicon.java @@ -148,7 +148,7 @@ public int size() { return (int) description.getNumTotalEntries(); } - public Iterator wordIds() { + public Iterator wordIds(int dic) { return wordIdTable.wordIds(); } diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/Ints.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/Ints.java index f441c46a..f4a82c9f 100644 --- 
a/src/main/java/com/worksap/nlp/sudachi/dictionary/Ints.java +++ b/src/main/java/com/worksap/nlp/sudachi/dictionary/Ints.java @@ -54,6 +54,10 @@ public int length() { return length; } + public int capacity() { + return data.length; + } + public void append(int value) { maybeResize(1); int idx = this.length; @@ -68,7 +72,7 @@ public void clear() { private int[] maybeResize(int additional) { int newSize = length + additional; int[] d = data; - if (newSize > d.length) { + if (newSize > capacity()) { d = Arrays.copyOf(data, Math.max(newSize, length * 2)); data = d; } @@ -114,8 +118,17 @@ public String toString() { return joiner.toString(); } + /** + * Make sure the internal buffer has enough capacity for the specified size. + * This also increases length and they should be filled using {@code set} or + * returned array. + * + * @return internal int array + */ public int[] prepare(int size) { - return maybeResize(length - size); + int[] d = maybeResize(size); + this.length += size; + return d; } public void appendAll(Ints other) { diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/Lexicon.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/Lexicon.java index edc60c52..a4a74fb7 100644 --- a/src/main/java/com/worksap/nlp/sudachi/dictionary/Lexicon.java +++ b/src/main/java/com/worksap/nlp/sudachi/dictionary/Lexicon.java @@ -69,5 +69,12 @@ public interface Lexicon { */ String string(int dic, int stringPtr); + /** @return WordInfoList of the specified dictionary */ WordInfoList wordInfos(int dic); + + /** + * Iterates over all word ids in the specified dictionary. Returned word ids are + * not sorted. 
+ */ + Iterator wordIds(int dic); } diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/LexiconSet.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/LexiconSet.java index c6a091f0..090fb709 100644 --- a/src/main/java/com/worksap/nlp/sudachi/dictionary/LexiconSet.java +++ b/src/main/java/com/worksap/nlp/sudachi/dictionary/LexiconSet.java @@ -166,4 +166,9 @@ public String string(int dic, int stringPtr) { public WordInfoList wordInfos(int dic) { return lexicons.get(dic).wordInfos(dic); } + + @Override + public Iterator wordIds(int dic) { + return lexicons.get(dic).wordIds(dic); + } } diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/WordIdTable.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/WordIdTable.java index e166755c..e4fa18a2 100644 --- a/src/main/java/com/worksap/nlp/sudachi/dictionary/WordIdTable.java +++ b/src/main/java/com/worksap/nlp/sudachi/dictionary/WordIdTable.java @@ -88,7 +88,7 @@ void setDictionaryId(int id) { */ public Iterator wordIds() { return new Iterator() { - private final BufReader buf = new BufReader(bytes.duplicate()); + private final BufReader buf = new BufReader((ByteBuffer) bytes.duplicate().position(0)); private final Ints ints = new Ints(16); @Override @@ -100,6 +100,7 @@ public boolean hasNext() { public Ints next() { BufReader r = buf; int size = r.readVarint32(); + ints.clear(); int[] data = ints.prepare(size); readDeltaCompressed(data, size, dicIdMask, r); return ints; diff --git a/src/test/java/com/worksap/nlp/sudachi/ConfigTest.kt b/src/test/java/com/worksap/nlp/sudachi/ConfigTest.kt index 10f5517e..df00b6e5 100644 --- a/src/test/java/com/worksap/nlp/sudachi/ConfigTest.kt +++ b/src/test/java/com/worksap/nlp/sudachi/ConfigTest.kt @@ -17,7 +17,6 @@ package com.worksap.nlp.sudachi import com.worksap.nlp.sudachi.Config.Resource -import com.worksap.nlp.sudachi.dictionary.build.res import java.io.FileNotFoundException import java.net.URL import java.nio.file.Path From 
aaf5c3716ba83828c7cc5afcf9f05ce7d01b7ce0 Mon Sep 17 00:00:00 2001 From: mh-northlander Date: Tue, 23 Jul 2024 14:52:32 +0900 Subject: [PATCH 27/94] build user dict (load system dict, resolve wordref, save user data only, progress) --- .../dictionary/UserDictionaryBuilder.java | 5 +- .../dictionary/build/CompiledWordEntry.java | 64 ++++++++++++++++++ .../sudachi/dictionary/build/DicBuilder.java | 67 +++++++++++++++---- .../nlp/sudachi/dictionary/build/Index.java | 2 +- .../nlp/sudachi/dictionary/build/Lookup2.java | 19 ++++-- .../sudachi/dictionary/build/POSTable.java | 8 +-- .../sudachi/dictionary/build/RawLexicon.java | 43 +++++++++++- .../nlp/sudachi/dictionary/build/WordRef.java | 13 ++-- .../com/worksap/nlp/sudachi/TestDictionary.kt | 4 +- .../sudachi/dictionary/build/UserDicTest.kt | 2 +- 10 files changed, 189 insertions(+), 38 deletions(-) create mode 100644 src/main/java/com/worksap/nlp/sudachi/dictionary/build/CompiledWordEntry.java diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/UserDictionaryBuilder.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/UserDictionaryBuilder.java index e1280163..83d0cc0b 100644 --- a/src/main/java/com/worksap/nlp/sudachi/dictionary/UserDictionaryBuilder.java +++ b/src/main/java/com/worksap/nlp/sudachi/dictionary/UserDictionaryBuilder.java @@ -88,8 +88,9 @@ public static void main(String[] args) throws IOException { List lexiconPaths = Arrays.asList(args).subList(i, args.length); try (BinaryDictionary system = new BinaryDictionary(sysDictPath)) { - DicBuilder.User builder = DicBuilder.user(system).comment(description) - .progress(new Progress(20, new DictionaryBuilder.StderrProgress())); + DicBuilder.User builder = DicBuilder.user() + .progress(new Progress(20, new DictionaryBuilder.StderrProgress())).system(system) + .comment(description); for (String lexicon : lexiconPaths) { builder.lexicon(Paths.get(lexicon)); diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/CompiledWordEntry.java 
b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/CompiledWordEntry.java new file mode 100644 index 00000000..dafb62a9 --- /dev/null +++ b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/CompiledWordEntry.java @@ -0,0 +1,64 @@ +/* + * Copyright (c) 2024 Works Applications Co., Ltd. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.worksap.nlp.sudachi.dictionary.build; + +import java.util.Objects; + +import com.worksap.nlp.sudachi.dictionary.Lexicon; +import com.worksap.nlp.sudachi.dictionary.WordInfo; + +/** + * WordInfo wrapper for Lookup2.Entry interface. + * + * Used to resolve wordref that references entry in the system dictionary + * (during user dictinary build). 
+ */ +public class CompiledWordEntry implements Lookup2.Entry { + private final Lexicon lexicon; + private final int wordId; + private WordInfo wiCache = null; + + public CompiledWordEntry(Lexicon lexicon, int wordId) { + this.lexicon = lexicon; + this.wordId = wordId; + } + + private WordInfo wordInfo() { + if (wiCache != null) { + return wiCache; + } + wiCache = lexicon.getWordInfo(wordId); + return wiCache; + } + + @Override + public int pointer() { + return wordId; + } + + @Override + public boolean matches(short posId, String reading) { + WordInfo wi = wordInfo(); + return (posId == wi.getPOSId()) && Objects.equals(reading, lexicon.string(0, wi.getReadingForm())); + } + + @Override + public String headword() { + WordInfo wi = wordInfo(); + return lexicon.string(0, wi.getSurface()); + } +} diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/DicBuilder.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/DicBuilder.java index 89fca853..44ee9543 100644 --- a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/DicBuilder.java +++ b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/DicBuilder.java @@ -211,7 +211,7 @@ public System signature(String signature) { /** * Typestate pattern for system dictionary that does not have connection matrix - * added yet + * added yet. */ public static final class SystemNoMatrix { private final System inner; @@ -286,36 +286,77 @@ public SystemNoMatrix progress(Progress progress) { } /** - * User dictionary. + * Create a new system dictionary compiler + * + * @return new dictionary compiler object + */ + public static SystemNoMatrix system() { + return new SystemNoMatrix(new System()); + } + + /** + * User dictionary with reference system dictionary added. * - * Requires system dictionary to load grammar from to initialize. + * Instanciate via UserNoSystem. 
*/ public static final class User extends Base { - private User(DictionaryAccess system) { + public User system(DictionaryAccess system) { + progress.startBlock("system dict entries", nanoTime(), Progress.Kind.ENTRY); + int nread = lexicon.preloadFrom(system.getLexicon(), progress); + progress.endBlock(nread, nanoTime()); + + progress.startBlock("system dict pos list", nanoTime(), Progress.Kind.ENTRY); pos.preloadFrom(system.getGrammar()); + progress.endBlock(pos.getList().size(), nanoTime()); + description.setSignature(""); + return this; } } /** - * Create a new system dictionary compiler - * - * @return new dictionary compiler object + * Typestate pattern for user dictionary that does not have system dictionary + * added yet. */ - public static SystemNoMatrix system() { - return new SystemNoMatrix(new System()); + public static final class UserNoSystem { + private final User inner; + + private UserNoSystem(DicBuilder.User inner) { + this.inner = inner; + } + + /** + * Preload data from given system dictionary. + * + * @param system + * referenced dictionary + * @return + */ + public DicBuilder.User system(DictionaryAccess system) { + return inner.system(system); + } + + /** + * Set the progress handler to the provided one + * + * @param progress + * handler + * @return current object + */ + public UserNoSystem progress(Progress progress) { + inner.progress(progress); + return this; + } } /** * Create a new user dictionary compiler which will reference the provided user * dictionary. * - * @param system - * referenced dictionary * @return new dictionary compiler object */ - public static User user(DictionaryAccess system) { - return new User(system); + public static UserNoSystem user() { + return new UserNoSystem(new User()); } /** entry point to test Base build with single lexicon (first arg). 
*/ diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/Index.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/Index.java index 09f7f16b..2491ca91 100644 --- a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/Index.java +++ b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/Index.java @@ -161,9 +161,9 @@ private TrieData writeWordTable(BlockOutput out, List n prevId = wid; p.progress(++i, fullsize); } + buffer.flush(); return null; }); - buffer.flush(); return new TrieData(keys, values); } diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/Lookup2.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/Lookup2.java index 9fa1532a..4a6a44ff 100644 --- a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/Lookup2.java +++ b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/Lookup2.java @@ -33,8 +33,16 @@ public interface Entry { String headword(); } - public Lookup2(List entries) { + private final List entries; + // number of reference system dictionary entries. only used to resolve user + // line-no ref. + private final int nbuiltin; + // mapping to entries that have same surfaces + private final Map> bySurface; + + public Lookup2(List entries, int nbuiltin) { this.entries = entries; + this.nbuiltin = nbuiltin; HashMap> result = new HashMap<>(entries.size() * 4 / 3); for (Entry e : entries) { List sublist = result.computeIfAbsent(e.headword(), x -> new ArrayList<>()); @@ -43,10 +51,6 @@ public Lookup2(List entries) { bySurface = result; } - private final List entries; - // mapping to entries that have same surfaces - private final Map> bySurface; - /** * Lookup an entry by the list index. Make sure you know the order of entries in * the list. @@ -54,8 +58,9 @@ public Lookup2(List entries) { * @param index * @return */ - public Entry byIndex(int index) { - return entries.get(index); + public Entry byIndex(int index, boolean isUser) { + int offset = isUser ? 
nbuiltin : 0; + return entries.get(index + offset); } /** diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/POSTable.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/POSTable.java index 0292e8f7..e74a11c6 100644 --- a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/POSTable.java +++ b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/POSTable.java @@ -105,15 +105,15 @@ public int ownedLength() { public Void compile(BlockOutput out) throws IOException { return out.measured("POS Table", (p) -> { BufferedChannel cbuf = new BufferedChannel(out.getChannel()); - cbuf.byteBuffer(2).putShort((short) table.size()); - for (int i = 0; i < table.size(); ++i) { + cbuf.byteBuffer(2).putShort((short) ownedLength()); + for (int i = 0; i < ownedLength(); ++i) { BufWriter writer = cbuf.writer(POS.MAX_BINARY_LENGTH); - POS pos = table.get(i); + POS pos = table.get(builtin + i); for (String s : pos) { // strings are always shorter than POS.MAX writer.putShortString(s); } - p.progress(i, table.size()); + p.progress(i, ownedLength()); } cbuf.flush(); return null; diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/RawLexicon.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/RawLexicon.java index 841bd1a6..022bbe92 100644 --- a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/RawLexicon.java +++ b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/RawLexicon.java @@ -19,6 +19,8 @@ import com.worksap.nlp.sudachi.dictionary.Blocks; import com.worksap.nlp.sudachi.dictionary.CSVParser; import com.worksap.nlp.sudachi.dictionary.DoubleArrayLexicon; +import com.worksap.nlp.sudachi.dictionary.Ints; +import com.worksap.nlp.sudachi.dictionary.Lexicon; import com.worksap.nlp.sudachi.dictionary.WordInfoList; import java.io.IOException; @@ -27,7 +29,10 @@ import java.io.Reader; import java.nio.charset.StandardCharsets; import java.util.ArrayList; +import java.util.Iterator; import java.util.List; +import 
java.util.stream.Collectors; +import java.util.stream.Stream; /** * Dictionary part: Lexicon loaded from csv files. @@ -39,17 +44,49 @@ public class RawLexicon { private static final long MAX_OFFSET = (long) Integer.MAX_VALUE * WordInfoList.OFFSET_ALIGNMENT; // put empty entry at the first private static final int INITIAL_OFFSET = 32; - private final StringStorage strings = new StringStorage(); + + // full list of word entries, in the order in csv. private final List entries = new ArrayList<>(); - private final List notIndexed = new ArrayList<>(); private final Index index = new Index(); + private final List notIndexed = new ArrayList<>(); + private final StringStorage strings = new StringStorage(); private boolean user = false; // offset for next entry private long offset = INITIAL_OFFSET; private boolean runtimeCosts = false; + // entries loaded from the referencing system dictionary (for user + // dict build). + private final List preloadedEntries = new ArrayList<>(); + + /** + * Preload entries from the lexicon (of the system dictionary). They are only + * used to resolve wordref. + * + * @param lexicon + * @return number of entries read. + */ + public int preloadFrom(Lexicon lexicon, Progress progress) { + Ints allIds = new Ints(lexicon.size()); + Iterator ids = lexicon.wordIds(0); + while (ids.hasNext()) { + allIds.appendAll(ids.next()); + } + allIds.sort(); + for (int i = 0; i < allIds.length(); i++) { + preloadedEntries.add(new CompiledWordEntry(lexicon, allIds.get(i))); + progress.progress(i, allIds.length()); + } + return preloadedEntries.size(); + } + + /** Full list of entries in referencing system and target lexicon. */ + private List lookupEntries() { + return Stream.concat(preloadedEntries.stream(), entries.stream()).collect(Collectors.toList()); + } + /** * Read lexicon from InputStream. 
* @@ -130,7 +167,7 @@ public void compile(POSTable pos, BlockLayout layout) throws IOException { private Void writeEntries(POSTable pos, BlockOutput blockOutput) throws IOException { return blockOutput.measured("Word Entries", (p) -> { List list = entries; - Lookup2 lookup = new Lookup2(list); + Lookup2 lookup = new Lookup2(lookupEntries(), preloadedEntries.size()); WordRef.Parser refParser = WordRef.parser(pos, !user, false, false); BufferedChannel buf = new BufferedChannel(blockOutput.getChannel(), WordEntryLayout.MAX_LENGTH * 4); buf.position(INITIAL_OFFSET); diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/WordRef.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/WordRef.java index 04d5289e..82025b62 100644 --- a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/WordRef.java +++ b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/WordRef.java @@ -36,9 +36,11 @@ public abstract class WordRef { */ public static final class LineNo extends WordRef { private final int line; + private final boolean isUser; - public LineNo(int line) { + public LineNo(int line, boolean isUser) { this.line = line; + this.isUser = isUser; } public int getLine() { @@ -47,12 +49,12 @@ public int getLine() { @Override public int resolve(Lookup2 resolver) { - return resolver.byIndex(line).pointer(); + return resolver.byIndex(line, isUser).pointer(); } @Override public String toString() { - return String.format("WordRef/Line: %d", line); + return String.format("WordRef/Line: %s%d", isUser ? "U" : "S", line); } } @@ -154,9 +156,10 @@ public WordRef parse(String text) { } if (allowNumeric && NUMERIC_RE.matcher(text).matches()) { - int offset = text.charAt(0) == 'U' ? 1 : 0; + boolean isUser = text.charAt(0) == 'U'; + int offset = isUser ? 
1 : 0; int lineNum = Integer.parseInt(text.substring(offset)); - return new LineNo(lineNum); + return new LineNo(lineNum, isUser); } if (StringUtil.count(text, ',') == 7) { diff --git a/src/test/java/com/worksap/nlp/sudachi/TestDictionary.kt b/src/test/java/com/worksap/nlp/sudachi/TestDictionary.kt index 4ff41355..1465aa20 100644 --- a/src/test/java/com/worksap/nlp/sudachi/TestDictionary.kt +++ b/src/test/java/com/worksap/nlp/sudachi/TestDictionary.kt @@ -34,7 +34,7 @@ object TestDictionary { val userDict1Data: MemChannel by lazy { val chan = MemChannel() - DicBuilder.user(systemDict).lexicon(res("/dict/user.csv")).build(chan) + DicBuilder.user().system(systemDict).lexicon(res("/dict/user.csv")).build(chan) chan } @@ -46,7 +46,7 @@ object TestDictionary { val userDict2: BinaryDictionary by lazy { val chan = MemChannel() - DicBuilder.user(systemDict).lexicon(res("/dict/user2.csv")).build(chan) + DicBuilder.user().system(systemDict).lexicon(res("/dict/user2.csv")).build(chan) BinaryDictionary.loadUser(chan.buffer()) } diff --git a/src/test/java/com/worksap/nlp/sudachi/dictionary/build/UserDicTest.kt b/src/test/java/com/worksap/nlp/sudachi/dictionary/build/UserDicTest.kt index 8cb557c2..a9c53e9d 100644 --- a/src/test/java/com/worksap/nlp/sudachi/dictionary/build/UserDicTest.kt +++ b/src/test/java/com/worksap/nlp/sudachi/dictionary/build/UserDicTest.kt @@ -45,7 +45,7 @@ class TestDic { } fun user(data: String): TestDic { - val bldr = DicBuilder.user(systemDic).lexicon(data) + val bldr = DicBuilder.user().system(systemDic).lexicon(data) val ch = MemChannel() bldr.build(ch) this.userDics.add(BinaryDictionary(ch.buffer())) From edad56f7adf716bd86736e77cfa99d4a5c177974 Mon Sep 17 00:00:00 2001 From: mh-northlander Date: Tue, 23 Jul 2024 14:53:56 +0900 Subject: [PATCH 28/94] allow missing connection matrix (user dict) --- .../com/worksap/nlp/sudachi/dictionary/GrammarImpl.java | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git 
a/src/main/java/com/worksap/nlp/sudachi/dictionary/GrammarImpl.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/GrammarImpl.java index dc44cd06..58fb04f5 100644 --- a/src/main/java/com/worksap/nlp/sudachi/dictionary/GrammarImpl.java +++ b/src/main/java/com/worksap/nlp/sudachi/dictionary/GrammarImpl.java @@ -78,7 +78,11 @@ public GrammarImpl(List posList, Connection matrix) { } public static GrammarImpl load(ByteBuffer binaryDic, Description header) { - Connection matrix = Connection.fromByteBufferV1(header.slice(binaryDic, Blocks.CONNECTION_MATRIX)); + ByteBuffer connmatBytes = header.sliceOrNull(binaryDic, Blocks.CONNECTION_MATRIX); + Connection matrix = null; + if (connmatBytes != null) { + matrix = Connection.fromByteBufferV1(connmatBytes); + } List posList = loadPosList(header.slice(binaryDic, Blocks.POS_TABLE)); return new GrammarImpl(posList, matrix); } From b24e37be99b8b2674233207107510e56f907934b Mon Sep 17 00:00:00 2001 From: mh-northlander Date: Tue, 23 Jul 2024 15:02:24 +0900 Subject: [PATCH 29/94] split align position method --- .../sudachi/dictionary/build/BlockLayout.java | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/BlockLayout.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/BlockLayout.java index eb790825..a91893bf 100644 --- a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/BlockLayout.java +++ b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/BlockLayout.java @@ -40,6 +40,19 @@ public BlockLayout(SeekableByteChannel channel, Progress progress) throws IOExce channel.position(BLOCK_SIZE); // keep first block for the description } + /** + * Align the current position of output channel. 
+ * + * @return new position of channel + */ + private long alignPosition() throws IOException { + SeekableByteChannel chan = channel; + long end = chan.position(); + long newPosition = Align.align(end, BLOCK_SIZE); + chan.position(newPosition); + return newPosition; + } + /** Function that works with BlockOutput */ public interface BlockHandler { T apply(BlockOutput output) throws IOException; @@ -59,11 +72,9 @@ public interface BlockHandler { */ public T block(String name, BlockHandler handler) throws IOException { SeekableByteChannel chan = channel; - long start = chan.position(); + long start = alignPosition(); T result = handler.apply(new BlockOutput(chan, progress)); long end = chan.position(); - long newPosition = Align.align(end, BLOCK_SIZE); - chan.position(newPosition); info.add(new BlockInfo(name, start, end)); return result; } From c63784d8512f104e6db5be8970fd35e334c74c40 Mon Sep 17 00:00:00 2001 From: mh-northlander Date: Tue, 23 Jul 2024 16:57:11 +0900 Subject: [PATCH 30/94] print dictionary description --- .../nlp/sudachi/dictionary/Description.java | 12 ++++++---- .../dictionary/DictionaryHeaderPrinter.java | 24 ++++++++++++++++++- .../sudachi/dictionary/DictionaryPrinter.java | 14 ----------- .../dictionary/DoubleArrayLexicon.java | 2 +- 4 files changed, 32 insertions(+), 20 deletions(-) diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/Description.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/Description.java index 5c3d76a1..6c478ff5 100644 --- a/src/main/java/com/worksap/nlp/sudachi/dictionary/Description.java +++ b/src/main/java/com/worksap/nlp/sudachi/dictionary/Description.java @@ -103,10 +103,6 @@ public boolean isUserDictionary() { return !reference.isEmpty(); } - public long getNumTotalEntries() { - return numTotalEntries; - } - public static class Block { private final String name; private final long start; @@ -283,6 +279,14 @@ public boolean isRuntimeCosts() { return (flags & 0x1L) != 0; } + public int 
getNumTotalEntries() { + return numTotalEntries; + } + + public int getNumIndexedEntries() { + return this.numIndexedEntries; + } + public void setNumberOfEntries(int indexed, int total) { this.numIndexedEntries = indexed; this.numTotalEntries = total; diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/DictionaryHeaderPrinter.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/DictionaryHeaderPrinter.java index 306491a1..25c95a8a 100644 --- a/src/main/java/com/worksap/nlp/sudachi/dictionary/DictionaryHeaderPrinter.java +++ b/src/main/java/com/worksap/nlp/sudachi/dictionary/DictionaryHeaderPrinter.java @@ -33,6 +33,28 @@ public class DictionaryHeaderPrinter { private DictionaryHeaderPrinter() { } + /** print information in the dictionary Description part */ + static void printDescription(String filename, PrintStream output) throws IOException { + ByteBuffer bytes; + try (FileInputStream input = new FileInputStream(filename); FileChannel inputFile = input.getChannel()) { + bytes = inputFile.map(FileChannel.MapMode.READ_ONLY, 0, inputFile.size()); + bytes.order(ByteOrder.LITTLE_ENDIAN); + } + Description desc = Description.load(bytes); + + output.printf("Creation time: %s%n", desc.getCreationTime()); + output.printf("Comment: %s%n", desc.getComment()); + output.printf("Signature: %s%n", desc.getSignature()); + output.printf("Reference: %s%n", desc.getReference()); + output.printf("Entries total: %d%n", desc.getNumTotalEntries()); + output.printf("Entries indexed: %d%n", desc.getNumIndexedEntries()); + for (Description.Block b : desc.getBlocks()) { + long start = b.getStart(); + output.printf("Block %s: %d - %d%n", b.getName(), start, start + b.getSize()); + } + output.printf("Flag isRuntimeCosts: %s%n", desc.isRuntimeCosts()); + } + static void printHeader(String filename, PrintStream output) throws IOException { ByteBuffer bytes; try (FileInputStream input = new FileInputStream(filename); FileChannel inputFile = input.getChannel()) { @@ -69,7 +91,7 @@ 
static void printHeader(String filename, PrintStream output) throws IOException */ public static void main(String[] args) throws IOException { for (String filename : args) { - printHeader(filename, System.out); + printDescription(filename, System.out); } } } diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/DictionaryPrinter.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/DictionaryPrinter.java index c58008f9..58c4347f 100644 --- a/src/main/java/com/worksap/nlp/sudachi/dictionary/DictionaryPrinter.java +++ b/src/main/java/com/worksap/nlp/sudachi/dictionary/DictionaryPrinter.java @@ -64,19 +64,6 @@ private DictionaryPrinter(PrintStream output, BinaryDictionary dic, BinaryDictio wordIds = allIds; } - /** print information in the dictionary Description part */ - void printDescription(BinaryDictionary dic) { - PrintStream out = System.err; - Description desc = dic.getDictionaryHeader(); - out.printf("creation time: %s%n", desc.getCreationTime()); - out.printf("comment: %s%n", desc.getComment()); - out.printf("reference: %s%n", desc.getReference()); - for (Description.Block b : desc.getBlocks()) { - long start = b.getStart(); - out.printf("Block %s: %d - %d%n", b.getName(), start, start + b.getSize()); - } - } - void printHeader() { // @formatter:off printColumnHeaders(Column.Surface, Column.LeftId, Column.RightId, Column.Cost, Column.Pos1, Column.Pos2, @@ -211,7 +198,6 @@ private void printEntries() { static void printDictionary(String filename, BinaryDictionary systemDict, PrintStream output) throws IOException { try (BinaryDictionary dictionary = new BinaryDictionary(filename)) { DictionaryPrinter dp = new DictionaryPrinter(output, dictionary, systemDict); - dp.printDescription(dictionary); dp.printHeader(); dp.printEntries(); } diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/DoubleArrayLexicon.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/DoubleArrayLexicon.java index 4f9731cf..58070c6b 100644 --- 
a/src/main/java/com/worksap/nlp/sudachi/dictionary/DoubleArrayLexicon.java +++ b/src/main/java/com/worksap/nlp/sudachi/dictionary/DoubleArrayLexicon.java @@ -145,7 +145,7 @@ public WordInfo getWordInfo(int wordId) { @Override public int size() { - return (int) description.getNumTotalEntries(); + return description.getNumTotalEntries(); } public Iterator wordIds(int dic) { From 89749a4fe8a5e3560668d82800d846912b1f8096 Mon Sep 17 00:00:00 2001 From: mh-northlander Date: Tue, 23 Jul 2024 17:31:31 +0900 Subject: [PATCH 31/94] fill user.desc.ref using system.desc.signature --- .../worksap/nlp/sudachi/dictionary/build/DicBuilder.java | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/DicBuilder.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/DicBuilder.java index 44ee9543..abfc0175 100644 --- a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/DicBuilder.java +++ b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/DicBuilder.java @@ -16,9 +16,9 @@ package com.worksap.nlp.sudachi.dictionary.build; +import com.worksap.nlp.sudachi.dictionary.BinaryDictionary; import com.worksap.nlp.sudachi.dictionary.Blocks; import com.worksap.nlp.sudachi.dictionary.Description; -import com.worksap.nlp.sudachi.dictionary.DictionaryAccess; import java.io.IOException; import java.io.InputStream; @@ -300,7 +300,7 @@ public static SystemNoMatrix system() { * Instanciate via UserNoSystem. 
*/ public static final class User extends Base { - public User system(DictionaryAccess system) { + public User system(BinaryDictionary system) { progress.startBlock("system dict entries", nanoTime(), Progress.Kind.ENTRY); int nread = lexicon.preloadFrom(system.getLexicon(), progress); progress.endBlock(nread, nanoTime()); @@ -310,6 +310,7 @@ public User system(DictionaryAccess system) { progress.endBlock(pos.getList().size(), nanoTime()); description.setSignature(""); + description.setReference(system.getDictionaryHeader().getSignature()); return this; } } @@ -332,7 +333,7 @@ private UserNoSystem(DicBuilder.User inner) { * referenced dictionary * @return */ - public DicBuilder.User system(DictionaryAccess system) { + public DicBuilder.User system(BinaryDictionary system) { return inner.system(system); } From 620a3e3f43c7e7f8168ac1158701cd7530c88d1c Mon Sep 17 00:00:00 2001 From: mh-northlander Date: Wed, 24 Jul 2024 14:20:29 +0900 Subject: [PATCH 32/94] fix nodeimpl utilizing special/oov wordId --- .../nlp/sudachi/JapaneseTokenizer.java | 5 + .../java/com/worksap/nlp/sudachi/Lattice.java | 5 +- .../com/worksap/nlp/sudachi/LatticeImpl.java | 8 +- .../com/worksap/nlp/sudachi/LatticeNode.java | 8 +- .../worksap/nlp/sudachi/LatticeNodeImpl.java | 147 ++++++++++-------- .../com/worksap/nlp/sudachi/MorphemeImpl.java | 2 +- .../nlp/sudachi/OovProviderPlugin.java | 3 +- .../java/com/worksap/nlp/sudachi/WordId.java | 62 ++++++-- .../com/worksap/nlp/sudachi/WordMask.java | 3 +- 9 files changed, 148 insertions(+), 95 deletions(-) diff --git a/src/main/java/com/worksap/nlp/sudachi/JapaneseTokenizer.java b/src/main/java/com/worksap/nlp/sudachi/JapaneseTokenizer.java index d9909314..fcb21822 100644 --- a/src/main/java/com/worksap/nlp/sudachi/JapaneseTokenizer.java +++ b/src/main/java/com/worksap/nlp/sudachi/JapaneseTokenizer.java @@ -250,6 +250,11 @@ LatticeImpl buildLattice(UTF8InputText input) { return lattice; } + /** + * Create OOV nodes using plugin and add them to the 
lattice and unkNodes. + * + * @return wordMask updated based on created OOV nodes. + */ private long provideOovs(OovProviderPlugin plugin, UTF8InputText input, ArrayList unkNodes, int boundary, long wordMask) { int initialSize = unkNodes.size(); diff --git a/src/main/java/com/worksap/nlp/sudachi/Lattice.java b/src/main/java/com/worksap/nlp/sudachi/Lattice.java index 479c3c65..b918cf3c 100644 --- a/src/main/java/com/worksap/nlp/sudachi/Lattice.java +++ b/src/main/java/com/worksap/nlp/sudachi/Lattice.java @@ -17,7 +17,6 @@ package com.worksap.nlp.sudachi; import java.util.List; -import java.util.Optional; /** * A graph structure used in the morphological analysis. @@ -113,6 +112,10 @@ public interface Lattice { * Allocate a new node. * * @return a new node has no information + * + * @deprecated use {@code LatticeNodeImpl.makeOOV} or + * {@code LatticeNodeImpl.OOVFactory} instead. */ + @Deprecated public LatticeNode createNode(); } diff --git a/src/main/java/com/worksap/nlp/sudachi/LatticeImpl.java b/src/main/java/com/worksap/nlp/sudachi/LatticeImpl.java index 13d5c786..3980e7f2 100644 --- a/src/main/java/com/worksap/nlp/sudachi/LatticeImpl.java +++ b/src/main/java/com/worksap/nlp/sudachi/LatticeImpl.java @@ -45,7 +45,7 @@ class LatticeImpl implements Lattice { eosParams = grammar.getEOSParameter(); endLists = new ArrayList<>(); - LatticeNodeImpl bosNode = new LatticeNodeImpl(); + LatticeNodeImpl bosNode = LatticeNodeImpl.makeSpecial(WordId.ID_BOS); bosNode.bestPreviousNode = bosNode; short[] bosParams = grammar.getBOSParameter(); bosNode.setParameter(bosParams[0], bosParams[1], bosParams[2]); @@ -62,7 +62,7 @@ void resize(int size) { } this.size = size; - eosNode = new LatticeNodeImpl(); + eosNode = LatticeNodeImpl.makeSpecial(WordId.ID_EOS); eosNode.setParameter(eosParams[0], eosParams[1], eosParams[2]); eosNode.begin = eosNode.end = size; } @@ -187,11 +187,11 @@ List getBestPath() { } String getSurface(LatticeNodeImpl node) { - return (node.isDefined) ? 
node.getBaseSurface() : "(null)"; + return node.isSpecial() ? "(null)" : node.getBaseSurface(); } String getPos(LatticeNodeImpl node) { - if (!node.isDefined) { + if (node.isSpecial()) { return "BOS/EOS"; } else { WordInfo wi = node.getWordInfo(); diff --git a/src/main/java/com/worksap/nlp/sudachi/LatticeNode.java b/src/main/java/com/worksap/nlp/sudachi/LatticeNode.java index 69de4bc3..85f0e5a7 100644 --- a/src/main/java/com/worksap/nlp/sudachi/LatticeNode.java +++ b/src/main/java/com/worksap/nlp/sudachi/LatticeNode.java @@ -26,8 +26,8 @@ * and the information of morpheme as {@link WordInfo} * *

- * Allocation of a node in the plugins must be done through - * {@link Lattice#createNode}. + * Node should be created via {@link LatticeNodeImpl#makeOov} or + * {@link LatticeNodeImpl.OOVFactory} in the plugins. * * @see Lattice * @see WordInfo @@ -83,7 +83,11 @@ public interface LatticeNode { /** * Makes the node out of vocabulary. + * + * @deprecated OOV node should be created via {@link LatticeNodeImpl#makeOov} or + * {@link LatticeNodeImpl.OOVFactory}. */ + @Deprecated public void setOOV(); /** diff --git a/src/main/java/com/worksap/nlp/sudachi/LatticeNodeImpl.java b/src/main/java/com/worksap/nlp/sudachi/LatticeNodeImpl.java index e4217d07..fcc5dea9 100644 --- a/src/main/java/com/worksap/nlp/sudachi/LatticeNodeImpl.java +++ b/src/main/java/com/worksap/nlp/sudachi/LatticeNodeImpl.java @@ -32,19 +32,18 @@ public class LatticeNodeImpl implements LatticeNode { short cost; int wordId; + // word info that corresponds to wordId or that manually set (OOV). + WordInfo wordInfo; + // for lattice construction int totalCost; LatticeNodeImpl bestPreviousNode; - boolean isDefined; - boolean isOOV; - WordInfo extraWordInfo; - - // this is either Lexicon or StringsCache object + // either Lexicon or StringsCache object Object lexicon; - private static final short ZERO = (short) 0; - static final WordInfo UNDEFINED_WORDINFO = new WordInfo(ZERO, ZERO); + // Empty wordInfo for special words. + static final WordInfo UNDEFINED_WORDINFO = new WordInfo((short) 0, (short) -1); LatticeNodeImpl(Lexicon lexicon, long params, int wordId) { this.lexicon = lexicon; @@ -52,11 +51,30 @@ public class LatticeNodeImpl implements LatticeNode { this.rightId = WordParameters.rightId(params); this.cost = WordParameters.cost(params); this.wordId = wordId; - this.isDefined = true; } + /** Create empty node. Caller must fill fields. */ LatticeNodeImpl() { - isDefined = false; + } + + /** Create special node with given wordid.
*/ + static LatticeNodeImpl makeSpecial(int specialWordId) { + assert WordId.isSpecial(specialWordId); + LatticeNodeImpl node = new LatticeNodeImpl(); + node.wordId = specialWordId; + return node; + } + + /** Create OOV node. */ + public static LatticeNodeImpl makeOov(int begin, int end, short posId, String surface, String normalizedForm, + String dictionaryForm, String readingForm) { + LatticeNodeImpl node = new LatticeNodeImpl(); + node.wordId = WordId.makeOov(posId); + node.wordInfo = new WordInfo((short) (end - begin), posId); + node.lexicon = new StringsCache(surface, readingForm, normalizedForm, dictionaryForm); + node.begin = begin; + node.end = end; + return node; } @Override @@ -66,6 +84,12 @@ public void setParameter(short leftId, short rightId, short cost) { this.cost = cost; } + public void setParameter(long params) { + this.leftId = WordParameters.leftId(params); + this.rightId = WordParameters.rightId(params); + this.cost = WordParameters.cost(params); + } + private Lexicon lexicon() { if (lexicon instanceof Lexicon) { return (Lexicon) lexicon; @@ -99,26 +123,34 @@ public boolean isOOV() { @Override public void setOOV() { - isOOV = true; + this.wordId = WordId.ID_OOV_NOPOS; + } + + /** @return if this node is a special node. */ + public boolean isSpecial() { + return WordId.isSpecial(wordId); + } + + /** @return if this node comes from a dictionary. 
*/ + public boolean isDefined() { + return !isOOV() && !isSpecial(); } @Override public WordInfo getWordInfo() { - if (!isDefined) { + if (isSpecial()) { return UNDEFINED_WORDINFO; } - if (extraWordInfo != null) { - return extraWordInfo; + if (wordInfo != null) { + return wordInfo; } - WordInfo info = lexicon().getWordInfo(wordId); - extraWordInfo = info; - return info; + wordInfo = lexicon().getWordInfo(wordId); + return wordInfo; } @Override public void setWordInfo(WordInfo wordInfo) { - extraWordInfo = wordInfo; - isDefined = true; + this.wordInfo = wordInfo; } @Override @@ -133,10 +165,10 @@ public int getWordId() { @Override public int getDictionaryId() { - if (!isDefined || extraWordInfo != null) { - return -1; + if (isDefined()) { + return WordId.dic(wordId); } - return WordId.dic(wordId); + return -1; } public boolean isConnectedToBOS() { @@ -224,13 +256,21 @@ private static final class StringsCache { private final Lexicon lexicon; private String surface; private String reading; - private String dictionaryForm; private String normalizedForm; + private String dictionaryForm; public StringsCache(Lexicon lexicon) { this.lexicon = lexicon; } + public StringsCache(String surface, String readingForm, String normalizedForm, String dictionaryForm) { + this.lexicon = null; + this.surface = surface; + this.reading = readingForm; + this.normalizedForm = normalizedForm; + this.dictionaryForm = dictionaryForm; + } + public String getSurface(LatticeNodeImpl node) { // benign data race pattern // https://shipilev.net/blog/2016/close-encounters-of-jmm-kind/#wishful-benign-is-resilient @@ -257,28 +297,28 @@ public String getReading(LatticeNodeImpl node) { return s; } - public String getDictionaryForm(LatticeNodeImpl node) { - String s = dictionaryForm; + public String getNormalizedForm(LatticeNodeImpl node) { + String s = normalizedForm; if (s == null) { WordInfo wi = node.getWordInfo(); - int dicEntryPtr = wi.getDictionaryForm(); + int dicEntryPtr = wi.getNormalizedForm(); 
int dic = WordId.blendDic(dicEntryPtr, WordId.dic(node.wordId)); int surface = lexicon.wordInfos(dic).surfacePtr(dicEntryPtr); s = lexicon.string(dic, surface); - dictionaryForm = s; + normalizedForm = s; } return s; } - public String getNormalizedForm(LatticeNodeImpl node) { - String s = normalizedForm; + public String getDictionaryForm(LatticeNodeImpl node) { + String s = dictionaryForm; if (s == null) { WordInfo wi = node.getWordInfo(); - int dicEntryPtr = wi.getNormalizedForm(); + int dicEntryPtr = wi.getDictionaryForm(); int dic = WordId.blendDic(dicEntryPtr, WordId.dic(node.wordId)); int surface = lexicon.wordInfos(dic).surfacePtr(dicEntryPtr); s = lexicon.string(dic, surface); - normalizedForm = s; + dictionaryForm = s; } return s; } @@ -288,57 +328,27 @@ public static OOVFactory oovFactory(short leftId, short rightId, short cost, sho return new OOVFactory(leftId, rightId, cost, posId); } - public static LatticeNodeImpl makeOov(int begin, int end, short posId, String surface, String normalizedForm, - String dictionaryForm, String readingForm) { - StringsCache c = new StringsCache(null); - c.surface = surface; - c.normalizedForm = normalizedForm; - c.reading = readingForm; - c.dictionaryForm = dictionaryForm; - WordInfo wi = new WordInfo(Short.MIN_VALUE, posId); - LatticeNodeImpl node = new LatticeNodeImpl(); - node.extraWordInfo = wi; - node.lexicon = c; - node.begin = begin; - node.end = end; - return node; - } - public static final class OOVFactory { private final short leftId; private final short rightId; private final short cost; private final short posId; - private final WordInfo wordInfo; private OOVFactory(short leftId, short rightId, short cost, short posId) { this.rightId = rightId; this.cost = cost; this.leftId = leftId; this.posId = posId; - this.wordInfo = new WordInfo(ZERO, posId); } - public LatticeNodeImpl make(int start, int end, InputText input) { - String s = input.getSubstring(start, end); - return make(start, end, s); + public 
LatticeNodeImpl make(int begin, int end, InputText input) { + String s = input.getSubstring(begin, end); + return make(begin, end, s); } - public LatticeNodeImpl make(int start, int end, String text) { - LatticeNodeImpl i = new LatticeNodeImpl(); - i.begin = start; - i.end = end; - i.leftId = leftId; - i.rightId = rightId; - i.cost = cost; - i.wordId = WordId.oovWid(posId); - i.extraWordInfo = wordInfo; - StringsCache sc = new StringsCache(null); - sc.surface = text; - sc.reading = text; - sc.dictionaryForm = text; - sc.normalizedForm = text; - i.lexicon = sc; + public LatticeNodeImpl make(int begin, int end, String text) { + LatticeNodeImpl i = makeOov(begin, end, posId, text, text, text, text); + i.setParameter(leftId, rightId, cost); return i; } @@ -349,13 +359,12 @@ public boolean equals(Object o) { if (o == null || getClass() != o.getClass()) return false; OOVFactory that = (OOVFactory) o; - return leftId == that.leftId && rightId == that.rightId && cost == that.cost && posId == that.posId - && Objects.equals(wordInfo, that.wordInfo); + return leftId == that.leftId && rightId == that.rightId && cost == that.cost && posId == that.posId; } @Override public int hashCode() { - return Objects.hash(leftId, rightId, cost, posId, wordInfo); + return Objects.hash(leftId, rightId, cost, posId); } } } diff --git a/src/main/java/com/worksap/nlp/sudachi/MorphemeImpl.java b/src/main/java/com/worksap/nlp/sudachi/MorphemeImpl.java index 1cbf1849..93f49ee1 100644 --- a/src/main/java/com/worksap/nlp/sudachi/MorphemeImpl.java +++ b/src/main/java/com/worksap/nlp/sudachi/MorphemeImpl.java @@ -24,7 +24,7 @@ class MorphemeImpl implements Morpheme { private final MorphemeList list; private final int index; - private LatticeNodeImpl node; + private LatticeNodeImpl node; // cache /* internal */ MorphemeImpl(MorphemeList list, int index) { this.list = list; diff --git a/src/main/java/com/worksap/nlp/sudachi/OovProviderPlugin.java 
b/src/main/java/com/worksap/nlp/sudachi/OovProviderPlugin.java index 35320078..bd091493 100644 --- a/src/main/java/com/worksap/nlp/sudachi/OovProviderPlugin.java +++ b/src/main/java/com/worksap/nlp/sudachi/OovProviderPlugin.java @@ -73,12 +73,13 @@ public void setUp(Grammar grammar) throws IOException { * bit is set, it means that a node of length of 64 or greater * was created. * @param result - * OOV provider plugins need to add nodes here + * OOV provider plugins need to add nodes here. * @return the number of created nodes. Values outside that range will be * ignored. */ public abstract int provideOOV(InputText inputText, int offset, long otherWords, List result); + /** Runs provideOOV and set proper begin/end for each nodes. */ int getOOV(UTF8InputText inputText, int offset, long otherWords, List result) { int oldSize = result.size(); int numCreated = provideOOV(inputText, offset, otherWords, result); diff --git a/src/main/java/com/worksap/nlp/sudachi/WordId.java b/src/main/java/com/worksap/nlp/sudachi/WordId.java index 4dc40164..13636ab6 100644 --- a/src/main/java/com/worksap/nlp/sudachi/WordId.java +++ b/src/main/java/com/worksap/nlp/sudachi/WordId.java @@ -16,23 +16,44 @@ package com.worksap.nlp.sudachi; +/** + * Utility to handle combined word id. + * + * Combined word id (32 bits) consists of two parts, dictionary id (top 4 bit) + * and dictionary-internal word id (rest bits). + */ public class WordId { private WordId() { } - public static final int ID_BOS = 0xffff_fff0; - public static final int ID_EOS = 0xffff_fff1; - /** - * Internal word ids can't be larger than this number + * Internal word ids can't be larger than this number. */ - public static final int MAX_WORD_ID = 0x0fffffff; + public static final int MAX_WORD_ID = 0x0fff_ffff; /** - * Dictionary ids can't be larger than this number + * Dictionary ids can't be larger than this number. 
+ * + * Dictionary id 0x0 is reserved for the system dictionary and 0xf is reserved + * for oov and special words. */ public static final int MAX_DIC_ID = 0xe; + // ids for special tokens. + public static final int ID_BOS = 0xffff_fff0; + public static final int ID_EOS = 0xffff_fff1; + public static final int ID_OOV_NOPOS = 0xf000_ffff; + + /** + * Make combined WordId from dictionary and internal parts, without checking + * bound. + * + * @param dic + * dictionary id. 0 is system, 1 and above are user. + * @param word + * word id inside the dictionary. + * @return combined word id. + */ public static int makeUnchecked(int dic, int word) { int dicPart = dicIdMask(dic); return dicPart | word; @@ -58,8 +79,13 @@ public static int make(int dic, int word) { return makeUnchecked(dic, word); } + /** Make OOV WordId from provided pos id. */ + public static int makeOov(short posId) { + return 0xf000_0000 | posId; + } + /** - * Extract dictionary number from the combined word id + * Extract dictionary id from the combined word id * * @param wordId * combined word id @@ -80,30 +106,34 @@ public static int word(int wordId) { return wordId & MAX_WORD_ID; } - public static int blendDic(int rawWordId, int actualDicId) { - int flag = dic(rawWordId); - return flag * actualDicId; - } - + /** + * Encode dictionary id as a part of combined word id. + */ public static int dicIdMask(int dicId) { return dicId << 28; } + /** + * Override dictionary part of the word id using given dicIdMask. + */ public static int applyMask(int wordId, int dicIdMask) { return (wordId & MAX_WORD_ID) | dicIdMask; } + public static int blendDic(int rawWordId, int actualDicId) { + int flag = dic(rawWordId); + return flag * actualDicId; + } + + /** @return if given word id represents OOV. */ public static boolean isOov(int wordId) { // low 16 bits are OOV POS, top 4 are 1s return (wordId & 0xffff_0000) == 0xf000_0000; } + /** @return if given word id represents special words. 
*/ public static boolean isSpecial(int wordId) { // top 5 bits should be filled return (wordId & 0xf800_0000) == 0xf800_0000; } - - public static int oovWid(short posId) { - return 0xf000_0000 | posId; - } } diff --git a/src/main/java/com/worksap/nlp/sudachi/WordMask.java b/src/main/java/com/worksap/nlp/sudachi/WordMask.java index 523e1ad0..bc835afd 100644 --- a/src/main/java/com/worksap/nlp/sudachi/WordMask.java +++ b/src/main/java/com/worksap/nlp/sudachi/WordMask.java @@ -38,7 +38,8 @@ public static long addNth(long positions, int position) { } /** - * Create a word mask with nth position set + * Create a word mask with nth position set. If position > 64, set the highest + * bit instead. * * @param position * number of set position From 82604b892fe4e164e2bfe46dcdb3d6f0869f32b5 Mon Sep 17 00:00:00 2001 From: mh-northlander Date: Wed, 24 Jul 2024 17:19:02 +0900 Subject: [PATCH 33/94] fix WordParamters --- .../com/worksap/nlp/sudachi/dictionary/WordLookup.java | 2 +- .../worksap/nlp/sudachi/dictionary/WordParameters.java | 8 +++++--- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/WordLookup.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/WordLookup.java index 705f5f8a..16dbd07f 100644 --- a/src/main/java/com/worksap/nlp/sudachi/dictionary/WordLookup.java +++ b/src/main/java/com/worksap/nlp/sudachi/dictionary/WordLookup.java @@ -78,7 +78,7 @@ public int[] outputBuffer(int length) { } /** - * Sets the wordIds, numWords, endOffset to the + * Sets the wordIds, numWords, endOffset to the next value. 
* * @return true if there was an entry in any of binary dictionaries */ diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/WordParameters.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/WordParameters.java index cfcbb6a9..01830674 100644 --- a/src/main/java/com/worksap/nlp/sudachi/dictionary/WordParameters.java +++ b/src/main/java/com/worksap/nlp/sudachi/dictionary/WordParameters.java @@ -19,6 +19,8 @@ import java.nio.ByteBuffer; import java.nio.ByteOrder; +import com.worksap.nlp.sudachi.WordId; + public class WordParameters { private final ByteBuffer data; @@ -27,12 +29,12 @@ private WordParameters(ByteBuffer data) { } public long loadParams(int wordId) { - int addr = WordInfoList.wordId2offset(wordId); + int addr = WordInfoList.wordId2offset(WordId.word(wordId)); return data.getLong(addr); } public void setCost(int wordId, short cost) { - int addr = WordInfoList.wordId2offset(wordId) + 6; + int addr = WordInfoList.wordId2offset(WordId.word(wordId)) + 6; data.putShort(addr, cost); } @@ -48,7 +50,7 @@ public static WordParameters readWrite(ByteBuffer full, Description desc) { int lim = roBuf.limit(); ByteBuffer buf = ByteBuffer.allocate(lim); buf.order(ByteOrder.LITTLE_ENDIAN); - roBuf.put(buf); + buf.put(roBuf); buf.position(0); return new WordParameters(buf); } From a5ee6f5cd7c5d5619f5417ddfc662c4b7d3b4d3c Mon Sep 17 00:00:00 2001 From: mh-northlander Date: Wed, 24 Jul 2024 17:24:28 +0900 Subject: [PATCH 34/94] change wordId in the test --- .../java/com/worksap/nlp/sudachi/JapaneseTokenizerTest.java | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/src/test/java/com/worksap/nlp/sudachi/JapaneseTokenizerTest.java b/src/test/java/com/worksap/nlp/sudachi/JapaneseTokenizerTest.java index a3242735..1ee167c9 100644 --- a/src/test/java/com/worksap/nlp/sudachi/JapaneseTokenizerTest.java +++ b/src/test/java/com/worksap/nlp/sudachi/JapaneseTokenizerTest.java @@ -453,7 +453,7 @@ public void dumpInternalStructures() { 
assertThat(lattice.getJsonObject(i).isNull("begin"), is(true)); assertThat(lattice.getJsonObject(i).getInt("end"), is(0)); assertThat(lattice.getJsonObject(i).getString("headword"), is("(null)")); - assertThat(lattice.getJsonObject(i).getInt("wordId"), is(0)); + assertThat(lattice.getJsonObject(i).getInt("wordId"), is(WordId.ID_BOS)); assertThat(lattice.getJsonObject(i).getString("pos"), is("BOS/EOS")); assertThat(lattice.getJsonObject(i).getInt("rightId"), is(0)); assertThat(lattice.getJsonObject(i).getInt("leftId"), is(0)); @@ -466,7 +466,6 @@ public void dumpInternalStructures() { assertThat(lattice.getJsonObject(i).getInt("end"), is(3)); assertThat(lattice.getJsonObject(i).getString("headword"), is("東")); assertThat(lattice.getJsonObject(i).getString("pos"), is("名詞,普通名詞,一般,*,*,*")); - assertThat(lattice.getJsonObject(i).getInt("wordId"), is(4)); assertThat(lattice.getJsonObject(i).getInt("rightId"), is(7)); assertThat(lattice.getJsonObject(i).getInt("leftId"), is(7)); assertThat(lattice.getJsonObject(i).getInt("cost"), is(4675)); @@ -505,7 +504,7 @@ public void dumpInternalStructures() { assertThat(lattice.getJsonObject(i).getInt("begin"), is(9)); assertThat(lattice.getJsonObject(i).isNull("end"), is(true)); assertThat(lattice.getJsonObject(i).getString("headword"), is("(null)")); - assertThat(lattice.getJsonObject(i).getInt("wordId"), is(0)); + assertThat(lattice.getJsonObject(i).getInt("wordId"), is(WordId.ID_EOS)); assertThat(lattice.getJsonObject(i).getString("pos"), is("BOS/EOS")); assertThat(lattice.getJsonObject(i).getInt("rightId"), is(0)); assertThat(lattice.getJsonObject(i).getInt("leftId"), is(0)); From cf26ef0f4c6f5d8f9c788544a56d77237de46e41 Mon Sep 17 00:00:00 2001 From: mh-northlander Date: Wed, 24 Jul 2024 17:45:28 +0900 Subject: [PATCH 35/94] set oovFactory for test joinKatakanaOov plugin --- .../java/com/worksap/nlp/sudachi/JoinKatakanaOovPlugin.java | 4 ++++ .../com/worksap/nlp/sudachi/JoinKatakanaOovPluginTest.java | 1 + 2 files changed, 5 
insertions(+) diff --git a/src/main/java/com/worksap/nlp/sudachi/JoinKatakanaOovPlugin.java b/src/main/java/com/worksap/nlp/sudachi/JoinKatakanaOovPlugin.java index d00fc3cd..d0451973 100644 --- a/src/main/java/com/worksap/nlp/sudachi/JoinKatakanaOovPlugin.java +++ b/src/main/java/com/worksap/nlp/sudachi/JoinKatakanaOovPlugin.java @@ -64,6 +64,10 @@ public void setUp(Grammar grammar) { throw new IllegalArgumentException("minLength is negative"); } + setOovFactory(oovPosId); + } + + public void setOovFactory(short oovPosId) { factory = LatticeNodeImpl.oovFactory((short) -1, (short) -1, (short) -1, oovPosId); } diff --git a/src/test/java/com/worksap/nlp/sudachi/JoinKatakanaOovPluginTest.java b/src/test/java/com/worksap/nlp/sudachi/JoinKatakanaOovPluginTest.java index 40828bf3..baa34816 100644 --- a/src/test/java/com/worksap/nlp/sudachi/JoinKatakanaOovPluginTest.java +++ b/src/test/java/com/worksap/nlp/sudachi/JoinKatakanaOovPluginTest.java @@ -34,6 +34,7 @@ public void setUp() throws IOException { Dictionary dict = TestDictionary.INSTANCE.user1(); tokenizer = (JapaneseTokenizer) dict.create(); plugin = new JoinKatakanaOovPlugin(); + plugin.setOovFactory((short) -1); } @Test From 53a911cc4b1f46b25235168a91bf22ea5e3ca5e6 Mon Sep 17 00:00:00 2001 From: mh-northlander Date: Wed, 24 Jul 2024 18:06:43 +0900 Subject: [PATCH 36/94] fix join-numeric plugin --- .../com/worksap/nlp/sudachi/JoinNumericPlugin.java | 2 +- .../com/worksap/nlp/sudachi/PathRewritePlugin.java | 13 ++++++------- .../worksap/nlp/sudachi/JoinNumericPluginTest.java | 2 +- 3 files changed, 8 insertions(+), 9 deletions(-) diff --git a/src/main/java/com/worksap/nlp/sudachi/JoinNumericPlugin.java b/src/main/java/com/worksap/nlp/sudachi/JoinNumericPlugin.java index ad74ff43..40a24ac2 100644 --- a/src/main/java/com/worksap/nlp/sudachi/JoinNumericPlugin.java +++ b/src/main/java/com/worksap/nlp/sudachi/JoinNumericPlugin.java @@ -135,7 +135,7 @@ private void concat(List path, int begin, int end, Lattice latt 
return; if (enableNormalize) { String normalizedForm = parser.getNormalized(); - if (end - begin > 1 || !normalizedForm.equals(path.get(begin).getWordInfo().getNormalizedForm())) { + if (end - begin > 1 || !normalizedForm.equals(path.get(begin).getNormalizedForm())) { concatenate(path, begin, end, lattice, normalizedForm); } } else { diff --git a/src/main/java/com/worksap/nlp/sudachi/PathRewritePlugin.java b/src/main/java/com/worksap/nlp/sudachi/PathRewritePlugin.java index 161b2889..799d0a19 100644 --- a/src/main/java/com/worksap/nlp/sudachi/PathRewritePlugin.java +++ b/src/main/java/com/worksap/nlp/sudachi/PathRewritePlugin.java @@ -109,17 +109,16 @@ public LatticeNode concatenate(List path, int begin, int end, L StringBuilder dictionaryForm = new StringBuilder(); StringBuilder readingForm = new StringBuilder(); for (int i = begin; i < end; i++) { - WordInfo info = path.get(i).getWordInfo(); - surface.append(info.getSurface()); + LatticeNodeImpl node = path.get(i); + surface.append(node.getBaseSurface()); if (normalizedForm == null) { - normalizedFormBuilder.append(info.getNormalizedForm()); + normalizedFormBuilder.append(node.getNormalizedForm()); } - dictionaryForm.append(info.getDictionaryForm()); - readingForm.append(info.getReadingForm()); + dictionaryForm.append(node.getDictionaryForm()); + readingForm.append(node.getReading()); } - String s = surface.toString(); - LatticeNodeImpl node = LatticeNodeImpl.makeOov(b, e, posId, s, + LatticeNodeImpl node = LatticeNodeImpl.makeOov(b, e, posId, surface.toString(), (normalizedForm == null) ? 
normalizedFormBuilder.toString() : normalizedForm, dictionaryForm.toString(), readingForm.toString()); replaceNode(path, begin, end, node); diff --git a/src/test/java/com/worksap/nlp/sudachi/JoinNumericPluginTest.java b/src/test/java/com/worksap/nlp/sudachi/JoinNumericPluginTest.java index 4ea9a7a5..41f2b830 100644 --- a/src/test/java/com/worksap/nlp/sudachi/JoinNumericPluginTest.java +++ b/src/test/java/com/worksap/nlp/sudachi/JoinNumericPluginTest.java @@ -68,7 +68,7 @@ public void testNormalize() { plugin.enableNormalize = true; List path = getPath("一二三万二千円"); assertEquals(2, path.size()); - assertEquals("1232000", path.get(0).getBaseSurface()); + assertEquals("1232000", path.get(0).getNormalizedForm()); } @Test From 97187c1da89a8b27d6ce97f5995d1184efde9e2a Mon Sep 17 00:00:00 2001 From: mh-northlander Date: Wed, 24 Jul 2024 18:22:46 +0900 Subject: [PATCH 37/94] add direct access to text fields of node --- .../com/worksap/nlp/sudachi/LatticeImpl.java | 2 +- .../com/worksap/nlp/sudachi/LatticeNode.java | 20 +++++++++++ .../worksap/nlp/sudachi/LatticeNodeImpl.java | 8 +++-- .../nlp/sudachi/PathRewritePlugin.java | 4 +-- .../sudachi/JoinKatakanaOovPluginTest.java | 2 +- .../nlp/sudachi/JoinNumericPluginTest.java | 10 +++--- .../sudachi/MeCabOovProviderPluginTest.java | 34 +++++++++---------- .../java/com/worksap/nlp/sudachi/morphemes.kt | 2 +- 8 files changed, 53 insertions(+), 29 deletions(-) diff --git a/src/main/java/com/worksap/nlp/sudachi/LatticeImpl.java b/src/main/java/com/worksap/nlp/sudachi/LatticeImpl.java index 3980e7f2..60dd6498 100644 --- a/src/main/java/com/worksap/nlp/sudachi/LatticeImpl.java +++ b/src/main/java/com/worksap/nlp/sudachi/LatticeImpl.java @@ -187,7 +187,7 @@ List getBestPath() { } String getSurface(LatticeNodeImpl node) { - return node.isSpecial() ? "(null)" : node.getBaseSurface(); + return node.isSpecial() ? 
"(null)" : node.getSurface(); } String getPos(LatticeNodeImpl node) { diff --git a/src/main/java/com/worksap/nlp/sudachi/LatticeNode.java b/src/main/java/com/worksap/nlp/sudachi/LatticeNode.java index 85f0e5a7..b5cf5611 100644 --- a/src/main/java/com/worksap/nlp/sudachi/LatticeNode.java +++ b/src/main/java/com/worksap/nlp/sudachi/LatticeNode.java @@ -98,6 +98,26 @@ public interface LatticeNode { */ public WordInfo getWordInfo(); + /** + * @return the text of node. + */ + public String getSurface(); + + /** + * @return the reading form of node. + */ + public String getReading(); + + /** + * @return the normalized form of node. + */ + public String getNormalizedForm(); + + /** + * @return the dictionary form of node. + */ + public String getDictionaryForm(); + /** * Sets the morpheme information to the node. * diff --git a/src/main/java/com/worksap/nlp/sudachi/LatticeNodeImpl.java b/src/main/java/com/worksap/nlp/sudachi/LatticeNodeImpl.java index fcc5dea9..35fee262 100644 --- a/src/main/java/com/worksap/nlp/sudachi/LatticeNodeImpl.java +++ b/src/main/java/com/worksap/nlp/sudachi/LatticeNodeImpl.java @@ -175,25 +175,29 @@ public boolean isConnectedToBOS() { return bestPreviousNode != null; } - public String getBaseSurface() { + @Override + public String getSurface() { return strings().getSurface(this); } + @Override public String getReading() { return strings().getReading(this); } + @Override public String getNormalizedForm() { return strings().getNormalizedForm(this); } + @Override public String getDictionaryForm() { return strings().getDictionaryForm(this); } @Override public String toString() { - String surface = getBaseSurface(); + String surface = getSurface(); short pos = getWordInfo().getPOSId(); return String.format("%d %d %s(%d) %d %d %d %d", getBegin(), getEnd(), surface, wordId, pos, leftId, rightId, diff --git a/src/main/java/com/worksap/nlp/sudachi/PathRewritePlugin.java b/src/main/java/com/worksap/nlp/sudachi/PathRewritePlugin.java index 
799d0a19..3aa97a1c 100644 --- a/src/main/java/com/worksap/nlp/sudachi/PathRewritePlugin.java +++ b/src/main/java/com/worksap/nlp/sudachi/PathRewritePlugin.java @@ -110,7 +110,7 @@ public LatticeNode concatenate(List path, int begin, int end, L StringBuilder readingForm = new StringBuilder(); for (int i = begin; i < end; i++) { LatticeNodeImpl node = path.get(i); - surface.append(node.getBaseSurface()); + surface.append(node.getSurface()); if (normalizedForm == null) { normalizedFormBuilder.append(node.getNormalizedForm()); } @@ -164,7 +164,7 @@ public LatticeNode concatenateOov(List path, int begin, int end StringBuilder surface = new StringBuilder(); for (int i = begin; i < end; i++) { - String s = path.get(i).getBaseSurface(); + String s = path.get(i).getSurface(); surface.append(s); } diff --git a/src/test/java/com/worksap/nlp/sudachi/JoinKatakanaOovPluginTest.java b/src/test/java/com/worksap/nlp/sudachi/JoinKatakanaOovPluginTest.java index baa34816..e7705314 100644 --- a/src/test/java/com/worksap/nlp/sudachi/JoinKatakanaOovPluginTest.java +++ b/src/test/java/com/worksap/nlp/sudachi/JoinKatakanaOovPluginTest.java @@ -86,7 +86,7 @@ public void testWithNOOOVBOW() { plugin.minLength = 3; List path = getPath("ァアイアイウ"); assertEquals(2, path.size()); - assertEquals("ァ", path.get(0).getBaseSurface()); + assertEquals("ァ", path.get(0).getSurface()); path = getPath("アイウァアイウ"); assertEquals(1, path.size()); diff --git a/src/test/java/com/worksap/nlp/sudachi/JoinNumericPluginTest.java b/src/test/java/com/worksap/nlp/sudachi/JoinNumericPluginTest.java index 41f2b830..cce4c843 100644 --- a/src/test/java/com/worksap/nlp/sudachi/JoinNumericPluginTest.java +++ b/src/test/java/com/worksap/nlp/sudachi/JoinNumericPluginTest.java @@ -44,20 +44,20 @@ public void setUp() throws IOException { public void testDigit() { List path = getPath("123円20銭"); assertEquals(4, path.size()); - assertEquals("123", path.get(0).getBaseSurface()); - assertEquals("20", path.get(2).getBaseSurface()); + 
assertEquals("123", path.get(0).getSurface()); + assertEquals("20", path.get(2).getSurface()); path = getPath("080-121"); assertEquals(3, path.size()); - assertEquals("080", path.get(0).getBaseSurface()); - assertEquals("121", path.get(2).getBaseSurface()); + assertEquals("080", path.get(0).getSurface()); + assertEquals("121", path.get(2).getSurface()); } @Test public void testKanjiNumeric() { List path = getPath("一二三万二千円"); assertEquals(2, path.size()); - assertEquals("一二三万二千", path.get(0).getBaseSurface()); + assertEquals("一二三万二千", path.get(0).getSurface()); path = getPath("二百百"); assertEquals(3, path.size()); diff --git a/src/test/java/com/worksap/nlp/sudachi/MeCabOovProviderPluginTest.java b/src/test/java/com/worksap/nlp/sudachi/MeCabOovProviderPluginTest.java index b4eafb57..7625f906 100644 --- a/src/test/java/com/worksap/nlp/sudachi/MeCabOovProviderPluginTest.java +++ b/src/test/java/com/worksap/nlp/sudachi/MeCabOovProviderPluginTest.java @@ -115,7 +115,7 @@ public void provideOOV010() { assertThat(nodes.size(), is(1)); LatticeNode n = nodes.get(0); - assertThat(n.getWordInfo().getSurface(), is("あいう")); + assertThat(n.getSurface(), is("あいう")); assertThat(n.getWordInfo().getLength(), is((short) 3)); assertThat(n.getWordInfo().getPOSId(), is((short) 1)); @@ -138,7 +138,7 @@ public void provideOOV110() { assertThat(nodes.size(), is(1)); LatticeNode n = nodes.get(0); - assertThat(n.getWordInfo().getSurface(), is("あいう")); + assertThat(n.getSurface(), is("あいう")); assertThat(n.getWordInfo().getLength(), is((short) 3)); assertThat(n.getWordInfo().getPOSId(), is((short) 1)); @@ -161,12 +161,12 @@ public void provideOOV002() { assertThat(nodes.size(), is(2)); LatticeNode n = nodes.get(0); - assertThat(n.getWordInfo().getSurface(), is("あ")); + assertThat(n.getSurface(), is("あ")); assertThat(n.getWordInfo().getLength(), is((short) 1)); assertThat(n.getWordInfo().getPOSId(), is((short) 1)); n = nodes.get(1); - assertThat(n.getWordInfo().getSurface(), is("あい")); + 
assertThat(n.getSurface(), is("あい")); assertThat(n.getWordInfo().getLength(), is((short) 2)); assertThat(n.getWordInfo().getPOSId(), is((short) 1)); @@ -190,12 +190,12 @@ public void provideOOV102() { assertThat(nodes.size(), is(2)); LatticeNode n = nodes.get(0); - assertThat(n.getWordInfo().getSurface(), is("あ")); + assertThat(n.getSurface(), is("あ")); assertThat(n.getWordInfo().getLength(), is((short) 1)); assertThat(n.getWordInfo().getPOSId(), is((short) 1)); n = nodes.get(1); - assertThat(n.getWordInfo().getSurface(), is("あい")); + assertThat(n.getSurface(), is("あい")); assertThat(n.getWordInfo().getLength(), is((short) 2)); assertThat(n.getWordInfo().getPOSId(), is((short) 1)); @@ -218,17 +218,17 @@ public void provideOOV012() { assertThat(nodes.size(), is(3)); LatticeNode n = nodes.get(0); - assertThat(n.getWordInfo().getSurface(), is("あいう")); + assertThat(n.getSurface(), is("あいう")); assertThat(n.getWordInfo().getLength(), is((short) 3)); assertThat(n.getWordInfo().getPOSId(), is((short) 1)); n = nodes.get(1); - assertThat(n.getWordInfo().getSurface(), is("あ")); + assertThat(n.getSurface(), is("あ")); assertThat(n.getWordInfo().getLength(), is((short) 1)); assertThat(n.getWordInfo().getPOSId(), is((short) 1)); n = nodes.get(2); - assertThat(n.getWordInfo().getSurface(), is("あい")); + assertThat(n.getSurface(), is("あい")); assertThat(n.getWordInfo().getLength(), is((short) 2)); assertThat(n.getWordInfo().getPOSId(), is((short) 1)); @@ -251,17 +251,17 @@ public void provideOOV112() { assertThat(nodes.size(), is(3)); LatticeNode n = nodes.get(0); - assertThat(n.getWordInfo().getSurface(), is("あいう")); + assertThat(n.getSurface(), is("あいう")); assertThat(n.getWordInfo().getLength(), is((short) 3)); assertThat(n.getWordInfo().getPOSId(), is((short) 1)); n = nodes.get(1); - assertThat(n.getWordInfo().getSurface(), is("あ")); + assertThat(n.getSurface(), is("あ")); assertThat(n.getWordInfo().getLength(), is((short) 1)); assertThat(n.getWordInfo().getPOSId(), is((short) 1)); 
n = nodes.get(2); - assertThat(n.getWordInfo().getSurface(), is("あい")); + assertThat(n.getSurface(), is("あい")); assertThat(n.getWordInfo().getLength(), is((short) 2)); assertThat(n.getWordInfo().getPOSId(), is((short) 1)); @@ -284,17 +284,17 @@ public void provideOOV006() { assertThat(nodes.size(), is(3)); LatticeNode n = nodes.get(0); - assertThat(n.getWordInfo().getSurface(), is("あ")); + assertThat(n.getSurface(), is("あ")); assertThat(n.getWordInfo().getLength(), is((short) 1)); assertThat(n.getWordInfo().getPOSId(), is((short) 1)); n = nodes.get(1); - assertThat(n.getWordInfo().getSurface(), is("あい")); + assertThat(n.getSurface(), is("あい")); assertThat(n.getWordInfo().getLength(), is((short) 2)); assertThat(n.getWordInfo().getPOSId(), is((short) 1)); n = nodes.get(2); - assertThat(n.getWordInfo().getSurface(), is("あいう")); + assertThat(n.getSurface(), is("あいう")); assertThat(n.getWordInfo().getLength(), is((short) 3)); assertThat(n.getWordInfo().getPOSId(), is((short) 1)); @@ -317,12 +317,12 @@ public void provideOOVMultiOOV() { assertThat(nodes.size(), is(2)); LatticeNode n = nodes.get(0); - assertThat(n.getWordInfo().getSurface(), is("あいう")); + assertThat(n.getSurface(), is("あいう")); assertThat(n.getWordInfo().getLength(), is((short) 3)); assertThat(n.getWordInfo().getPOSId(), is((short) 1)); n = nodes.get(1); - assertThat(n.getWordInfo().getSurface(), is("あいう")); + assertThat(n.getSurface(), is("あいう")); assertThat(n.getWordInfo().getLength(), is((short) 3)); assertThat(n.getWordInfo().getPOSId(), is((short) 2)); } diff --git a/src/test/java/com/worksap/nlp/sudachi/morphemes.kt b/src/test/java/com/worksap/nlp/sudachi/morphemes.kt index ac05f3c4..5c6cd5c6 100644 --- a/src/test/java/com/worksap/nlp/sudachi/morphemes.kt +++ b/src/test/java/com/worksap/nlp/sudachi/morphemes.kt @@ -25,7 +25,7 @@ fun DictionaryAccess.morpheme(id: Int): Morpheme { val l = MorphemeList( - UTF8InputTextBuilder(node.baseSurface, grammar).build(), + UTF8InputTextBuilder(node.surface, 
grammar).build(), grammar, lexicon, listOf(node), From aef5fbda0f2b319f4fb54825a851d2d9fd60f3ec Mon Sep 17 00:00:00 2001 From: mh-northlander Date: Thu, 25 Jul 2024 16:57:50 +0900 Subject: [PATCH 38/94] fix system/user wordref resolution --- .../worksap/nlp/sudachi/LatticeNodeImpl.java | 18 +++-- .../com/worksap/nlp/sudachi/MorphemeList.java | 2 + .../java/com/worksap/nlp/sudachi/WordId.java | 42 +++++++++- .../nlp/sudachi/dictionary/LexiconSet.java | 3 + .../nlp/sudachi/dictionary/WordInfo.java | 32 +++++--- .../nlp/sudachi/dictionary/build/Lookup2.java | 79 ++++++++++++++----- .../sudachi/dictionary/build/RawLexicon.java | 70 ++++++++++------ .../dictionary/build/RawWordEntry.java | 32 -------- .../nlp/sudachi/dictionary/build/WordRef.java | 29 +++++-- 9 files changed, 206 insertions(+), 101 deletions(-) diff --git a/src/main/java/com/worksap/nlp/sudachi/LatticeNodeImpl.java b/src/main/java/com/worksap/nlp/sudachi/LatticeNodeImpl.java index 35fee262..5b08960f 100644 --- a/src/main/java/com/worksap/nlp/sudachi/LatticeNodeImpl.java +++ b/src/main/java/com/worksap/nlp/sudachi/LatticeNodeImpl.java @@ -222,6 +222,8 @@ private StringsCache strings() { appendSplitsTo(result, getWordInfo().getAunitSplit()); } else if (mode == Tokenizer.SplitMode.B) { appendSplitsTo(result, getWordInfo().getBunitSplit()); + } else if (mode == Tokenizer.SplitMode.C) { + appendSplitsTo(result, getWordInfo().getCunitSplit()); } else { result.add(this); } @@ -305,10 +307,10 @@ public String getNormalizedForm(LatticeNodeImpl node) { String s = normalizedForm; if (s == null) { WordInfo wi = node.getWordInfo(); - int dicEntryPtr = wi.getNormalizedForm(); - int dic = WordId.blendDic(dicEntryPtr, WordId.dic(node.wordId)); - int surface = lexicon.wordInfos(dic).surfacePtr(dicEntryPtr); - s = lexicon.string(dic, surface); + int wordref = wi.getNormalizedForm(); + int dic = WordId.refDic(wordref, WordId.dic(node.wordId)); + int surfacePtr = lexicon.wordInfos(dic).surfacePtr(WordId.word(wordref)); + s 
= lexicon.string(dic, surfacePtr); normalizedForm = s; } return s; @@ -318,10 +320,10 @@ public String getDictionaryForm(LatticeNodeImpl node) { String s = dictionaryForm; if (s == null) { WordInfo wi = node.getWordInfo(); - int dicEntryPtr = wi.getDictionaryForm(); - int dic = WordId.blendDic(dicEntryPtr, WordId.dic(node.wordId)); - int surface = lexicon.wordInfos(dic).surfacePtr(dicEntryPtr); - s = lexicon.string(dic, surface); + int wordref = wi.getDictionaryForm(); + int dic = WordId.refDic(wordref, WordId.dic(node.wordId)); + int surfacePtr = lexicon.wordInfos(dic).surfacePtr(WordId.word(wordref)); + s = lexicon.string(dic, surfacePtr); dictionaryForm = s; } return s; diff --git a/src/main/java/com/worksap/nlp/sudachi/MorphemeList.java b/src/main/java/com/worksap/nlp/sudachi/MorphemeList.java index 590da60a..81115ead 100644 --- a/src/main/java/com/worksap/nlp/sudachi/MorphemeList.java +++ b/src/main/java/com/worksap/nlp/sudachi/MorphemeList.java @@ -57,6 +57,7 @@ public int size() { return path.size(); } + /** Returns the begin index of the specified morpheme in the input text. */ int getBegin(int index) { int begin = inputText.getOriginalIndex(path.get(index).getBegin()); if (!allowEmptyMorpheme) { @@ -68,6 +69,7 @@ int getBegin(int index) { return begin; } + /** Returns the end index of the specified morpheme in the input text. 
*/ int getEnd(int index) { int end = inputText.getOriginalIndex(path.get(index).getEnd()); if (!allowEmptyMorpheme) { diff --git a/src/main/java/com/worksap/nlp/sudachi/WordId.java b/src/main/java/com/worksap/nlp/sudachi/WordId.java index 13636ab6..a59832b3 100644 --- a/src/main/java/com/worksap/nlp/sudachi/WordId.java +++ b/src/main/java/com/worksap/nlp/sudachi/WordId.java @@ -120,9 +120,45 @@ public static int applyMask(int wordId, int dicIdMask) { return (wordId & MAX_WORD_ID) | dicIdMask; } - public static int blendDic(int rawWordId, int actualDicId) { - int flag = dic(rawWordId); - return flag * actualDicId; + /** Override dictionary part of the word id with given dic id. */ + public static int overrideDic(int wordId, int dicId) { + return applyMask(wordId, dicIdMask(dicId)); + } + + /** + * Resolve dic id to refer. + * + * @param wordRef + * word ref taken from word entry. + * @param actualDicId + * dic id of the dict which the word entry comes from. + * @return dic id which the wordid referring to. + */ + public static int refDic(int wordRef, int actualDicId) { + // 1 if wordref refers to the entry inside same dict, 0 otherwise (i.e. refers + // to system dict entry) + boolean isReferringUser = dic(wordRef) == 1; + if (isReferringUser) { + return actualDicId; + } + return 0; // system dict id + } + + /** + * Fill flag part of word ref with actual dic id. + * + * @param wordRef + * word ref taken from word entry. + * @param actualDicId + * dic id of the dict which the word entry comes from. + * @return dic id which the wordid referring to. + */ + public static int resolveRef(int wordRef, int actualDicId) { + boolean isReferringUser = dic(wordRef) == 1; + if (isReferringUser) { + return overrideDic(wordRef, actualDicId); + } + return wordRef; // dict part is 0 and thus no need to change. } /** @return if given word id represents OOV. 
*/ diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/LexiconSet.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/LexiconSet.java index 090fb709..db83275d 100644 --- a/src/main/java/com/worksap/nlp/sudachi/dictionary/LexiconSet.java +++ b/src/main/java/com/worksap/nlp/sudachi/dictionary/LexiconSet.java @@ -105,12 +105,15 @@ public WordInfo getWordInfo(int wordId) { int dictionaryId = WordId.dic(wordId); int internalId = WordId.word(wordId); WordInfo wordInfo = lexicons.get(dictionaryId).getWordInfo(internalId); + + // resolve wordinfo internal data short posId = wordInfo.getPOSId(); if (dictionaryId > 0 && posId >= systemPartOfSpeechSize) { // user defined part-of-speech wordInfo.setPOSId((short) (wordInfo.getPOSId() - systemPartOfSpeechSize + posOffsets.get(dictionaryId))); } convertSplit(wordInfo.getAunitSplit(), dictionaryId); convertSplit(wordInfo.getBunitSplit(), dictionaryId); + convertSplit(wordInfo.getCunitSplit(), dictionaryId); convertSplit(wordInfo.getWordStructure(), dictionaryId); return wordInfo; } diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/WordInfo.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/WordInfo.java index 05311a47..995290bd 100644 --- a/src/main/java/com/worksap/nlp/sudachi/dictionary/WordInfo.java +++ b/src/main/java/com/worksap/nlp/sudachi/dictionary/WordInfo.java @@ -29,10 +29,10 @@ public class WordInfo { private final short headwordLength; private short posId; - private final int surface; - private final int reading; - private final int normalizedForm; - private final int dictionaryForm; + private final int surface; // StringPtr + private final int reading; // StringPtr + private final int normalizedForm; // word ref + private final int dictionaryForm; // word ref private final int[] aUnitSplit; private final int[] bUnitSplit; private final int[] cUnitSplit; @@ -125,19 +125,24 @@ public void setPOSId(short posId) { } /** - * Returns the entry id of the normalized form of the morpheme. 
+ * Returns the entry ref of the normalized form of the morpheme. The information + * of the dictionary form can be gotten with + * {@link com.worksap.nlp.sudachi.WordId#resolveRef} and + * {@link Lexicon#getWordInfo}. * - * @return the normalized form of the morpheme + * @return the word ref of the normalized form of the morpheme */ public int getNormalizedForm() { return normalizedForm; } /** - * Returns the word ID of the dictionary form of the morpheme. The information - * of the dictionary form can be gotten with {@link Lexicon#getWordInfo} + * Returns the entry ref of the dictionary form of the morpheme. The information + * of the dictionary form can be gotten with + * {@link com.worksap.nlp.sudachi.WordId#resolveRef} and + * {@link Lexicon#getWordInfo}. * - * @return the word ID of the dictionary form of the morpheme + * @return the word ref of the dictionary form of the morpheme */ public int getDictionaryForm() { return dictionaryForm; @@ -171,6 +176,15 @@ public int[] getBunitSplit() { return bUnitSplit; } + /** + * Returns the array of word IDs which the morpheme is compounded of in C mode. + * + * @return the word IDs of C units + */ + public int[] getCunitSplit() { + return cUnitSplit; + } + /** * Returns the array of the morphemes which the morpheme is compounded of. * diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/Lookup2.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/Lookup2.java index 4a6a44ff..d03eba98 100644 --- a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/Lookup2.java +++ b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/Lookup2.java @@ -22,31 +22,65 @@ import java.util.Map; /** - * Utility to look up entries from the list. + * Utility to lookup entries from the list. */ public class Lookup2 { public interface Entry { + /** @return wordid of the entry. */ int pointer(); + /** @return if has given pos and reading. 
*/ boolean matches(short posId, String reading); + /** @return headword of the entry. */ String headword(); } - private final List entries; - // number of reference system dictionary entries. only used to resolve user - // line-no ref. - private final int nbuiltin; + /** Wrapper class to distinguish if the entry is system or user. */ + public class EntryWithFlag implements Entry { + private Entry entry; + boolean isUser; + + EntryWithFlag(Entry entry, boolean isUser) { + this.entry = entry; + this.isUser = isUser; + } + + @Override + public int pointer() { + return entry.pointer(); + } + + @Override + public boolean matches(short posId, String reading) { + return entry.matches(posId, reading); + } + + @Override + public String headword() { + return entry.headword(); + } + } + + // entries + private final List systemEntries; + private final List userEntries; // mapping to entries that have same surfaces - private final Map> bySurface; - - public Lookup2(List entries, int nbuiltin) { - this.entries = entries; - this.nbuiltin = nbuiltin; - HashMap> result = new HashMap<>(entries.size() * 4 / 3); - for (Entry e : entries) { - List sublist = result.computeIfAbsent(e.headword(), x -> new ArrayList<>()); - sublist.add(e); + private final Map> bySurface; + + public Lookup2(List systemEntries, List userEntries) { + this.systemEntries = systemEntries; + this.userEntries = userEntries; + + HashMap> result = new HashMap<>( + (systemEntries.size() + userEntries.size()) * 4 / 3); + for (Entry e : systemEntries) { + List sublist = result.computeIfAbsent(e.headword(), x -> new ArrayList<>()); + sublist.add(new EntryWithFlag(e, false)); + } + for (Entry e : userEntries) { + List sublist = result.computeIfAbsent(e.headword(), x -> new ArrayList<>()); + sublist.add(new EntryWithFlag(e, true)); } bySurface = result; } @@ -56,11 +90,16 @@ public Lookup2(List entries, int nbuiltin) { * the list. 
* * @param index + * @param isUser + * if true, lookup from the user lecixons, otherwise from reference + * system dict. * @return */ - public Entry byIndex(int index, boolean isUser) { - int offset = isUser ? nbuiltin : 0; - return entries.get(index + offset); + public EntryWithFlag byIndex(int index, boolean isUser) { + if (isUser) { + return new EntryWithFlag(userEntries.get(index), true); + } + return new EntryWithFlag(systemEntries.get(index), false); } /** @@ -69,7 +108,7 @@ public Entry byIndex(int index, boolean isUser) { * @param headword * @return */ - public List byHeadword(String headword) { + public List byHeadword(String headword) { return bySurface.get(headword); } @@ -78,7 +117,7 @@ public List byHeadword(String headword) { * * @param e */ - public void add(Entry e) { - bySurface.computeIfAbsent(e.headword(), x -> new ArrayList<>()).add(e); + public void add(Entry e, boolean isUser) { + bySurface.computeIfAbsent(e.headword(), x -> new ArrayList<>()).add(new EntryWithFlag(e, isUser)); } } diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/RawLexicon.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/RawLexicon.java index 022bbe92..b1bfc8a3 100644 --- a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/RawLexicon.java +++ b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/RawLexicon.java @@ -31,8 +31,6 @@ import java.util.ArrayList; import java.util.Iterator; import java.util.List; -import java.util.stream.Collectors; -import java.util.stream.Stream; /** * Dictionary part: Lexicon loaded from csv files. @@ -47,20 +45,19 @@ public class RawLexicon { // full list of word entries, in the order in csv. private final List entries = new ArrayList<>(); + // entries loaded from the referencing system dictionary (for user + // dict build). 
+ private final List preloadedEntries = new ArrayList<>(); private final Index index = new Index(); private final List notIndexed = new ArrayList<>(); private final StringStorage strings = new StringStorage(); - private boolean user = false; + private boolean isUser = false; // offset for next entry private long offset = INITIAL_OFFSET; private boolean runtimeCosts = false; - // entries loaded from the referencing system dictionary (for user - // dict build). - private final List preloadedEntries = new ArrayList<>(); - /** * Preload entries from the lexicon (of the system dictionary). They are only * used to resolve wordref. @@ -69,6 +66,8 @@ public class RawLexicon { * @return number of entries read. */ public int preloadFrom(Lexicon lexicon, Progress progress) { + this.isUser = true; + Ints allIds = new Ints(lexicon.size()); Iterator ids = lexicon.wordIds(0); while (ids.hasNext()) { @@ -82,11 +81,6 @@ public int preloadFrom(Lexicon lexicon, Progress progress) { return preloadedEntries.size(); } - /** Full list of entries in referencing system and target lexicon. */ - private List lookupEntries() { - return Stream.concat(preloadedEntries.stream(), entries.stream()).collect(Collectors.toList()); - } - /** * Read lexicon from InputStream. 
* @@ -110,7 +104,7 @@ public void read(String name, InputStream data, POSTable posTable) throws IOExce public void read(String name, Reader data, POSTable posTable) throws IOException { CSVParser parser = new CSVParser(data); parser.setName(name); - RawLexiconReader reader = new RawLexiconReader(parser, posTable, user); + RawLexiconReader reader = new RawLexiconReader(parser, posTable, isUser); long offset = this.offset; RawWordEntry entry; @@ -164,11 +158,19 @@ public void compile(POSTable pos, BlockLayout layout) throws IOException { layout.block(Blocks.ENTRIES, (p) -> writeEntries(pos, p)); } + private Void writeStrings(BlockOutput blockOutput) throws IOException { + return blockOutput.measured("Strings", (p) -> { + strings.compile(p); + strings.writeCompact(blockOutput.getChannel()); + return null; + }); + } + private Void writeEntries(POSTable pos, BlockOutput blockOutput) throws IOException { return blockOutput.measured("Word Entries", (p) -> { List list = entries; - Lookup2 lookup = new Lookup2(lookupEntries(), preloadedEntries.size()); - WordRef.Parser refParser = WordRef.parser(pos, !user, false, false); + Lookup2 lookup = isUser ? 
new Lookup2(preloadedEntries, list) : new Lookup2(list, new ArrayList<>()); + WordRef.Parser refParser = WordRef.parser(pos, true, false, false); BufferedChannel buf = new BufferedChannel(blockOutput.getChannel(), WordEntryLayout.MAX_LENGTH * 4); buf.position(INITIAL_OFFSET); WordEntryLayout layout = new WordEntryLayout(lookup, strings, refParser, buf); @@ -180,7 +182,7 @@ private Void writeEntries(POSTable pos, BlockOutput blockOutput) throws IOExcept throw new IllegalStateException("expected entry pointer != actual pointer, i=" + i); } // size may increases with phantom entry - size += e.addPhantomEntries(list, lookup); + size += addPhantomEntries(e, list, lookup); ptr = layout.put(e); p.progress(i, size); } @@ -189,12 +191,36 @@ private Void writeEntries(POSTable pos, BlockOutput blockOutput) throws IOExcept }); } - private Void writeStrings(BlockOutput blockOutput) throws IOException { - return blockOutput.measured("Strings", (p) -> { - strings.compile(p); - strings.writeCompact(blockOutput.getChannel()); - return null; - }); + /** + * Add surface-only entry to access via normalized_form reference if necessary. 
+ * + * @param list + * @param lookup + * @return 1 if phantom entry added, 0 otherwise + */ + private int addPhantomEntries(RawWordEntry entry, List list, Lookup2 lookup) { + if (entry.normalizedForm instanceof WordRef.Headword) { + WordRef.Headword ref = (WordRef.Headword) entry.normalizedForm; + if (lookup.byHeadword(ref.getHeadword()) != null) { + return 0; + } + RawWordEntry copy = new RawWordEntry(); + copy.headword = ref.getHeadword(); + copy.reading = copy.headword; + copy.userData = ""; + copy.leftId = -1; + copy.rightId = -1; + copy.cost = Short.MAX_VALUE; + copy.mode = "A"; + copy.posId = entry.posId; + RawWordEntry last = list.get(list.size() - 1); + copy.pointer = RawLexicon.pointer(WordInfoList.wordId2offset(last.pointer) + last.computeExpectedSize()); + list.add(copy); + lookup.add(copy, isUser); + return 1; + } else { + return 0; + } } /** @return number of entries in the TRIE index */ diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/RawWordEntry.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/RawWordEntry.java index 16a74242..e19449ae 100644 --- a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/RawWordEntry.java +++ b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/RawWordEntry.java @@ -133,36 +133,4 @@ public void publishStrings(StringStorage strings) { strings.add(normalized.getHeadword()); } } - - /** - * Add surface-only entry to access via normalized_form reference if necessary. 
- * - * @param list - * @param lookup - * @return 1 if phantom entry added, 0 otherwise - */ - public int addPhantomEntries(List list, Lookup2 lookup) { - if (normalizedForm instanceof WordRef.Headword) { - WordRef.Headword ref = (WordRef.Headword) normalizedForm; - if (lookup.byHeadword(ref.getHeadword()) != null) { - return 0; - } - RawWordEntry copy = new RawWordEntry(); - copy.headword = ref.getHeadword(); - copy.reading = copy.headword; - copy.userData = ""; - copy.leftId = -1; - copy.rightId = -1; - copy.cost = Short.MAX_VALUE; - copy.mode = "A"; - copy.posId = posId; - RawWordEntry last = list.get(list.size() - 1); - copy.pointer = RawLexicon.pointer(WordInfoList.wordId2offset(last.pointer) + last.computeExpectedSize()); - list.add(copy); - lookup.add(copy); - return 1; - } else { - return 0; - } - } } diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/WordRef.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/WordRef.java index 82025b62..ae29dd4d 100644 --- a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/WordRef.java +++ b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/WordRef.java @@ -17,11 +17,11 @@ package com.worksap.nlp.sudachi.dictionary.build; import com.worksap.nlp.sudachi.StringUtil; +import com.worksap.nlp.sudachi.WordId; import com.worksap.nlp.sudachi.dictionary.POS; import java.util.Arrays; import java.util.List; -import java.util.Objects; import java.util.regex.Pattern; /** @@ -31,6 +31,21 @@ public abstract class WordRef { /** resolve word ref into pointer (word id) using resolver. */ public abstract int resolve(Lookup2 resolver); + /** + * Encode the target entry as wordref. + * + * wordref (32 bits) has similar structure as combined word id, but its dict + * part contains a flag that indicates if the referencing entry is in the same + * dict or referencing system dict. 
+ * + * @param entry + * to encode + * @return encoded wordref + */ + public int intoWordRef(Lookup2.EntryWithFlag entry) { + return WordId.make(entry.isUser ? 1 : 0, entry.pointer()); + } + /** * Reference written by line number of the lexicon csv file. */ @@ -49,7 +64,7 @@ public int getLine() { @Override public int resolve(Lookup2 resolver) { - return resolver.byIndex(line, isUser).pointer(); + return intoWordRef(resolver.byIndex(line, isUser)); } @Override @@ -74,8 +89,8 @@ public String getHeadword() { @Override public int resolve(Lookup2 resolver) { - List entries = resolver.byHeadword(headword); - return entries.get(0).pointer(); + List entries = resolver.byHeadword(headword); + return intoWordRef(entries.get(0)); } @Override @@ -112,10 +127,10 @@ public String getReading() { @Override public int resolve(Lookup2 resolver) { - List entries = resolver.byHeadword(headword); - for (Lookup2.Entry entry : entries) { + List entries = resolver.byHeadword(headword); + for (Lookup2.EntryWithFlag entry : entries) { if (entry.matches(posId, reading)) { - return entry.pointer(); + return intoWordRef(entry); } } return -1; From dd1d21e9ddc06736bfdb116fb207312777379ae0 Mon Sep 17 00:00:00 2001 From: mh-northlander Date: Fri, 26 Jul 2024 09:35:04 +0900 Subject: [PATCH 39/94] parse wordRef on csv load --- .../sudachi/dictionary/build/BufWriter.java | 3 +- .../sudachi/dictionary/build/CsvLexicon.java | 39 ++++++++--- .../sudachi/dictionary/build/RawLexicon.java | 14 ++-- .../dictionary/build/RawLexiconReader.java | 48 +++++++++++-- .../dictionary/build/RawWordEntry.java | 55 ++++++++------- .../dictionary/build/WordEntryLayout.java | 68 ++++++------------- .../nlp/sudachi/dictionary/build/WordRef.java | 30 ++++++++ .../dictionary/build/RawLexiconReaderTest.kt | 17 ++--- .../sudachi/dictionary/build/headers-all.csv | 2 +- 9 files changed, 173 insertions(+), 103 deletions(-) diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/BufWriter.java 
b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/BufWriter.java index 25db3cef..644a61e4 100644 --- a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/BufWriter.java +++ b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/BufWriter.java @@ -66,7 +66,7 @@ public BufWriter putVarint32(int val) { } /** - * Envode long as LEB128 + * Encode long as LEB128 * * @param val * value to encode @@ -90,6 +90,7 @@ private void putVarintSlow(long val) { putByte((byte) val); } + /** Encode int array of fixed length. */ public BufWriter putInts(Ints value, int length) { if (length <= 0) { return this; diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/CsvLexicon.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/CsvLexicon.java index fbee88e1..daa4597c 100644 --- a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/CsvLexicon.java +++ b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/CsvLexicon.java @@ -18,8 +18,10 @@ import com.worksap.nlp.sudachi.StringUtil; import com.worksap.nlp.sudachi.WordId; +import com.worksap.nlp.sudachi.dictionary.Ints; import com.worksap.nlp.sudachi.dictionary.POS; import com.worksap.nlp.sudachi.dictionary.WordInfo; +import com.worksap.nlp.sudachi.dictionary.build.RawLexiconReader.Column; import java.io.IOException; import java.nio.ByteBuffer; @@ -40,8 +42,12 @@ public class CsvLexicon implements WriteDictionary { private final List entries = new ArrayList<>(); private WordIdResolver widResolver = null; + // temporal fix + private WordRef.Parser parser; + public CsvLexicon(POSTable pos) { posTable = pos; + parser = WordRef.parser(pos, false, true, false); } public void setResolver(WordIdResolver widResolver) { @@ -90,18 +96,33 @@ RawWordEntry parseLine(List cols) { POS pos = new POS(cols.get(5), cols.get(6), cols.get(7), cols.get(8), cols.get(9), cols.get(10)); short posId = posTable.getId(pos); - entry.aUnitSplitString = cols.get(15); - entry.bUnitSplitString = cols.get(16); - 
entry.wordStructureString = cols.get(17); - checkSplitInfoFormat(entry.aUnitSplitString); - checkSplitInfoFormat(entry.bUnitSplitString); - checkSplitInfoFormat(entry.wordStructureString); - if (cols.get(14).equals("A") && (!entry.aUnitSplitString.equals("*") || !entry.bUnitSplitString.equals("*"))) { + entry.aUnitSplit = parseWordRefs(cols.get(15)); + entry.bUnitSplit = parseWordRefs(cols.get(16)); + entry.wordStructure = parseWordRefs(cols.get(17)); + checkSplitInfoFormat(entry.aUnitSplit); + checkSplitInfoFormat(entry.bUnitSplit); + checkSplitInfoFormat(entry.wordStructure); + if (cols.get(14).equals("A") && (!entry.aUnitSplit.isEmpty() || !entry.bUnitSplit.isEmpty())) { throw new IllegalArgumentException("invalid splitting"); } return entry; } + private List parseWordRefs(String value) { + if (value == null || value.isEmpty() || "*".equals(value)) { + return new ArrayList<>(); + } + String[] parts = value.split("/"); + if (parts.length > Byte.MAX_VALUE) { + throw new IllegalArgumentException("reference list contained more than 127 entries: " + value); + } + List result = new ArrayList<>(parts.length); + for (String part : parts) { + result.add(parser.parse(part)); + } + return result; + } + int[] parseSynonymGids(String str) { if (str.equals("*")) { return new int[0]; @@ -129,8 +150,8 @@ int wordToId(String text) { return widResolver.lookup(headword, posId, reading); } - void checkSplitInfoFormat(String info) { - if (StringUtil.count(info, '/') + 1 > ARRAY_MAX_LENGTH) { + void checkSplitInfoFormat(List info) { + if (info.size() > ARRAY_MAX_LENGTH) { throw new IllegalArgumentException("too many units"); } } diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/RawLexicon.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/RawLexicon.java index b1bfc8a3..eaeadfd3 100644 --- a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/RawLexicon.java +++ b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/RawLexicon.java @@ -155,7 +155,7 
@@ public void compile(POSTable pos, BlockLayout layout) throws IOException { index.compile(layout, notIndexed); // entry layout requires stringstorage to be compiled beforehand. layout.block(Blocks.STRINGS, this::writeStrings); - layout.block(Blocks.ENTRIES, (p) -> writeEntries(pos, p)); + layout.block(Blocks.ENTRIES, this::writeEntries); } private Void writeStrings(BlockOutput blockOutput) throws IOException { @@ -166,14 +166,13 @@ private Void writeStrings(BlockOutput blockOutput) throws IOException { }); } - private Void writeEntries(POSTable pos, BlockOutput blockOutput) throws IOException { + private Void writeEntries(BlockOutput blockOutput) throws IOException { return blockOutput.measured("Word Entries", (p) -> { List list = entries; Lookup2 lookup = isUser ? new Lookup2(preloadedEntries, list) : new Lookup2(list, new ArrayList<>()); - WordRef.Parser refParser = WordRef.parser(pos, true, false, false); BufferedChannel buf = new BufferedChannel(blockOutput.getChannel(), WordEntryLayout.MAX_LENGTH * 4); buf.position(INITIAL_OFFSET); - WordEntryLayout layout = new WordEntryLayout(lookup, strings, refParser, buf); + WordEntryLayout layout = new WordEntryLayout(lookup, strings, buf); int size = list.size(); int ptr = pointer(INITIAL_OFFSET); for (int i = 0; i < size; ++i) { @@ -204,14 +203,9 @@ private int addPhantomEntries(RawWordEntry entry, List list, Looku if (lookup.byHeadword(ref.getHeadword()) != null) { return 0; } - RawWordEntry copy = new RawWordEntry(); + RawWordEntry copy = RawWordEntry.makeEmpty(); copy.headword = ref.getHeadword(); copy.reading = copy.headword; - copy.userData = ""; - copy.leftId = -1; - copy.rightId = -1; - copy.cost = Short.MAX_VALUE; - copy.mode = "A"; copy.posId = entry.posId; RawWordEntry last = list.get(list.size() - 1); copy.pointer = RawLexicon.pointer(WordInfoList.wordId2offset(last.pointer) + last.computeExpectedSize()); diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/RawLexiconReader.java 
b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/RawLexiconReader.java index ee146b08..a66a7bf1 100644 --- a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/RawLexiconReader.java +++ b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/RawLexiconReader.java @@ -17,6 +17,7 @@ package com.worksap.nlp.sudachi.dictionary.build; import com.worksap.nlp.sudachi.dictionary.CSVParser; +import com.worksap.nlp.sudachi.dictionary.Ints; import com.worksap.nlp.sudachi.dictionary.POS; import java.io.IOException; @@ -54,6 +55,7 @@ public enum Column { private final POSTable posTable; private final WordRef.Parser normRefParser; // for normalized form private final WordRef.Parser dictRefParser; // for dictionary form + private final WordRef.Parser splitParser; // for splits public RawLexiconReader(CSVParser parser, POSTable pos, boolean user) throws IOException { this.parser = parser; @@ -62,9 +64,11 @@ public RawLexiconReader(CSVParser parser, POSTable pos, boolean user) throws IOE if (isLegacyColumnLayout()) { normRefParser = WordRef.parser(pos, false, true, false); dictRefParser = WordRef.parser(pos, true, true, true); + splitParser = WordRef.parser(pos, true, false, false); } else { normRefParser = WordRef.parser(pos, false, false, false); dictRefParser = WordRef.parser(pos, !user, false, false); + splitParser = WordRef.parser(pos, false, false, false); } } @@ -146,6 +150,40 @@ private short getShort(List data, Column column) { } } + /** parse specified column as Ints. 
*/ + private Ints getInts(List data, Column column) { + String value = get(data, column, false); + if (value == null || value.isEmpty() || "*".equals(value)) { + return Ints.wrap(Ints.EMPTY_ARRAY); + } + String[] parts = value.split("/"); + if (parts.length > Byte.MAX_VALUE) { + throw new IllegalArgumentException("int list contained more than 127 entries: " + value); + } + Ints result = new Ints(parts.length); + for (String part : parts) { + result.append(Integer.parseInt(part)); + } + return result; + } + + /** parse specified column as WordRef list. */ + private List getWordRefs(List data, Column column, WordRef.Parser parser) { + String value = get(data, column, false); + if (value == null || value.isEmpty() || "*".equals(value)) { + return new ArrayList<>(); + } + String[] parts = value.split("/"); + if (parts.length > Byte.MAX_VALUE) { + throw new IllegalArgumentException("reference list contained more than 127 entries: " + value); + } + List result = new ArrayList<>(parts.length); + for (String part : parts) { + result.add(parser.parse(part)); + } + return result; + } + /** convert csv row to RawWordEntry */ private RawWordEntry convertEntry(List data) { RawWordEntry entry = new RawWordEntry(); @@ -166,11 +204,11 @@ private RawWordEntry convertEntry(List data) { entry.posId = posTable.getId(pos); entry.mode = get(data, Column.Mode, false); - entry.aUnitSplitString = get(data, Column.SplitA, false); - entry.bUnitSplitString = get(data, Column.SplitB, false); - entry.cUnitSplitString = get(data, Column.SplitC, false); - entry.wordStructureString = get(data, Column.WordStructure, false); - entry.synonymGroups = get(data, Column.SynonymGroups, false); + entry.aUnitSplit = getWordRefs(data, Column.SplitA, splitParser); + entry.bUnitSplit = getWordRefs(data, Column.SplitB, splitParser); + entry.cUnitSplit = getWordRefs(data, Column.SplitC, splitParser); + entry.wordStructure = getWordRefs(data, Column.WordStructure, splitParser); + entry.synonymGroups = 
getInts(data, Column.SynonymGroups); entry.userData = get(data, Column.UserData, true); entry.validate(); diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/RawWordEntry.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/RawWordEntry.java index e19449ae..0822d289 100644 --- a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/RawWordEntry.java +++ b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/RawWordEntry.java @@ -16,13 +16,13 @@ package com.worksap.nlp.sudachi.dictionary.build; -import com.worksap.nlp.sudachi.StringUtil; import com.worksap.nlp.sudachi.dictionary.StringPtr; -import com.worksap.nlp.sudachi.dictionary.WordInfo; import com.worksap.nlp.sudachi.dictionary.WordInfoList; +import com.worksap.nlp.sudachi.dictionary.Ints; import java.util.List; import java.util.Objects; +import java.util.ArrayList; /** * Raw word info entry parsed from the lexicon csv. @@ -34,11 +34,11 @@ public class RawWordEntry implements Lookup2.Entry { String reading; WordRef normalizedForm; WordRef dictionaryForm; - String aUnitSplitString; - String bUnitSplitString; - String cUnitSplitString; - String wordStructureString; - String synonymGroups; + List aUnitSplit; + List bUnitSplit; + List cUnitSplit; + List wordStructure; + Ints synonymGroups; String userData; String mode; short leftId; @@ -48,17 +48,6 @@ public class RawWordEntry implements Lookup2.Entry { int sourceLine; String sourceName; - private int countRefs(String data, String prev) { - if (data == null || data.isEmpty() || "*".equals(data) || data.equals(prev)) { - return 0; - } - int nsplits = StringUtil.count(data, '/') + 1; - if (nsplits > CsvLexicon.ARRAY_MAX_LENGTH) { - throw new CsvFieldException("maximum number of splits were exceeded"); - } - return nsplits; - } - /** * Compute expected size of word entry when put in the binary dictionary. This * function additionally validates length of split entries. 
@@ -68,11 +57,11 @@ private int countRefs(String data, String prev) { public int computeExpectedSize() { int size = 32; - size += countRefs(cUnitSplitString, "") * 4; - size += countRefs(bUnitSplitString, cUnitSplitString) * 4; - size += countRefs(aUnitSplitString, bUnitSplitString) * 4; - size += countRefs(wordStructureString, aUnitSplitString) * 4; - size += countRefs(synonymGroups, "") * 4; + size += cUnitSplit.size() * 4; + size += (bUnitSplit.equals(cUnitSplit)) ? 0 : bUnitSplit.size() * 4; + size += (aUnitSplit.equals(bUnitSplit)) ? 0 : aUnitSplit.size() * 4; + size += (wordStructure.equals(aUnitSplit)) ? 0 : wordStructure.size() * 4; + size += synonymGroups.length() * 4; if (userData.length() != 0) { size += 2 + userData.length() * 2; } @@ -133,4 +122,24 @@ public void publishStrings(StringStorage strings) { strings.add(normalized.getHeadword()); } } + + public static RawWordEntry makeEmpty() { + RawWordEntry entry = new RawWordEntry(); + entry.headword = ""; + entry.reading = ""; + // entry.normalizedForm + // entry.dictionaryForm + entry.aUnitSplit = new ArrayList<>(); + entry.bUnitSplit = new ArrayList<>(); + entry.cUnitSplit = new ArrayList<>(); + entry.wordStructure = new ArrayList<>(); + entry.synonymGroups = Ints.wrap(Ints.EMPTY_ARRAY); + entry.userData = ""; + entry.mode = "A"; + entry.leftId = -1; + entry.rightId = -1; + entry.cost = Short.MAX_VALUE; + entry.posId = 0; + return entry; + } } diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/WordEntryLayout.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/WordEntryLayout.java index 19d02efe..b0a60166 100644 --- a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/WordEntryLayout.java +++ b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/WordEntryLayout.java @@ -21,8 +21,6 @@ import com.worksap.nlp.sudachi.dictionary.WordInfoList; import java.io.IOException; -import java.nio.ByteBuffer; -import java.nio.ByteOrder; import java.util.List; /** @@ -30,23 +28,22 @@ 
*/ public class WordEntryLayout { private final StringIndex index; - private final WordRef.Parser wordRefParser; private final Lookup2 lookup; private final BufferedChannel buffer; + + // caches private final Ints aSplits = new Ints(16); private final Ints bSplits = new Ints(16); private final Ints cSplits = new Ints(16); private final Ints wordStructure = new Ints(16); - private final Ints synonymGroups = new Ints(16); public static final int MAX_LENGTH = 32 // minimum size + Byte.MAX_VALUE * Integer.BYTES * 5 // splits and synonyms + (Short.MAX_VALUE + 1) * Character.BYTES; // user data - public WordEntryLayout(Lookup2 resolver, StringIndex index, WordRef.Parser parser, BufferedChannel buffer) { + public WordEntryLayout(Lookup2 resolver, StringIndex index, BufferedChannel buffer) { this.lookup = resolver; this.index = index; - this.wordRefParser = parser; this.buffer = buffer; } @@ -82,11 +79,11 @@ public int put(RawWordEntry entry) throws IOException { // length can't be more than ~4k utf-16 code units so the cast is safe short utf8Len = (short) StringUtil.countUtf8Bytes(entry.headword); - byte cSplitLen = parseWordRefList(entry.cUnitSplitString, "", cSplits); - byte bSplitLen = parseWordRefList(entry.bUnitSplitString, entry.cUnitSplitString, bSplits); - byte aSplitLen = parseWordRefList(entry.aUnitSplitString, entry.bUnitSplitString, aSplits); - byte wordStructureLen = parseWordRefList(entry.wordStructureString, entry.aUnitSplitString, wordStructure); - byte synonymLen = parseIntList(entry.synonymGroups, synonymGroups); + byte cSplitLen = resolveWordRefList(entry.cUnitSplit, null, cSplits); + byte bSplitLen = resolveWordRefList(entry.bUnitSplit, entry.cUnitSplit, bSplits); + byte aSplitLen = resolveWordRefList(entry.aUnitSplit, entry.bUnitSplit, aSplits); + byte wordStructureLen = resolveWordRefList(entry.wordStructure, entry.aUnitSplit, wordStructure); + byte synonymLen = (byte) entry.synonymGroups.length(); int userDataLength = entry.userData.length(); 
buf.putShort(utf8Len); buf.putByte(cSplitLen); @@ -102,7 +99,7 @@ public int put(RawWordEntry entry) throws IOException { buf.putInts(bSplits, bSplitLen); buf.putInts(aSplits, aSplitLen); buf.putInts(wordStructure, wordStructureLen); - buf.putInts(synonymGroups, synonymLen); + buf.putInts(entry.synonymGroups, synonymLen); if (userDataLength != 0) { buf.putShort((short) userDataLength); String userData = entry.userData; @@ -115,52 +112,31 @@ public int put(RawWordEntry entry) throws IOException { return RawLexicon.pointer(position); } - /** parse int list, i.e. synonym group ids */ - private byte parseIntList(String data, Ints result) { - if (data == null || data.isEmpty() || "*".equals(data)) { - result.clear(); - return 0; - } - String[] parts = data.split("/"); - if (parts.length > Byte.MAX_VALUE) { - throw new IllegalArgumentException("reference list contained more than 127 entries: " + data); - } - result.clear(); - for (String part : parts) { - result.append(Integer.parseInt(part)); - } - return (byte) parts.length; - } - /** - * Parse word ref list, i.e. A/B/C split and word structure. + * Resolve wordref list (A/B/C split and word structure) using the Lookup and + * returns its length. * * If it is equivalent to the reference, return -1 without parsing. * - * @param data + * @param refs + * wordref list to resolve. * @param reference + * wordref list of higher split unit. * @param result - * @return + * Ints to save resolved wordrefs. + * @return -1 if equals to reference, otherwise length. 
*/ - byte parseWordRefList(String data, String reference, Ints result) { - if (data == null || data.isEmpty() || "*".equals(data)) { - result.clear(); + private byte resolveWordRefList(List refs, List reference, Ints result) { + result.clear(); + if (refs.isEmpty()) { return 0; } - if (data.equals(reference)) { - result.clear(); + if (refs.equals(reference)) { return -1; } - - String[] parts = data.split("/"); - if (parts.length > Byte.MAX_VALUE) { - throw new IllegalArgumentException("reference list contained more than 127 entries: " + data); - } - result.clear(); - for (String part : parts) { - WordRef ref = wordRefParser.parse(part); + for (WordRef ref : refs) { result.append(ref.resolve(lookup)); } - return (byte) parts.length; + return (byte) refs.size(); } } diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/WordRef.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/WordRef.java index ae29dd4d..48fba347 100644 --- a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/WordRef.java +++ b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/WordRef.java @@ -71,6 +71,16 @@ public int resolve(Lookup2 resolver) { public String toString() { return String.format("WordRef/Line: %s%d", isUser ? 
"U" : "S", line); } + + @Override + public boolean equals(Object other) { + if (this == other) + return true; + if (other == null || getClass() != other.getClass()) + return false; + LineNo o = (LineNo) other; + return (line == o.line) && (isUser == o.isUser); + } } /** @@ -97,6 +107,16 @@ public int resolve(Lookup2 resolver) { public String toString() { return String.format("WordRef/Headword: %s", headword); } + + @Override + public boolean equals(Object other) { + if (this == other) + return true; + if (other == null || getClass() != other.getClass()) + return false; + Headword o = (Headword) other; + return headword.equals(o.headword); + } } /** @@ -140,6 +160,16 @@ public int resolve(Lookup2 resolver) { public String toString() { return String.format("WordRef: %s/%d/%s", headword, posId, reading); } + + @Override + public boolean equals(Object other) { + if (this == other) + return true; + if (other == null || getClass() != other.getClass()) + return false; + Triple o = (Triple) other; + return (headword.equals(o.headword)) && (posId == o.posId) && (reading.equals(o.reading)); + } } private static final Pattern NUMERIC_RE = Pattern.compile("^U?\\d+$"); diff --git a/src/test/java/com/worksap/nlp/sudachi/dictionary/build/RawLexiconReaderTest.kt b/src/test/java/com/worksap/nlp/sudachi/dictionary/build/RawLexiconReaderTest.kt index 3ccc791d..a1a11c16 100644 --- a/src/test/java/com/worksap/nlp/sudachi/dictionary/build/RawLexiconReaderTest.kt +++ b/src/test/java/com/worksap/nlp/sudachi/dictionary/build/RawLexiconReaderTest.kt @@ -22,6 +22,7 @@ import kotlin.test.Test import kotlin.test.assertEquals import kotlin.test.assertNotNull import kotlin.test.assertNull +import kotlin.test.assertTrue class RawLexiconReaderTest { companion object { @@ -37,8 +38,8 @@ class RawLexiconReaderTest { assertNotNull(reader.nextEntry()).let { e -> assertEquals("東京都", e.headword) assertEquals("トウキョウト", e.reading) - assertEquals("5/9", e.wordStructureString) - assertEquals("", 
e.cUnitSplitString) + assertEquals(listOf(WordRef.LineNo(5, false), WordRef.LineNo(9, false)), e.wordStructure) + assertTrue(e.cUnitSplit.isEmpty()) assertEquals("", e.userData) } assertNull(reader.nextEntry()) @@ -50,8 +51,8 @@ class RawLexiconReaderTest { assertNotNull(reader.nextEntry()).let { e -> assertEquals("東京都", e.headword) assertEquals("トウキョウト", e.reading) - assertEquals("5/9", e.wordStructureString) - assertEquals("8/9", e.cUnitSplitString) + assertEquals(listOf(WordRef.LineNo(5, false), WordRef.LineNo(9, false)), e.wordStructure) + assertEquals(listOf(WordRef.LineNo(8, false), WordRef.LineNo(9, false)), e.cUnitSplit) assertEquals("10", e.userData) } assertNull(reader.nextEntry()) @@ -65,10 +66,10 @@ class RawLexiconReaderTest { assertNotNull(reader.nextEntry()).let { e -> assertEquals("東京都", e.headword) assertEquals("トウキョウト", e.reading) - assertEquals("5/9", e.aUnitSplitString) - assertEquals("5/10", e.bUnitSplitString) - assertEquals("5/11", e.cUnitSplitString) - assertEquals("6/7", e.wordStructureString) + assertEquals(listOf(WordRef.LineNo(5, false), WordRef.LineNo(9, false)), e.aUnitSplit) + assertEquals(listOf(WordRef.LineNo(5, false), WordRef.LineNo(10, false)), e.bUnitSplit) + assertEquals(listOf(WordRef.LineNo(5, false), WordRef.LineNo(11, false)), e.cUnitSplit) + assertEquals(listOf(WordRef.LineNo(6, false), WordRef.LineNo(7, false)), e.wordStructure) assertEquals("10", e.userData) } assertNull(reader.nextEntry()) diff --git a/src/test/resources/com/worksap/nlp/sudachi/dictionary/build/headers-all.csv b/src/test/resources/com/worksap/nlp/sudachi/dictionary/build/headers-all.csv index 9b4b3ab3..6e48f820 100644 --- a/src/test/resources/com/worksap/nlp/sudachi/dictionary/build/headers-all.csv +++ b/src/test/resources/com/worksap/nlp/sudachi/dictionary/build/headers-all.csv @@ -1,2 +1,2 @@ 
Surface,LeftId,RightId,Cost,pos1,pos2,pos3,pos4,pos5,pos6,reading_form,normalized_form,DictionaryForm,mode,splita,splitb,splitc,wordstructure,synonymgroups,userdata -東京都,6,8,5320,名詞,固有名詞,地名,一般,*,*,トウキョウト,*,*,B,5/9,5/10,5/11,6/7,8/9,10 \ No newline at end of file +東京都,6,8,5320,名詞,固有名詞,地名,一般,*,*,トウキョウト,,,B,5/9,5/10,5/11,6/7,8/9,10 \ No newline at end of file From e9b516e1dca54be3c7bc44e87d04b19de8cd0e54 Mon Sep 17 00:00:00 2001 From: mh-northlander Date: Fri, 26 Jul 2024 13:59:29 +0900 Subject: [PATCH 40/94] slice dict at lexicon and fix test --- .../nlp/sudachi/dictionary/DoubleArrayLexicon.java | 4 ++-- .../nlp/sudachi/dictionary/WordParameters.java | 8 ++------ .../java/com/worksap/nlp/sudachi/TestDictionary.kt | 13 ++++++++----- 3 files changed, 12 insertions(+), 13 deletions(-) diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/DoubleArrayLexicon.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/DoubleArrayLexicon.java index 58070c6b..8c52cf5e 100644 --- a/src/main/java/com/worksap/nlp/sudachi/dictionary/DoubleArrayLexicon.java +++ b/src/main/java/com/worksap/nlp/sudachi/dictionary/DoubleArrayLexicon.java @@ -51,9 +51,9 @@ public static DoubleArrayLexicon load(ByteBuffer bytes, Description header) { WordParameters parms; if (header.isRuntimeCosts()) { - parms = WordParameters.readWrite(bytes, header); + parms = WordParameters.readWrite(header.slice(bytes, Blocks.ENTRIES)); } else { - parms = WordParameters.readOnly(bytes, header); + parms = WordParameters.readOnly(header.slice(bytes, Blocks.ENTRIES)); } WordIdTable idTable = new WordIdTable(header.slice(bytes, Blocks.WORD_POINTERS)); diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/WordParameters.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/WordParameters.java index 01830674..8405d3bc 100644 --- a/src/main/java/com/worksap/nlp/sudachi/dictionary/WordParameters.java +++ b/src/main/java/com/worksap/nlp/sudachi/dictionary/WordParameters.java @@ -38,15 +38,11 @@ public void 
setCost(int wordId, short cost) { data.putShort(addr, cost); } - public static WordParameters readOnly(ByteBuffer full, Description desc) { - ByteBuffer data = desc.slice(full, Blocks.ENTRIES); - data.order(ByteOrder.LITTLE_ENDIAN); + public static WordParameters readOnly(ByteBuffer data) { return new WordParameters(data); } - public static WordParameters readWrite(ByteBuffer full, Description desc) { - WordParameters ro = readOnly(full, desc); - ByteBuffer roBuf = ro.data; + public static WordParameters readWrite(ByteBuffer roBuf) { int lim = roBuf.limit(); ByteBuffer buf = ByteBuffer.allocate(lim); buf.order(ByteOrder.LITTLE_ENDIAN); diff --git a/src/test/java/com/worksap/nlp/sudachi/TestDictionary.kt b/src/test/java/com/worksap/nlp/sudachi/TestDictionary.kt index 1465aa20..f3f0f7df 100644 --- a/src/test/java/com/worksap/nlp/sudachi/TestDictionary.kt +++ b/src/test/java/com/worksap/nlp/sudachi/TestDictionary.kt @@ -38,17 +38,20 @@ object TestDictionary { chan } + val userDict2Data: MemChannel by lazy { + val chan = MemChannel() + DicBuilder.user().system(systemDict).lexicon(res("/dict/user2.csv")).build(chan) + chan + } + val systemDict: BinaryDictionary get() = BinaryDictionary.loadSystem(systemDictData.buffer()) val userDict1: BinaryDictionary get() = BinaryDictionary.loadUser(userDict1Data.buffer()) - val userDict2: BinaryDictionary by lazy { - val chan = MemChannel() - DicBuilder.user().system(systemDict).lexicon(res("/dict/user2.csv")).build(chan) - BinaryDictionary.loadUser(chan.buffer()) - } + val userDict2: BinaryDictionary + get() = BinaryDictionary.loadUser(userDict2Data.buffer()) fun user0Cfg(): Config { return Config.defaultConfig().clearUserDictionaries().systemDictionary(systemDict) From d3795199950900ae6180de67e4f1850a83d02d90 Mon Sep 17 00:00:00 2001 From: mh-northlander Date: Fri, 26 Jul 2024 16:20:29 +0900 Subject: [PATCH 41/94] fix wordref for user-self-reference --- .../nlp/sudachi/dictionary/WordInfo.java | 5 ++- 
.../sudachi/dictionary/build/RawLexicon.java | 2 +- .../dictionary/build/WordEntryLayout.java | 10 +++-- .../dictionary/DictionaryBuilderTest.java | 39 ++++++++++-------- .../dictionary/UserDictionaryBuilderTest.java | 40 ++++++++++++------- 5 files changed, 59 insertions(+), 37 deletions(-) diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/WordInfo.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/WordInfo.java index 995290bd..a16c0aba 100644 --- a/src/main/java/com/worksap/nlp/sudachi/dictionary/WordInfo.java +++ b/src/main/java/com/worksap/nlp/sudachi/dictionary/WordInfo.java @@ -82,9 +82,10 @@ public WordInfo(short headwordLength, short posId) { } /** - * Returns the text of the morpheme. + * Returns raw string pointer to the text of the morpheme. * - * @return the text of the morpheme + * @return raw string pointer to the text + * @see StringPtr */ public int getSurface() { return surface; diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/RawLexicon.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/RawLexicon.java index eaeadfd3..48ee2ec6 100644 --- a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/RawLexicon.java +++ b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/RawLexicon.java @@ -172,7 +172,7 @@ private Void writeEntries(BlockOutput blockOutput) throws IOException { Lookup2 lookup = isUser ? 
new Lookup2(preloadedEntries, list) : new Lookup2(list, new ArrayList<>()); BufferedChannel buf = new BufferedChannel(blockOutput.getChannel(), WordEntryLayout.MAX_LENGTH * 4); buf.position(INITIAL_OFFSET); - WordEntryLayout layout = new WordEntryLayout(lookup, strings, buf); + WordEntryLayout layout = new WordEntryLayout(lookup, strings, buf, isUser); int size = list.size(); int ptr = pointer(INITIAL_OFFSET); for (int i = 0; i < size; ++i) { diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/WordEntryLayout.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/WordEntryLayout.java index b0a60166..e56b3d2c 100644 --- a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/WordEntryLayout.java +++ b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/WordEntryLayout.java @@ -17,6 +17,7 @@ package com.worksap.nlp.sudachi.dictionary.build; import com.worksap.nlp.sudachi.StringUtil; +import com.worksap.nlp.sudachi.WordId; import com.worksap.nlp.sudachi.dictionary.Ints; import com.worksap.nlp.sudachi.dictionary.WordInfoList; @@ -30,6 +31,7 @@ public class WordEntryLayout { private final StringIndex index; private final Lookup2 lookup; private final BufferedChannel buffer; + private final boolean isUser; // caches private final Ints aSplits = new Ints(16); @@ -41,10 +43,11 @@ public class WordEntryLayout { + Byte.MAX_VALUE * Integer.BYTES * 5 // splits and synonyms + (Short.MAX_VALUE + 1) * Character.BYTES; // user data - public WordEntryLayout(Lookup2 resolver, StringIndex index, BufferedChannel buffer) { + public WordEntryLayout(Lookup2 resolver, StringIndex index, BufferedChannel buffer, boolean isUser) { this.lookup = resolver; this.index = index; this.buffer = buffer; + this.isUser = isUser; } /** @@ -65,11 +68,12 @@ public int put(RawWordEntry entry) throws IOException { buf.putInt(index.resolve(entry.headword).encode()); // surfacePtr buf.putInt(index.resolve(entry.reading).encode()); // readingPtr - int normFormPtr = entry.pointer; + 
int selfWordRef = isUser ? WordId.make(1, entry.pointer) : entry.pointer; + int normFormPtr = selfWordRef; if (entry.normalizedForm != null) { normFormPtr = entry.normalizedForm.resolve(lookup); } - int dicFormPtr = entry.pointer; + int dicFormPtr = selfWordRef; if (entry.dictionaryForm != null) { dicFormPtr = entry.dictionaryForm.resolve(lookup); } diff --git a/src/test/java/com/worksap/nlp/sudachi/dictionary/DictionaryBuilderTest.java b/src/test/java/com/worksap/nlp/sudachi/dictionary/DictionaryBuilderTest.java index 0a45c598..faba5251 100644 --- a/src/test/java/com/worksap/nlp/sudachi/dictionary/DictionaryBuilderTest.java +++ b/src/test/java/com/worksap/nlp/sudachi/dictionary/DictionaryBuilderTest.java @@ -32,6 +32,8 @@ import org.junit.Test; import org.junit.rules.TemporaryFolder; +import com.worksap.nlp.sudachi.WordId; + public class DictionaryBuilderTest { @Rule @@ -52,13 +54,14 @@ public void commandLine() throws IOException { writer.write("東,-1,-1,0,東,名詞,普通名詞,一般,*,*,*,ヒガシ,ひがし,*,A,*,*,*,*\n"); writer.write("京都,0,0,0,京都,名詞,固有名詞,地名,一般,*,*,キョウト,京都,*,A,*,*,*,*\n"); } + int[] wordIds = { 4, 11, 15, 19 }; DictionaryBuilder.main(new String[] { "-o", outputFile.getPath(), "-m", matrixFile.getPath(), "-d", "test", inputFile.getPath() }); try (BinaryDictionary dictionary = new BinaryDictionary(outputFile.getPath())) { - Description header = dictionary.getDictionaryHeader(); + assertTrue(header.isSystemDictionary()); assertThat(header.getComment(), is("test")); Grammar grammar = dictionary.getGrammar(); @@ -68,33 +71,37 @@ public void commandLine() throws IOException { assertThat(grammar.getConnectCost((short) 0, (short) 0), is((short) 200)); Lexicon lexicon = dictionary.getLexicon(); - assertThat(lexicon.size(), is(3)); - long params = lexicon.parameters(0); + assertThat(lexicon.size(), is(4)); // 3 + phantom for "ひがし" + // first entry + int wordId = wordIds[0]; + long params = lexicon.parameters(wordId); assertThat(WordParameters.leftId(params), is((short) 0)); 
assertThat(WordParameters.cost(params), is((short) 0)); - WordInfo info = lexicon.getWordInfo(0); - assertThat(info.getSurface(), is("東京都")); - assertThat(info.getNormalizedForm(), is("東京都")); - assertThat(info.getDictionaryForm(), is(-1)); - assertThat(info.getReadingForm(), is("ヒガシキョウト")); + WordInfo info = lexicon.getWordInfo(wordId); + assertThat(lexicon.string(0, info.getSurface()), is("東京都")); + assertThat(info.getNormalizedForm(), is(WordId.make(0, wordId))); + assertThat(info.getDictionaryForm(), is(WordId.make(0, wordId))); + assertThat(lexicon.string(0, info.getReadingForm()), is("ヒガシキョウト")); assertThat(info.getPOSId(), is((short) 0)); - assertThat(info.getAunitSplit(), is(new int[] { 1, 2 })); + assertThat(info.getAunitSplit(), is(new int[] { wordIds[1], wordIds[2] })); assertThat(info.getBunitSplit().length, is(0)); assertThat(info.getSynonymGroupIds(), is(new int[] { 1, 2 })); Iterator i = lexicon.lookup("東京都".getBytes(StandardCharsets.UTF_8), 0); assertTrue(i.hasNext()); - assertThat(i.next(), is(new int[] { 0, "東京都".getBytes(StandardCharsets.UTF_8).length })); + assertThat(i.next(), is(new int[] { wordId, "東京都".getBytes(StandardCharsets.UTF_8).length })); assertFalse(i.hasNext()); - params = lexicon.parameters(1); + // second entry + wordId = wordIds[1]; + params = lexicon.parameters(wordId); assertThat(WordParameters.leftId(params), is((short) -1)); assertThat(WordParameters.cost(params), is((short) 0)); - info = lexicon.getWordInfo(1); - assertThat(info.getSurface(), is("東")); - assertThat(info.getNormalizedForm(), is("ひがし")); - assertThat(info.getDictionaryForm(), is(-1)); - assertThat(info.getReadingForm(), is("ヒガシ")); + info = lexicon.getWordInfo(wordId); + assertThat(lexicon.string(0, info.getSurface()), is("東")); + assertThat(info.getNormalizedForm(), is(WordId.make(0, wordIds[3]))); // phantom entry + assertThat(info.getDictionaryForm(), is(WordId.make(0, wordId))); + assertThat(lexicon.string(0, info.getReadingForm()), is("ヒガシ")); 
assertThat(info.getPOSId(), is((short) 1)); assertThat(info.getAunitSplit().length, is(0)); assertThat(info.getBunitSplit().length, is(0)); diff --git a/src/test/java/com/worksap/nlp/sudachi/dictionary/UserDictionaryBuilderTest.java b/src/test/java/com/worksap/nlp/sudachi/dictionary/UserDictionaryBuilderTest.java index 811fc860..5ca512f9 100644 --- a/src/test/java/com/worksap/nlp/sudachi/dictionary/UserDictionaryBuilderTest.java +++ b/src/test/java/com/worksap/nlp/sudachi/dictionary/UserDictionaryBuilderTest.java @@ -28,6 +28,8 @@ import java.util.Iterator; import com.worksap.nlp.sudachi.TestDictionary; +import com.worksap.nlp.sudachi.WordId; + import org.junit.Before; import org.junit.Rule; import org.junit.Test; @@ -57,42 +59,50 @@ public void commandLine() throws IOException { "東京都市,0,0,0,東京都市,名詞,固有名詞,地名,一般,*,*,ヒガシキョウトシ,東京都市,*,B,\"東,名詞,普通名詞,一般,*,*,*,ヒガシ/3/U1\",*,\"4/3/市,名詞,普通名詞,一般,*,*,*,シ\",*\n"); writer.write("市,-1,-1,0,市,名詞,普通名詞,一般,*,*,*,シ,市,*,A,*,*,*,*\n"); } + int[] wordIds = { 4, 11 }; + int[] systemWIs = { 4, 8, 12, 16, 21, 25, 29, 35, 39, 43 }; // first 10 UserDictionaryBuilder.main(new String[] { "-o", outputFile.getPath(), "-s", systemDictFile.getPath(), "-d", "test", inputFile.getPath() }); try (BinaryDictionary dictionary = new BinaryDictionary(outputFile.getPath())) { Description header = dictionary.getDictionaryHeader(); + assertTrue(header.isUserDictionary()); assertThat(header.getComment(), is("test")); Lexicon lexicon = dictionary.getLexicon(); assertThat(lexicon.size(), is(2)); - long param = lexicon.parameters(0); + // first entry + int wordId = wordIds[0]; + long param = lexicon.parameters(wordId); assertThat(WordParameters.leftId(param), is((short) 0)); assertThat(WordParameters.cost(param), is((short) 0)); - WordInfo info = lexicon.getWordInfo(0); - assertThat(info.getSurface(), is("東京都市")); - assertThat(info.getNormalizedForm(), is("東京都市")); - assertThat(info.getDictionaryForm(), is(-1)); - assertThat(info.getReadingForm(), is("ヒガシキョウトシ")); 
+ WordInfo info = lexicon.getWordInfo(wordId); + assertThat(lexicon.string(0, info.getSurface()), is("東京都市")); + assertThat(info.getNormalizedForm(), is(WordId.make(1, wordId))); + assertThat(info.getDictionaryForm(), is(WordId.make(1, wordId))); + assertThat(lexicon.string(0, info.getReadingForm()), is("ヒガシキョウトシ")); assertThat(info.getPOSId(), is((short) 3)); - assertThat(info.getAunitSplit(), is(new int[] { 4, 3, 1 | (1 << 28) })); + assertThat(info.getAunitSplit(), is(new int[] { systemWIs[4], systemWIs[3], WordId.make(1, wordIds[1]) })); assertThat(info.getBunitSplit().length, is(0)); - assertThat(info.getWordStructure(), is(new int[] { 4, 3, 1 | (1 << 28) })); + assertThat(info.getWordStructure(), + is(new int[] { systemWIs[4], systemWIs[3], WordId.make(1, wordIds[1]) })); Iterator i = lexicon.lookup("東京都市".getBytes(StandardCharsets.UTF_8), 0); assertTrue(i.hasNext()); - assertThat(i.next(), is(new int[] { 0, "東京都市".getBytes(StandardCharsets.UTF_8).length })); + assertThat(i.next(), is(new int[] { wordId, "東京都市".getBytes(StandardCharsets.UTF_8).length })); assertFalse(i.hasNext()); - param = lexicon.parameters(1); + // second entry + wordId = wordIds[1]; + param = lexicon.parameters(wordId); assertThat(WordParameters.leftId(param), is((short) -1)); assertThat(WordParameters.cost(param), is((short) 0)); - info = lexicon.getWordInfo(1); - assertThat(info.getSurface(), is("市")); - assertThat(info.getNormalizedForm(), is("市")); - assertThat(info.getDictionaryForm(), is(-1)); - assertThat(info.getReadingForm(), is("シ")); + info = lexicon.getWordInfo(wordId); + assertThat(lexicon.string(0, info.getSurface()), is("市")); + assertThat(info.getNormalizedForm(), is(WordId.make(1, wordId))); + assertThat(info.getDictionaryForm(), is(WordId.make(1, wordId))); + assertThat(lexicon.string(0, info.getReadingForm()), is("シ")); assertThat(info.getPOSId(), is((short) 4)); assertThat(info.getAunitSplit().length, is(0)); assertThat(info.getBunitSplit().length, is(0)); From 
498297fe328fa8514fb91ed0be183067fcc0c4f2 Mon Sep 17 00:00:00 2001 From: mh-northlander Date: Fri, 26 Jul 2024 17:44:32 +0900 Subject: [PATCH 42/94] Fix RefexOovProvider --- src/main/java/com/worksap/nlp/sudachi/RegexOovProvider.java | 5 +++-- .../java/com/worksap/nlp/sudachi/RegexOovProviderTest.kt | 1 - 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/main/java/com/worksap/nlp/sudachi/RegexOovProvider.java b/src/main/java/com/worksap/nlp/sudachi/RegexOovProvider.java index df654a0e..612a5972 100644 --- a/src/main/java/com/worksap/nlp/sudachi/RegexOovProvider.java +++ b/src/main/java/com/worksap/nlp/sudachi/RegexOovProvider.java @@ -18,7 +18,6 @@ import com.worksap.nlp.sudachi.dictionary.Grammar; import com.worksap.nlp.sudachi.dictionary.POS; -import com.worksap.nlp.sudachi.dictionary.WordInfo; import java.io.IOException; import java.util.List; @@ -122,7 +121,9 @@ public int provideOOV(InputText inputText, int offset, long otherWords, List Date: Fri, 26 Jul 2024 17:52:34 +0900 Subject: [PATCH 43/94] allow lineno ref for system-split lexicon field --- .../sudachi/dictionary/build/RawLexiconReader.java | 2 +- .../nlp/sudachi/dictionary/build/WordRef.java | 12 ++++++------ 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/RawLexiconReader.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/RawLexiconReader.java index a66a7bf1..5059d632 100644 --- a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/RawLexiconReader.java +++ b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/RawLexiconReader.java @@ -68,7 +68,7 @@ public RawLexiconReader(CSVParser parser, POSTable pos, boolean user) throws IOE } else { normRefParser = WordRef.parser(pos, false, false, false); dictRefParser = WordRef.parser(pos, !user, false, false); - splitParser = WordRef.parser(pos, false, false, false); + splitParser = WordRef.parser(pos, !user, false, false); } } diff --git 
a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/WordRef.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/WordRef.java index 48fba347..67fa88f4 100644 --- a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/WordRef.java +++ b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/WordRef.java @@ -175,21 +175,21 @@ public boolean equals(Object other) { private static final Pattern NUMERIC_RE = Pattern.compile("^U?\\d+$"); /** Alias of WordRef.Parser constructor. */ - public static Parser parser(POSTable posTable, boolean allowNumeric, boolean allowHeadword, + public static Parser parser(POSTable posTable, boolean allowLineNo, boolean allowHeadword, boolean allowNullAsterisk) { - return new Parser(posTable, allowNumeric, allowHeadword, allowNullAsterisk); + return new Parser(posTable, allowLineNo, allowHeadword, allowNullAsterisk); } /** Parser to parse wordref from a string in the lexicon field. */ public static class Parser { private final POSTable posTable; - private final boolean allowNumeric; + private final boolean allowLineNo; private final boolean allowHeadword; private final boolean allowNullAsterisk; - public Parser(POSTable posTable, boolean allowNumeric, boolean allowHeadword, boolean allowNullAsterisk) { + public Parser(POSTable posTable, boolean allowLineNo, boolean allowHeadword, boolean allowNullAsterisk) { this.posTable = posTable; - this.allowNumeric = allowNumeric; + this.allowLineNo = allowLineNo; this.allowHeadword = allowHeadword; this.allowNullAsterisk = allowNullAsterisk; } @@ -200,7 +200,7 @@ public WordRef parse(String text) { return null; } - if (allowNumeric && NUMERIC_RE.matcher(text).matches()) { + if (allowLineNo && NUMERIC_RE.matcher(text).matches()) { boolean isUser = text.charAt(0) == 'U'; int offset = isUser ? 
1 : 0; int lineNum = Integer.parseInt(text.substring(offset)); From a646b0fefeeebaf2712e5926a5881c82a0fbb343 Mon Sep 17 00:00:00 2001 From: mh-northlander Date: Mon, 29 Jul 2024 16:25:11 +0900 Subject: [PATCH 44/94] WordRef.Triple throws exception when resolve target not found --- .../java/com/worksap/nlp/sudachi/dictionary/build/WordRef.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/WordRef.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/WordRef.java index 67fa88f4..ba093b00 100644 --- a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/WordRef.java +++ b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/WordRef.java @@ -153,7 +153,7 @@ public int resolve(Lookup2 resolver) { return intoWordRef(entry); } } - return -1; + throw new IllegalArgumentException("matching entry not found for the " + this.toString()); } @Override From 8fb6bfbb5c03a624c225c9806b455ffaa6e8ff6e Mon Sep 17 00:00:00 2001 From: mh-northlander Date: Mon, 29 Jul 2024 16:29:07 +0900 Subject: [PATCH 45/94] out-of-bound connection id throws exception on load --- .../nlp/sudachi/dictionary/build/DicBuilder.java | 4 +++- .../nlp/sudachi/dictionary/build/RawLexicon.java | 12 +++++++++--- 2 files changed, 12 insertions(+), 4 deletions(-) diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/DicBuilder.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/DicBuilder.java index abfc0175..1520ad7f 100644 --- a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/DicBuilder.java +++ b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/DicBuilder.java @@ -72,9 +72,11 @@ private T self() { */ public T lexicon(String name, IOSupplier input, long size) throws IOException { progress.startBlock(name, nanoTime(), Progress.Kind.BYTE); + short numLeft = connection.nonEmpty() ? connection.getNumLeft() : Short.MAX_VALUE; + short numRight = connection.nonEmpty() ? 
connection.getNumRight() : Short.MAX_VALUE; try (InputStream is = input.get()) { InputStream stream = new ProgressInputStream(is, size, progress); - lexicon.read(name, stream, pos); + lexicon.read(name, stream, pos, numLeft, numRight); } progress.endBlock(size, nanoTime()); return self(); diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/RawLexicon.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/RawLexicon.java index 48ee2ec6..f4ab07df 100644 --- a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/RawLexicon.java +++ b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/RawLexicon.java @@ -89,8 +89,9 @@ public int preloadFrom(Lexicon lexicon, Progress progress) { * @param posTable * @throws IOException */ - public void read(String name, InputStream data, POSTable posTable) throws IOException { - read(name, new InputStreamReader(data, StandardCharsets.UTF_8), posTable); + public void read(String name, InputStream data, POSTable posTable, short numLeft, short numRight) + throws IOException { + read(name, new InputStreamReader(data, StandardCharsets.UTF_8), posTable, numLeft, numRight); } /** @@ -101,7 +102,7 @@ public void read(String name, InputStream data, POSTable posTable) throws IOExce * @param posTable * @throws IOException */ - public void read(String name, Reader data, POSTable posTable) throws IOException { + public void read(String name, Reader data, POSTable posTable, short numLeft, short numRight) throws IOException { CSVParser parser = new CSVParser(data); parser.setName(name); RawLexiconReader reader = new RawLexiconReader(parser, posTable, isUser); @@ -109,6 +110,11 @@ public void read(String name, Reader data, POSTable posTable) throws IOException long offset = this.offset; RawWordEntry entry; while ((entry = reader.nextEntry()) != null) { + if (entry.leftId >= numLeft || entry.rightId >= numRight) { + throw new IllegalArgumentException(String.format("connection id out of range: %d, %d (line %d of %s)", + 
entry.leftId, entry.rightId, entry.sourceLine, entry.sourceName)); + } + entry.publishStrings(strings); entries.add(entry); entry.pointer = pointer(offset); From 3428b9e6db3736450e7fb9b3c6805b1b3633da05 Mon Sep 17 00:00:00 2001 From: mh-northlander Date: Mon, 29 Jul 2024 16:51:54 +0900 Subject: [PATCH 46/94] WordRef.Lineno with 'U' in system resolved to system entry --- .../java/com/worksap/nlp/sudachi/dictionary/build/Lookup2.java | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/Lookup2.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/Lookup2.java index d03eba98..d8a51f00 100644 --- a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/Lookup2.java +++ b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/Lookup2.java @@ -96,7 +96,8 @@ public Lookup2(List systemEntries, List userEn * @return */ public EntryWithFlag byIndex(int index, boolean isUser) { - if (isUser) { + // if userEntries is empty (i.e. building system), ignore isUser flag + if (isUser && !userEntries.isEmpty()) { return new EntryWithFlag(userEntries.get(index), true); } return new EntryWithFlag(systemEntries.get(index), false); From bb56b28028cc29d06fef259102d028d83fe6e273 Mon Sep 17 00:00:00 2001 From: mh-northlander Date: Mon, 29 Jul 2024 16:54:38 +0900 Subject: [PATCH 47/94] remove phantom entry from total entry count --- .../worksap/nlp/sudachi/dictionary/build/RawLexicon.java | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/RawLexicon.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/RawLexicon.java index f4ab07df..50247320 100644 --- a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/RawLexicon.java +++ b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/RawLexicon.java @@ -48,6 +48,7 @@ public class RawLexicon { // entries loaded from the referencing system dictionary (for user // dict build). 
private final List preloadedEntries = new ArrayList<>(); + private int nPhantomEntries = 0; private final Index index = new Index(); private final List notIndexed = new ArrayList<>(); @@ -217,6 +218,7 @@ private int addPhantomEntries(RawWordEntry entry, List list, Looku copy.pointer = RawLexicon.pointer(WordInfoList.wordId2offset(last.pointer) + last.computeExpectedSize()); list.add(copy); lookup.add(copy, isUser); + nPhantomEntries += 1; return 1; } else { return 0; @@ -225,12 +227,12 @@ private int addPhantomEntries(RawWordEntry entry, List list, Looku /** @return number of entries in the TRIE index */ public int getIndexedEntries() { - return this.entries.size() - this.notIndexed.size(); + return this.entries.size() - this.notIndexed.size() - nPhantomEntries; } /** @return number of all entries including non-indexed ones */ public int getTotalEntries() { - return this.entries.size(); + return this.entries.size() - nPhantomEntries; } /** @return if lexicon has entries that need runtime cost caluculation */ From 95243d180a827e989c84f83f2ece3f6d9e03817b Mon Sep 17 00:00:00 2001 From: mh-northlander Date: Mon, 29 Jul 2024 16:57:51 +0900 Subject: [PATCH 48/94] fix related tests --- .../dictionary/build/WordEntryLayout.java | 4 + .../worksap/nlp/sudachi/MorphemeImplTest.kt | 4 +- .../nlp/sudachi/dictionary/DescriptionTest.kt | 7 + .../dictionary/DictionaryBuilderTest.java | 6 +- .../dictionary/DictionaryHeaderTest.java | 4 + .../sudachi/dictionary/GrammarImplTest.java | 6 +- .../dictionary/build/CsvLexiconTest.kt | 2 + .../sudachi/dictionary/build/SystemDicTest.kt | 103 ++++++----- .../sudachi/dictionary/build/UserDicTest.kt | 43 ++--- .../java/com/worksap/nlp/sudachi/morphemes.kt | 16 ++ .../nlp/sudachi/dictionary/build/char.def | 167 ++++++++++++++++++ 11 files changed, 287 insertions(+), 75 deletions(-) create mode 100644 src/test/resources/com/worksap/nlp/sudachi/dictionary/build/char.def diff --git 
a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/WordEntryLayout.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/WordEntryLayout.java index e56b3d2c..d73b9f8d 100644 --- a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/WordEntryLayout.java +++ b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/WordEntryLayout.java @@ -136,6 +136,10 @@ private byte resolveWordRefList(List refs, List reference, Int return 0; } if (refs.equals(reference)) { + // this cannot capture the case different WordRef subclass refers to the same + // entry. + // allow this behaviour for the compatibility to + // {@link RawWordEntry.computeExpectedSize}. return -1; } for (WordRef ref : refs) { diff --git a/src/test/java/com/worksap/nlp/sudachi/MorphemeImplTest.kt b/src/test/java/com/worksap/nlp/sudachi/MorphemeImplTest.kt index 4b2275b9..5b5bf740 100644 --- a/src/test/java/com/worksap/nlp/sudachi/MorphemeImplTest.kt +++ b/src/test/java/com/worksap/nlp/sudachi/MorphemeImplTest.kt @@ -23,9 +23,11 @@ class MorphemeImplTest { @Test fun useToString() { val dic = TestDictionary.user0() + // should be split into す/だ/ち, all of them are OOV val sudachi = dic.create().tokenize("すだち") + // wid of OOV is (0xf, posId) assertEquals( - "MorphemeImpl{begin=0, end=1, surface=す, pos=4/名詞,普通名詞,一般,*,*,*, wid=(0,0)}", + "MorphemeImpl{begin=0, end=1, surface=す, pos=4/名詞,普通名詞,一般,*,*,*, wid=(15,4)}", sudachi[0].toString()) } } diff --git a/src/test/java/com/worksap/nlp/sudachi/dictionary/DescriptionTest.kt b/src/test/java/com/worksap/nlp/sudachi/dictionary/DescriptionTest.kt index 131fa907..c315aaad 100644 --- a/src/test/java/com/worksap/nlp/sudachi/dictionary/DescriptionTest.kt +++ b/src/test/java/com/worksap/nlp/sudachi/dictionary/DescriptionTest.kt @@ -16,6 +16,7 @@ package com.worksap.nlp.sudachi.dictionary +import com.worksap.nlp.sudachi.TestDictionary import com.worksap.nlp.sudachi.dictionary.build.InMemoryChannel import kotlin.test.Test import kotlin.test.assertEquals @@ -42,4 
+43,10 @@ class DescriptionTest { assertEquals(d.blocks[1].start, d2.blocks[1].start) assertEquals(d.blocks[1].size, d2.blocks[1].size) } + + @Test + fun getComment() { + val desc: Description = TestDictionary.systemDict.getDictionaryHeader() + assertEquals(desc.getComment(), "the system dictionary for the unit tests") + } } diff --git a/src/test/java/com/worksap/nlp/sudachi/dictionary/DictionaryBuilderTest.java b/src/test/java/com/worksap/nlp/sudachi/dictionary/DictionaryBuilderTest.java index faba5251..be52c7e3 100644 --- a/src/test/java/com/worksap/nlp/sudachi/dictionary/DictionaryBuilderTest.java +++ b/src/test/java/com/worksap/nlp/sudachi/dictionary/DictionaryBuilderTest.java @@ -54,7 +54,7 @@ public void commandLine() throws IOException { writer.write("東,-1,-1,0,東,名詞,普通名詞,一般,*,*,*,ヒガシ,ひがし,*,A,*,*,*,*\n"); writer.write("京都,0,0,0,京都,名詞,固有名詞,地名,一般,*,*,キョウト,京都,*,A,*,*,*,*\n"); } - int[] wordIds = { 4, 11, 15, 19 }; + int[] wordIds = { 4, 11, 15, 19 }; // 3 + phantom entry (ひがし) DictionaryBuilder.main(new String[] { "-o", outputFile.getPath(), "-m", matrixFile.getPath(), "-d", "test", inputFile.getPath() }); @@ -71,7 +71,7 @@ public void commandLine() throws IOException { assertThat(grammar.getConnectCost((short) 0, (short) 0), is((short) 200)); Lexicon lexicon = dictionary.getLexicon(); - assertThat(lexicon.size(), is(4)); // 3 + phantom for "ひがし" + assertThat(lexicon.size(), is(3)); // first entry int wordId = wordIds[0]; @@ -99,7 +99,7 @@ public void commandLine() throws IOException { assertThat(WordParameters.cost(params), is((short) 0)); info = lexicon.getWordInfo(wordId); assertThat(lexicon.string(0, info.getSurface()), is("東")); - assertThat(info.getNormalizedForm(), is(WordId.make(0, wordIds[3]))); // phantom entry + assertThat(info.getNormalizedForm(), is(WordId.make(0, wordIds[3]))); assertThat(info.getDictionaryForm(), is(WordId.make(0, wordId))); assertThat(lexicon.string(0, info.getReadingForm()), is("ヒガシ")); assertThat(info.getPOSId(), is((short) 
1)); diff --git a/src/test/java/com/worksap/nlp/sudachi/dictionary/DictionaryHeaderTest.java b/src/test/java/com/worksap/nlp/sudachi/dictionary/DictionaryHeaderTest.java index ca71c6d3..0af83c99 100644 --- a/src/test/java/com/worksap/nlp/sudachi/dictionary/DictionaryHeaderTest.java +++ b/src/test/java/com/worksap/nlp/sudachi/dictionary/DictionaryHeaderTest.java @@ -23,6 +23,7 @@ import com.worksap.nlp.sudachi.TestDictionary; import org.junit.Before; +import org.junit.Ignore; import org.junit.Test; public class DictionaryHeaderTest { @@ -34,16 +35,19 @@ public void setUp() throws IOException { } @Test + @Ignore public void getVersion() { assertEquals(DictionaryVersion.SYSTEM_DICT_VERSION_2, header.getVersion()); } @Test + @Ignore public void getCreateTime() { assertTrue(header.getCreateTime() > 0); } @Test + @Ignore public void getDescription() { assertEquals("the system dictionary for the unit tests", header.getDescription()); } diff --git a/src/test/java/com/worksap/nlp/sudachi/dictionary/GrammarImplTest.java b/src/test/java/com/worksap/nlp/sudachi/dictionary/GrammarImplTest.java index 19584b62..3b7a1b4a 100644 --- a/src/test/java/com/worksap/nlp/sudachi/dictionary/GrammarImplTest.java +++ b/src/test/java/com/worksap/nlp/sudachi/dictionary/GrammarImplTest.java @@ -98,10 +98,7 @@ public void getEOSParameter() { @Test public void readBytes() { - ByteBuffer bytes = TestDictionary.INSTANCE.getSystemDictData().buffer(); - DictionaryHeader header = new DictionaryHeader(bytes, 0); - - grammar = new GrammarImpl(bytes, header.storageSize()); + grammar = TestDictionary.INSTANCE.getSystemDict().getGrammar(); assertEquals(8, grammar.getPartOfSpeechSize()); @@ -110,7 +107,6 @@ public void readBytes() { assertEquals(126, grammar.getConnectCost((short) 3, (short) 6)); assertEquals(1180, grammar.getConnectCost((short) 7, (short) 2)); assertEquals(3319, grammar.getConnectCost((short) 5, (short) 7)); - assertEquals(470, grammar.storageSize()); } void buildPartOfSpeech() { diff 
--git a/src/test/java/com/worksap/nlp/sudachi/dictionary/build/CsvLexiconTest.kt b/src/test/java/com/worksap/nlp/sudachi/dictionary/build/CsvLexiconTest.kt index bcf24418..d255ff6d 100644 --- a/src/test/java/com/worksap/nlp/sudachi/dictionary/build/CsvLexiconTest.kt +++ b/src/test/java/com/worksap/nlp/sudachi/dictionary/build/CsvLexiconTest.kt @@ -16,6 +16,7 @@ package com.worksap.nlp.sudachi.dictionary.build +import kotlin.test.Ignore import kotlin.test.Test import kotlin.test.assertEquals import kotlin.test.assertFails @@ -75,6 +76,7 @@ class CsvLexiconTest { } @Test + @Ignore fun failTooManyUnits() { val clex = CsvLexicon(POSTable()) val data = "東京,1,1,2816,東京,名詞,固有名詞,地名,一般,*,*,トウキョウ,東京,*,C,*,*,*,*".split(",") diff --git a/src/test/java/com/worksap/nlp/sudachi/dictionary/build/SystemDicTest.kt b/src/test/java/com/worksap/nlp/sudachi/dictionary/build/SystemDicTest.kt index a70fb39d..d6ded7e4 100644 --- a/src/test/java/com/worksap/nlp/sudachi/dictionary/build/SystemDicTest.kt +++ b/src/test/java/com/worksap/nlp/sudachi/dictionary/build/SystemDicTest.kt @@ -17,9 +17,11 @@ package com.worksap.nlp.sudachi.dictionary.build import com.worksap.nlp.sudachi.dictionary.BinaryDictionary +import com.worksap.nlp.sudachi.dictionary.DictionaryAccess import com.worksap.nlp.sudachi.dictionary.POS import com.worksap.nlp.sudachi.morpheme import com.worksap.nlp.sudachi.res +import com.worksap.nlp.sudachi.setCharacterCategory import com.worksap.nlp.sudachi.wordInfo import kotlin.test.* @@ -51,20 +53,21 @@ class SystemDicTest { repeat(10) { bldr.lexicon(javaClass.getResource("one.csv")) } bldr.lexicon("南,1,1,4675,南,名詞,普通名詞,一般,*,*,*,ミナミ,西,5,C,0/1,2/3,4/5,6/7").build(data) val dic = BinaryDictionary(data.buffer()) - assertEquals(11, dic.lexicon.size()) + (dic as DictionaryAccess).setCharacterCategory(javaClass.getResource("char.def")) + assertEquals(11, dic.lexicon.size()) // 10 + 南 assertEquals(POS("名詞", "普通名詞", "一般", "*", "*", "*"), dic.grammar.getPartOfSpeechString(0)) - val m = 
dic.morpheme(10) + val m = dic.morpheme(44) // 11th word (i.e. 南) val wi = m.wordInfo - assertEquals(m.surface(), "南") - assertEquals(wi.length, 3) - assertEquals(wi.posId, 0) - assertEquals(m.dictionaryForm(), "南") - assertEquals(m.normalizedForm(), "西") - assertEquals(m.readingForm(), "ミナミ") - assertContentEquals(wi.aunitSplit, intArrayOf(0, 1)) - assertContentEquals(wi.bunitSplit, intArrayOf(2, 3)) - assertContentEquals(wi.wordStructure, intArrayOf(4, 5)) - assertContentEquals(m.synonymGroupIds, intArrayOf(6, 7)) + assertEquals("南", m.surface()) + assertEquals(3, wi.length) + assertEquals(0, wi.posId) + assertEquals("東", m.dictionaryForm()) + assertEquals("西", m.normalizedForm()) + assertEquals("ミナミ", m.readingForm()) + assertContentEquals(intArrayOf(4, 8), wi.aunitSplit) + assertContentEquals(intArrayOf(12, 16), wi.bunitSplit) + assertContentEquals(intArrayOf(20, 24), wi.wordStructure) + assertContentEquals(intArrayOf(6, 7), m.synonymGroupIds) } @Test @@ -73,13 +76,15 @@ class SystemDicTest { val data = MemChannel() bldr.lexicon("南,1,1,4675,南,名詞,普通名詞,一般,*,*,*,南,南,*,C,*,*,*,*").build(data) val dic = BinaryDictionary(data.buffer()) + (dic as DictionaryAccess).setCharacterCategory(javaClass.getResource("char.def")) + val wordIds = intArrayOf(4) assertEquals(1, dic.lexicon.size()) assertEquals(POS("名詞", "普通名詞", "一般", "*", "*", "*"), dic.grammar.getPartOfSpeechString(0)) - val m = dic.morpheme(0) - assertEquals(m.surface(), "南") - assertEquals(m.dictionaryForm(), "南") - assertEquals(m.normalizedForm(), "南") - assertEquals(m.readingForm(), "南") + val m = dic.morpheme(wordIds[0]) + assertEquals("南", m.surface()) + assertEquals("南", m.dictionaryForm()) + assertEquals("南", m.normalizedForm()) + assertEquals("南", m.readingForm()) } @Test @@ -96,14 +101,15 @@ class SystemDicTest { bldr .lexicon( """東京,1,1,2816,東京,名詞,固有名詞,地名,一般,*,*,トウキョウ,東京,*,A,*,*,*,* - 東京都,2,2,5320,東京都,名詞,固有名詞,地名,一般,*,*,トウキョウト,東京都,*,B,0/2,*,0/2,* - 
都,2,2,2914,都,名詞,普通名詞,一般,*,*,*,ト,都,*,A,*,*,*,*""".trimIndent()) +東京都,2,2,5320,東京都,名詞,固有名詞,地名,一般,*,*,トウキョウト,東京都,*,B,0/2,*,0/2,* +都,2,2,2914,都,名詞,普通名詞,一般,*,*,*,ト,都,*,A,*,*,*,*""".trimIndent()) .build(data) + val wordIds = intArrayOf(4, 8, 13) val dic = BinaryDictionary(data.buffer()) assertEquals(3, dic.lexicon.size()) - val wi = dic.lexicon.getWordInfo(1) - assertContentEquals(wi.aunitSplit, intArrayOf(0, 2)) - assertContentEquals(wi.wordStructure, intArrayOf(0, 2)) + val wi = dic.lexicon.getWordInfo(wordIds[1]) + assertContentEquals(intArrayOf(wordIds[0], wordIds[2]), wi.aunitSplit) + assertContentEquals(intArrayOf(wordIds[0], wordIds[2]), wi.wordStructure) } @Test @@ -113,14 +119,15 @@ class SystemDicTest { bldr .lexicon( """東京,1,1,2816,東京,名詞,固有名詞,地名,一般,*,*,トウキョウ,東京,*,A,*,*,*,* - 東京都,2,2,5320,東京都,名詞,固有名詞,地名,一般,*,*,トウキョウト,東京都,*,B,"東京,名詞,固有名詞,地名,一般,*,*,トウキョウ/2",*,0/2,* - 都,2,2,2914,都,名詞,普通名詞,一般,*,*,*,ト,都,*,A,*,*,*,*""".trimIndent()) +東京都,2,2,5320,東京都,名詞,固有名詞,地名,一般,*,*,トウキョウト,東京都,*,B,"東京,名詞,固有名詞,地名,一般,*,*,トウキョウ/2",*,0/2,* +都,2,2,2914,都,名詞,普通名詞,一般,*,*,*,ト,都,*,A,*,*,*,*""".trimIndent()) .build(data) + val wordIds = intArrayOf(4, 8, 14) val dic = BinaryDictionary(data.buffer()) assertEquals(3, dic.lexicon.size()) - val wi = dic.lexicon.getWordInfo(1) - assertContentEquals(wi.aunitSplit, intArrayOf(0, 2)) - assertContentEquals(wi.wordStructure, intArrayOf(0, 2)) + val wi = dic.lexicon.getWordInfo(wordIds[1]) + assertContentEquals(intArrayOf(wordIds[0], wordIds[2]), wi.aunitSplit) + assertContentEquals(intArrayOf(wordIds[0], wordIds[2]), wi.wordStructure) } @Test @@ -130,14 +137,15 @@ class SystemDicTest { bldr .lexicon( """東京,1,1,2816,東京,名詞,固有名詞,地名,一般,*,*,トウキョウ,東京,*,A,*,*,*,* - 東京都,2,2,5320,東京都,名詞,固有名詞,地名,一般,*,*,トウキョウト,東京都,*,B,*,0/2,0/2,* - 都,2,2,2914,都,名詞,普通名詞,一般,*,*,*,ト,都,*,A,*,*,*,*""".trimIndent()) +東京都,2,2,5320,東京都,名詞,固有名詞,地名,一般,*,*,トウキョウト,東京都,*,B,*,0/2,0/2,* +都,2,2,2914,都,名詞,普通名詞,一般,*,*,*,ト,都,*,A,*,*,*,*""".trimIndent()) .build(data) + val wordIds = intArrayOf(4, 8, 
14) val dic = BinaryDictionary(data.buffer()) assertEquals(3, dic.lexicon.size()) - val wi = dic.lexicon.getWordInfo(1) - assertContentEquals(wi.bunitSplit, intArrayOf(0, 2)) - assertContentEquals(wi.wordStructure, intArrayOf(0, 2)) + val wi = dic.lexicon.getWordInfo(wordIds[1]) + assertContentEquals(intArrayOf(wordIds[0], wordIds[2]), wi.bunitSplit) + assertContentEquals(intArrayOf(wordIds[0], wordIds[2]), wi.wordStructure) } @Test @@ -147,14 +155,15 @@ class SystemDicTest { bldr .lexicon( """東京,1,1,2816,東京,名詞,固有名詞,地名,一般,*,*,トウキョウ,東京,*,A,*,*,*,* - 東京都,2,2,5320,東京都,名詞,固有名詞,地名,一般,*,*,トウキョウト,東京都,*,B,*,0/2,U0/U2,* - 都,2,2,2914,都,名詞,普通名詞,一般,*,*,*,ト,都,*,A,*,*,*,*""".trimIndent()) +東京都,2,2,5320,東京都,名詞,固有名詞,地名,一般,*,*,トウキョウト,東京都,*,B,*,0/2,U0/U2,* +都,2,2,2914,都,名詞,普通名詞,一般,*,*,*,ト,都,*,A,*,*,*,*""".trimIndent()) .build(data) + val wordIds = intArrayOf(4, 8, 14) val dic = BinaryDictionary(data.buffer()) assertEquals(3, dic.lexicon.size()) - val wi = dic.lexicon.getWordInfo(1) - assertContentEquals(wi.bunitSplit, intArrayOf(0, 2)) - assertContentEquals(wi.wordStructure, intArrayOf(0, 2)) + val wi = dic.lexicon.getWordInfo(wordIds[1]) + assertContentEquals(intArrayOf(wordIds[0], wordIds[2]), wi.bunitSplit) + assertContentEquals(intArrayOf(wordIds[0], wordIds[2]), wi.wordStructure) } @Test @@ -167,8 +176,9 @@ class SystemDicTest { @Test fun failInvalidNumberOfInlineRefFields() { val bldr = DicBuilder.system().matrix(res("test.matrix")) - bldr.lexicon("""東京都,2,2,5320,東京都,名詞,固有名詞,地名,一般,*,*,トウキョウト,東京都,*,B,*,"a,b,c,d,e",*,*""") - assertFails { bldr.build(MemChannel()) } + assertFails { + bldr.lexicon("""東京都,2,2,5320,東京都,名詞,固有名詞,地名,一般,*,*,トウキョウト,東京都,*,B,*,"a,b,c,d,e",*,*""") + } } @Test @@ -176,7 +186,7 @@ class SystemDicTest { val bldr = DicBuilder.system().matrix(res("test.matrix")) bldr.lexicon( """東京,1,1,2816,東京,名詞,固有名詞,地名,一般,*,*,トウキョウ,東京,*,A,*,*,*,* - 東京都,2,2,5320,東京都,名詞,固有名詞,地名,一般,*,*,トウキョウト,東京都,*,B,*,"東京,名詞,固有名詞,地名,一般,*,*,a",*,*""".trimMargin()) 
+東京都,2,2,5320,東京都,名詞,固有名詞,地名,一般,*,*,トウキョウト,東京都,*,B,*,"東京,名詞,固有名詞,地名,一般,*,*,a",*,*""".trimMargin()) assertFails { bldr.build(MemChannel()) } } @@ -193,8 +203,11 @@ class SystemDicTest { val ch = MemChannel() bldr.build(ch) val dic = BinaryDictionary(ch.buffer()) + (dic as DictionaryAccess).setCharacterCategory(javaClass.getResource("char.def")) assertEquals(dic.lexicon.size(), 101) + (0..100).forEach { i -> + val wordId = i * 4 + 4 val istr = String.format("%04x", i) val surf = "a".repeat(1024) + istr val read = "b".repeat(1024) + istr @@ -203,13 +216,13 @@ class SystemDicTest { val surfArray = surf.encodeToByteArray() val iter = dic.lexicon.lookup(surfArray, 0) assertTrue { iter.hasNext() } - assertContentEquals(intArrayOf(i, surfArray.size), iter.next()) + assertContentEquals(intArrayOf(wordId, surfArray.size), iter.next()) assertFalse { iter.hasNext() } - val wi = dic.morpheme(i) - assertEquals(wi.surface(), surf) - assertEquals(wi.readingForm(), read) - assertEquals(wi.normalizedForm(), norm) + val m = dic.morpheme(wordId) + assertEquals(surf, m.surface()) + assertEquals(read, m.readingForm()) + assertEquals(norm, m.normalizedForm()) } } } diff --git a/src/test/java/com/worksap/nlp/sudachi/dictionary/build/UserDicTest.kt b/src/test/java/com/worksap/nlp/sudachi/dictionary/build/UserDicTest.kt index a9c53e9d..ff172787 100644 --- a/src/test/java/com/worksap/nlp/sudachi/dictionary/build/UserDicTest.kt +++ b/src/test/java/com/worksap/nlp/sudachi/dictionary/build/UserDicTest.kt @@ -66,15 +66,15 @@ class UserDicTest { TestDic() .system( """東京,1,1,2816,東京,名詞,固有名詞,地名,一般,*,*,トウキョウ,東京,*,A,*,*,*,* - 都,2,2,2914,都,名詞,普通名詞,一般,*,*,*,ト,都,*,A,*,*,*,*""".trimIndent()) +都,2,2,2914,都,名詞,普通名詞,一般,*,*,*,ト,都,*,A,*,*,*,*""".trimIndent()) .user("""東京都,2,2,5320,東京都,名詞,固有名詞,地名,一般,*,*,トウキョウト,東京都,*,B,0/1,*,0/1,*""".trimIndent()) .load() val da = dic as DictionaryAccess assertEquals(dic.partOfSpeechSize, 2) - val wi = da.morpheme(WordId.make(1, 0)) - assertEquals(wi.surface(), "東京都") - 
assertEquals(wi.readingForm(), "トウキョウト") + val m = da.morpheme(WordId.make(1, 4)) + assertEquals("東京都", m.surface()) + assertEquals("トウキョウト", m.readingForm()) } @Test @@ -84,13 +84,13 @@ class UserDicTest { .system("""東京,1,1,2816,東京,名詞,普通名詞,一般,*,*,*,トウキョウ,東京,*,A,*,*,*,*""".trimIndent()) .user( """東京都,2,2,5320,東京都,名詞,固有名詞,地名,一般,*,*,トウキョウト,東京都,*,B,0/U1,0/U1,0/U1,* - 都,2,2,2914,都,名詞,普通名詞,一般,*,*,*,ト,都,*,A,*,*,*,*""".trimIndent()) +都,2,2,2914,都,名詞,普通名詞,一般,*,*,*,ト,都,*,A,*,*,*,*""".trimIndent()) .load() val da = dic as DictionaryAccess - val wi = da.lexicon.getWordInfo(WordId.make(1, 0)) - assertContentEquals(intArrayOf(0, WordId.make(1, 1)), wi.aunitSplit) - assertContentEquals(intArrayOf(0, WordId.make(1, 1)), wi.bunitSplit) - assertContentEquals(intArrayOf(0, WordId.make(1, 1)), wi.wordStructure) + val wi = da.lexicon.getWordInfo(WordId.make(1, 4)) + assertContentEquals(intArrayOf(4, WordId.make(1, 9)), wi.aunitSplit) + assertContentEquals(intArrayOf(4, WordId.make(1, 9)), wi.bunitSplit) + assertContentEquals(intArrayOf(4, WordId.make(1, 9)), wi.wordStructure) } @Test @@ -100,11 +100,11 @@ class UserDicTest { .system("""東京,1,1,2816,東京,名詞,普通名詞,一般,*,*,*,トウキョウ,東京,*,A,*,*,*,*""".trimIndent()) .user( """東京都,2,2,5320,東京都,名詞,固有名詞,地名,一般,*,*,トウキョウト,東京都,*,B,"東京,名詞,普通名詞,一般,*,*,*,トウキョウ/U1",*,*,* - 都,2,2,2914,都,名詞,普通名詞,一般,*,*,*,ト,都,*,A,*,*,*,*""".trimIndent()) +都,2,2,2914,都,名詞,普通名詞,一般,*,*,*,ト,都,*,A,*,*,*,*""".trimIndent()) .load() val da = dic as DictionaryAccess - val wi = da.lexicon.getWordInfo(WordId.make(1, 0)) - assertContentEquals(intArrayOf(0, WordId.make(1, 1)), wi.aunitSplit) + val wi = da.lexicon.getWordInfo(WordId.make(1, 4)) + assertContentEquals(intArrayOf(4, WordId.make(1, 9)), wi.aunitSplit) } @Test @@ -114,11 +114,11 @@ class UserDicTest { .system("""東京,1,1,2816,東京,名詞,普通名詞,一般,*,*,*,トウキョウ,東京,*,A,*,*,*,*""".trimIndent()) .user( """東京都,2,2,5320,東京都,名詞,固有名詞,地名,一般,*,*,トウキョウト,東京都,*,B,"0/都,名詞,普通名詞,一般,*,*,*,ト",*,*,* - 
都,2,2,2914,都,名詞,普通名詞,一般,*,*,*,ト,都,*,A,*,*,*,*""".trimIndent()) +都,2,2,2914,都,名詞,普通名詞,一般,*,*,*,ト,都,*,A,*,*,*,*""".trimIndent()) .load() val da = dic as DictionaryAccess - val wi = da.lexicon.getWordInfo(WordId.make(1, 0)) - assertContentEquals(intArrayOf(0, WordId.make(1, 1)), wi.aunitSplit) + val wi = da.lexicon.getWordInfo(WordId.make(1, 4)) + assertContentEquals(intArrayOf(4, WordId.make(1, 9)), wi.aunitSplit) } @Test @@ -127,15 +127,16 @@ class UserDicTest { TestDic() .system( """東京,1,1,2816,東京,名詞,固有名詞,地名,一般,*,*,トウキョウ,東京,*,A,*,*,*,* - 都,2,2,2914,都,名詞,普通名詞,一般,*,*,*,ト,都,*,A,*,*,*,*""".trimIndent()) +都,2,2,2914,都,名詞,普通名詞,一般,*,*,*,ト,都,*,A,*,*,*,*""".trimIndent()) .user("""東京都,2,2,5320,東京都,a,b,c,d,e,f,トウキョウト,東京都,*,B,0/1,*,0/1,*""".trimIndent()) .load() val da = dic as DictionaryAccess - val wi = da.morpheme(WordId.make(1, 0)) assertEquals(dic.partOfSpeechSize, 3) - assertEquals(wi.surface(), "東京都") - assertEquals(wi.partOfSpeech(), "a,b,c,d,e,f".pos) + + val m = da.morpheme(WordId.make(1, 4)) + assertEquals("東京都", m.surface()) + assertEquals("a,b,c,d,e,f".pos, m.partOfSpeech()) } @Test @@ -144,7 +145,7 @@ class UserDicTest { TestDic() .system( """東京,1,1,2816,東京,名詞,固有名詞,地名,一般,*,*,トウキョウ,東京,*,A,*,*,*,* - 都,2,2,2914,都,名詞,普通名詞,一般,*,*,*,ト,都,*,A,*,*,*,*""".trimIndent()) +都,2,2,2914,都,名詞,普通名詞,一般,*,*,*,ト,都,*,A,*,*,*,*""".trimIndent()) assertFails { bldr.user("""東京都,2,2,5320,東京都,a,b,c,d,e,f,トウキョウト,東京都,*,B,5/1,*,*,*""".trimIndent()) @@ -157,7 +158,7 @@ class UserDicTest { TestDic() .system( """東京,1,1,2816,東京,名詞,固有名詞,地名,一般,*,*,トウキョウ,東京,*,A,*,*,*,* - 都,2,2,2914,都,名詞,普通名詞,一般,*,*,*,ト,都,*,A,*,*,*,*""".trimIndent()) +都,2,2,2914,都,名詞,普通名詞,一般,*,*,*,ト,都,*,A,*,*,*,*""".trimIndent()) assertFails { bldr.user("""東京都,2,2,5320,東京都,a,b,c,d,e,f,トウキョウト,東京都,*,B,0/U1,*,*,*""".trimIndent()) diff --git a/src/test/java/com/worksap/nlp/sudachi/morphemes.kt b/src/test/java/com/worksap/nlp/sudachi/morphemes.kt index 5c6cd5c6..c9703102 100644 --- a/src/test/java/com/worksap/nlp/sudachi/morphemes.kt +++ 
b/src/test/java/com/worksap/nlp/sudachi/morphemes.kt @@ -16,12 +16,28 @@ package com.worksap.nlp.sudachi +import com.worksap.nlp.sudachi.dictionary.CharacterCategory import com.worksap.nlp.sudachi.dictionary.DictionaryAccess import com.worksap.nlp.sudachi.dictionary.POS import com.worksap.nlp.sudachi.dictionary.WordInfo +import java.net.URL + +fun DictionaryAccess.setCharacterCategory( + url: URL = javaClass.getResource("char.def") +): DictionaryAccess { + val resource = Config.Resource.Classpath(url) + this.grammar.setCharacterCategory(CharacterCategory.load(resource)) + return this +} fun DictionaryAccess.morpheme(id: Int): Morpheme { val node = LatticeNodeImpl(lexicon, 0, id) + node.setRange(0, node.getWordInfo().getLength().toInt()) + + // UTF8InputTextBuilder requires charcat + if (grammar.getCharacterCategory() == null) { + setCharacterCategory() + } val l = MorphemeList( diff --git a/src/test/resources/com/worksap/nlp/sudachi/dictionary/build/char.def b/src/test/resources/com/worksap/nlp/sudachi/dictionary/build/char.def new file mode 100644 index 00000000..b9728b79 --- /dev/null +++ b/src/test/resources/com/worksap/nlp/sudachi/dictionary/build/char.def @@ -0,0 +1,167 @@ +# +# Japanese charcter category map +# +# $Id: char.def 9 2012-12-12 04:13:15Z togiso $; +# + +################################################################################### +# +# CHARACTER CATEGORY DEFINITION +# +# CATEGORY_NAME INVOKE GROUP LENGTH +# +# - CATEGORY_NAME: Name of category. you have to define DEFAULT class. +# - INVOKE: 1/0: always invoke unknown word processing, evan when the word can be found in the lexicon +# - GROUP: 1/0: make a new word by grouping the same chracter category +# - LENGTH: n: 1 to n length new words are added +# +DEFAULT 0 1 0 # DEFAULT is a mandatory category! 
+SPACE 0 1 0 +KANJI 0 0 2 +SYMBOL 1 1 0 +NUMERIC 1 1 0 +ALPHA 1 1 0 +HIRAGANA 0 1 2 +KATAKANA 1 1 2 +KANJINUMERIC 0 1 0 #change INVOKE 1->0 +GREEK 1 1 0 +CYRILLIC 1 1 0 + +################################################################################### +# +# CODE(UCS2) TO CATEGORY MAPPING +# + +# SPACE +0x0020 SPACE # DO NOT REMOVE THIS LINE, 0x0020 is reserved for SPACE +0x000D SPACE +0x0009 SPACE +0x000B SPACE +0x000A SPACE + +# ASCII +0x0021..0x002F SYMBOL #!"#$%&'()*+,-./ +0x0030..0x0039 NUMERIC #0-9 +0x003A..0x0040 SYMBOL #:;<=>?@ +0x0041..0x005A ALPHA #A-Z +0x005B..0x0060 SYMBOL #[\]^_` +0x0061..0x007A ALPHA #a-z +0x007B..0x007E SYMBOL #{|}~ + +# Latin +0x00A1..0x00BF SYMBOL # Latin 1 #¡->¿ +0x00C0..0x00D6 ALPHA # Latin 1 #À->Ö +0x00D7 SYMBOL # Latin 1 #× +0x00D8..0x00F6 ALPHA # Latin 1 #Ø->ö +0x00F7 SYMBOL # Latin 1 #÷ +0x00F8..0x00FF ALPHA # Latin 1 #ø->ÿ +0x0100..0x017F ALPHA # Latin Extended A +0x0180..0x0236 ALPHA # Latin Extended B +0x1E00..0x1EF9 ALPHA # Latin Extended Additional + +# CYRILLIC +0x0400..0x04F9 CYRILLIC #Ѐ->ӹ +0x0500..0x050F CYRILLIC # Cyrillic supplementary + +# GREEK +0x0374..0x03FB GREEK # Greek and Coptic #ʹ->ϻ + +# HIRAGANA +0x3041..0x309F HIRAGANA + +# KATAKANA +#0x30A1..0x30FF KATAKANA +0x30A1..0x30FA KATAKANA +0x30FC..0x30FF KATAKANA +0x31F0..0x31FF KATAKANA # Small KU .. 
Small RO +# 0x30FC KATAKANA HIRAGANA # ー +0x30A1 NOOOVBOW # Small A +0x30A3 NOOOVBOW +0x30A5 NOOOVBOW +0x30A7 NOOOVBOW +0x30A9 NOOOVBOW +0x30E3 NOOOVBOW +0x30E5 NOOOVBOW +0x30E7 NOOOVBOW +0x30EE NOOOVBOW +0x30FB..0x30FE NOOOVBOW + +# Half KATAKANA +0xFF66..0xFF9D KATAKANA +0xFF9E..0xFF9F KATAKANA + +# KANJI +0x2E80..0x2EF3 KANJI # CJK Raidcals Supplement +0x2F00..0x2FD5 KANJI +0x3005 KANJI NOOOVBOW +0x3007 KANJI +0x3400..0x4DB5 KANJI # CJK Unified Ideographs Extention +#0x4E00..0x9FA5 KANJI +0x4E00..0x9FFF KANJI +0xF900..0xFA2D KANJI +0xFA30..0xFA6A KANJI + + +# KANJI-NUMERIC (一 二 三 四 五 六 七 八 九 十 百 千 万 億 兆) +0x4E00 KANJINUMERIC KANJI +0x4E8C KANJINUMERIC KANJI +0x4E09 KANJINUMERIC KANJI +0x56DB KANJINUMERIC KANJI +0x4E94 KANJINUMERIC KANJI +0x516D KANJINUMERIC KANJI +0x4E03 KANJINUMERIC KANJI +0x516B KANJINUMERIC KANJI +0x4E5D KANJINUMERIC KANJI +0x5341 KANJINUMERIC KANJI +0x767E KANJINUMERIC KANJI +0x5343 KANJINUMERIC KANJI +0x4E07 KANJINUMERIC KANJI +0x5104 KANJINUMERIC KANJI +0x5146 KANJINUMERIC KANJI + +# ZENKAKU +0xFF10..0xFF19 NUMERIC +0xFF21..0xFF3A ALPHA +0xFF41..0xFF5A ALPHA +0xFF01..0xFF0F SYMBOL #!->/ +0xFF1A..0xFF20 SYMBOL #:->@ +0xFF3B..0xFF40 SYMBOL #[->` +0xFF5B..0xFF65 SYMBOL #{->・ +0xFFE0..0xFFEF SYMBOL # HalfWidth and Full width Form + +# OTHER SYMBOLS +0x2000..0x206F SYMBOL # General Punctuation +0x2070..0x209F NUMERIC # Superscripts and Subscripts +0x20A0..0x20CF SYMBOL # Currency Symbols +0x20D0..0x20FF SYMBOL # Combining Diaritical Marks for Symbols +0x2100..0x214F SYMBOL # Letterlike Symbols +0x2150..0x218F NUMERIC # Number forms +0x2100..0x214B SYMBOL # Letterlike Symbols +0x2190..0x21FF SYMBOL # Arrow +0x2200..0x22FF SYMBOL # Mathematical Operators +0x2300..0x23FF SYMBOL # Miscellaneuos Technical +0x2460..0x24FF SYMBOL # Enclosed NUMERICs +0x2501..0x257F SYMBOL # Box Drawing +0x2580..0x259F SYMBOL # Block Elements +0x25A0..0x25FF SYMBOL # Geometric Shapes +0x2600..0x26FE SYMBOL # Miscellaneous Symbols +0x2700..0x27BF SYMBOL # Dingbats 
+0x27F0..0x27FF SYMBOL # Supplemental Arrows A +0x27C0..0x27EF SYMBOL # Miscellaneous Mathematical Symbols-A +0x2800..0x28FF SYMBOL # Braille Patterns +0x2900..0x297F SYMBOL # Supplemental Arrows B +0x2B00..0x2BFF SYMBOL # Miscellaneous Symbols and Arrows +0x2A00..0x2AFF SYMBOL # Supplemental Mathematical Operators +0x3300..0x33FF SYMBOL +0x3200..0x32FE SYMBOL # ENclosed CJK Letters and Months +0x3000..0x303F SYMBOL # CJK Symbol and Punctuation +0xFE30..0xFE4F SYMBOL # CJK Compatibility Forms +0xFE50..0xFE6B SYMBOL # Small Form Variants + +# added 2006/3/13 +0x3007 SYMBOL KANJINUMERIC + +# added 2018/11/30 +0x309b..0x309c HIRAGANA KATAKANA # voiced/semi-voiced sound marks + +# END OF TABLE From b843023fb4701c431ec4d6d29e976a71223f6d69 Mon Sep 17 00:00:00 2001 From: mh-northlander Date: Wed, 31 Jul 2024 10:10:44 +0900 Subject: [PATCH 49/94] fix dictionary printer --- .../dictionary/DictionaryHeaderPrinter.java | 11 + .../sudachi/dictionary/DictionaryPrinter.java | 216 +++++++++++------- .../dictionary/build/RawLexiconReader.java | 2 +- .../dictionary/DictionaryHeaderPrinterTest.kt | 90 ++++++++ .../dictionary/DictionaryPrinterTest.kt | 92 ++++++++ ...=> LegacyDictionaryHeaderPrinterTest.java} | 6 +- ....java => LegacyDictionaryPrinterTest.java} | 7 +- 7 files changed, 336 insertions(+), 88 deletions(-) create mode 100644 src/test/java/com/worksap/nlp/sudachi/dictionary/DictionaryHeaderPrinterTest.kt create mode 100644 src/test/java/com/worksap/nlp/sudachi/dictionary/DictionaryPrinterTest.kt rename src/test/java/com/worksap/nlp/sudachi/dictionary/{DictionaryHeaderPrinterTest.java => LegacyDictionaryHeaderPrinterTest.java} (97%) rename src/test/java/com/worksap/nlp/sudachi/dictionary/{DictionaryPrinterTest.java => LegacyDictionaryPrinterTest.java} (97%) diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/DictionaryHeaderPrinter.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/DictionaryHeaderPrinter.java index 25c95a8a..d5c30df7 100644 --- 
a/src/main/java/com/worksap/nlp/sudachi/dictionary/DictionaryHeaderPrinter.java +++ b/src/main/java/com/worksap/nlp/sudachi/dictionary/DictionaryHeaderPrinter.java @@ -35,6 +35,8 @@ private DictionaryHeaderPrinter() { /** print information in the dictionary Description part */ static void printDescription(String filename, PrintStream output) throws IOException { + output.printf("File: %s%n", filename); + ByteBuffer bytes; try (FileInputStream input = new FileInputStream(filename); FileChannel inputFile = input.getChannel()) { bytes = inputFile.map(FileChannel.MapMode.READ_ONLY, 0, inputFile.size()); @@ -42,6 +44,15 @@ static void printDescription(String filename, PrintStream output) throws IOExcep } Description desc = Description.load(bytes); + if (desc.isSystemDictionary()) { + output.println("type: system dictionary"); + } else if (desc.isUserDictionary()) { + output.println("type: user dictionary"); + } else { + // should not happen + output.println("invalid file"); + return; + } output.printf("Creation time: %s%n", desc.getCreationTime()); output.printf("Comment: %s%n", desc.getComment()); output.printf("Signature: %s%n", desc.getSignature()); diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/DictionaryPrinter.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/DictionaryPrinter.java index 58c4347f..e8c8f6b9 100644 --- a/src/main/java/com/worksap/nlp/sudachi/dictionary/DictionaryPrinter.java +++ b/src/main/java/com/worksap/nlp/sudachi/dictionary/DictionaryPrinter.java @@ -16,6 +16,7 @@ package com.worksap.nlp.sudachi.dictionary; +import com.worksap.nlp.sudachi.WordId; import com.worksap.nlp.sudachi.dictionary.build.RawLexiconReader.Column; import java.io.IOException; @@ -27,13 +28,19 @@ import java.util.stream.Collectors; public class DictionaryPrinter { + public final char WordRefDelimiter = '/'; + public final String WordRefDelimiterStr = String.valueOf(WordRefDelimiter); + public final char WordRefJoiner = ','; + public final String 
WordRefJoinerStr = String.valueOf(WordRefJoiner); private final PrintStream output; + private final boolean isUser; private final BinaryDictionary dic; private final BinaryDictionary base; private final GrammarImpl grammar; - private final DoubleArrayLexicon lex; + private final LexiconSet lex; + // sorted raw word ids taken from the target dict. private final Ints wordIds; private DictionaryPrinter(PrintStream output, BinaryDictionary dic, BinaryDictionary base) { @@ -41,22 +48,25 @@ private DictionaryPrinter(PrintStream output, BinaryDictionary dic, BinaryDictio this.dic = dic; this.base = base; - if (base != null) { - GrammarImpl grammar = base.getGrammar(); - grammar.addPosList(dic.getGrammar()); - this.grammar = grammar; - } else { + if (base == null) { + isUser = false; grammar = dic.getGrammar(); - } + lex = new LexiconSet(dic.getLexicon(), grammar.getSystemPartOfSpeechSize()); + } else { + isUser = true; + grammar = base.getGrammar(); + lex = new LexiconSet(base.getLexicon(), grammar.getSystemPartOfSpeechSize()); - lex = dic.getLexicon(); + lex.add(dic.getLexicon(), (short) grammar.getPartOfSpeechSize()); + grammar.addPosList(dic.getGrammar()); + } // in order to output dictionary entries in in-dictionary order we need to sort - // them - // iterator over them will get them not in the sorted order, but grouped by - // surface (and sorted in groups) - Ints allIds = new Ints(lex.size()); - Iterator ids = lex.wordIds(0); + // them. iterator over them will get them not in the sorted order, but grouped + // by surface (and sorted in groups). 
+ DoubleArrayLexicon targetLex = dic.getLexicon(); + Ints allIds = new Ints(targetLex.size()); + Iterator ids = targetLex.wordIds(0); while (ids.hasNext()) { allIds.appendAll(ids.next()); } @@ -67,8 +77,8 @@ private DictionaryPrinter(PrintStream output, BinaryDictionary dic, BinaryDictio void printHeader() { // @formatter:off printColumnHeaders(Column.Surface, Column.LeftId, Column.RightId, Column.Cost, Column.Pos1, Column.Pos2, - Column.Pos3, Column.Pos4, Column.Pos5, Column.Pos6, Column.ReadingForm, Column.DictionaryForm, - Column.NormalizedForm, Column.Mode, Column.SplitA, Column.SplitB, Column.SplitC, Column.WordStructure, + Column.Pos3, Column.Pos4, Column.Pos5, Column.Pos6, Column.ReadingForm, Column.NormalizedForm, + Column.DictionaryForm, Column.SplitA, Column.SplitB, Column.SplitC, Column.WordStructure, Column.SynonymGroups, Column.UserData); // @formatter:on } @@ -79,23 +89,29 @@ void printColumnHeaders(Column... headers) { if (isFirst) { isFirst = false; } else { - output.print(","); + output.print(','); } output.print(c.name()); } output.println(); } + private void printEntries() { + // id of the target dic in LexiconSet + for (int i = 0; i < wordIds.length(); ++i) { + printEntry(wordIds.get(i)); + } + } + void printEntry(int wordId) { + int dic = WordId.dic(wordId); WordInfo info = lex.getWordInfo(wordId); POS pos = grammar.getPartOfSpeechString(info.getPOSId()); long params = lex.parameters(wordId); short leftId = WordParameters.leftId(params); short rightId = WordParameters.rightId(params); short cost = WordParameters.cost(params); - String surface = lex.string(0, info.getSurface()); - String reading = lex.string(0, info.getReadingForm()); - field(surface); + field(lex.string(dic, info.getSurface())); field(leftId); field(rightId); field(cost); @@ -105,104 +121,113 @@ void printEntry(int wordId) { field(pos.get(3)); field(pos.get(4)); field(pos.get(5)); - field(reading); - entryPtr(info.getNormalizedForm(), ","); - entryPtr(info.getDictionaryForm(), 
","); - // TODO: - field(""); // mode - field(""); // C split - field(""); // B split - field(""); // A split - field(""); // Word structure - field(""); // sysnonym groups - field(""); // user data + field(lex.string(dic, info.getReadingForm())); + field(wordRef(info.getNormalizedForm(), wordId)); + field(wordRef(info.getDictionaryForm(), wordId)); + field(wordRefList(info.getAunitSplit())); + field(wordRefList(info.getBunitSplit())); + field(wordRefList(info.getCunitSplit())); + field(wordRefList(info.getWordStructure())); + field(intList(info.getSynonymGroupIds())); // synonym groups + lastField(info.getUserData()); // user data output.print("\n"); } - void entryPtr(int wordId, String delimiter) { - WordInfo info = lex.getWordInfo(wordId); - POS pos = grammar.getPartOfSpeechString(info.getPOSId()); - String surface = lex.string(0, info.getSurface()); - String reading = lex.string(0, info.getReadingForm()); - ptrPart(surface, "-"); - ptrPart(pos.get(0), "-"); - ptrPart(pos.get(1), "-"); - ptrPart(pos.get(2), "-"); - ptrPart(pos.get(3), "-"); - ptrPart(pos.get(4), "-"); - ptrPart(pos.get(5), "-"); - ptrPart(reading, ""); - output.print(delimiter); - } - - void ptrPart(String part, String delimiter) { - output.print(part); - output.print(delimiter); - } - void field(short value) { output.print(value); output.print(','); } void field(String value) { - output.print(maybeQuoteField(value)); + output.print(maybeEscapeString(value)); output.print(','); } - private String maybeQuoteField(String value) { - boolean hasCommas = value.indexOf(',') != -1; - boolean hasQuotes = value.indexOf('"') != -1; - if (hasCommas || hasQuotes) { - return escape(value, hasQuotes); - } - return value; + void lastField(String value) { + output.print(maybeEscapeString(value)); } - private String maybeQuoteRefPart(String value) { - if (value.indexOf(',') != -1 || value.indexOf('"') != -1 || value.indexOf('-') != -1 - || value.indexOf('/') != -1) { - return fullEscape(value); + /** + * encode 
word entry pointed by the wordId as WordRef.Triple. If it points to + * self, return empty string. + */ + String wordRef(int wordId, int reference) { + if (wordId == reference) { + return ""; } - return value; + return wordRef(wordId); + } + + /** encode word entry pointed by the wordId as WordRef.Triple. */ + String wordRef(int wordId) { + WordInfo info = lex.getWordInfo(wordId); + POS pos = grammar.getPartOfSpeechString(info.getPOSId()); + int dic = WordId.dic(wordId); + String surface = lex.string(dic, info.getSurface()); + String reading = lex.string(dic, info.getReadingForm()); + + List parts = new ArrayList<>(1 + POS.DEPTH + 1); + parts.add(surface); + parts.addAll(pos); + parts.add(reading); + + // escape special chars + String wordRefTriple = String.join(WordRefJoinerStr, + parts.stream().map(p -> maybeEscapeRefPart(p)).collect(Collectors.toList())); + return wordRefTriple; + } + + String wordRefList(int[] wordIds) { + return String.join(WordRefDelimiterStr, + Arrays.stream(wordIds).boxed().map(wi -> wordRef(wi)).collect(Collectors.toList())); + } + + String intList(int[] ints) { + return String.join("/", Arrays.stream(ints).boxed().map(i -> i.toString()).collect(Collectors.toList())); + } + + private static boolean hasCh(String value, int ch) { + return value.indexOf(ch) != -1; } - private String escape(String value, boolean hasQuotes) { + /** escape string field of csv. */ + private String maybeEscapeString(String value) { + boolean hasCommas = hasCh(value, ','); + boolean hasQuotes = hasCh(value, '"'); + if (!hasCommas && !hasQuotes) { + return value; + } if (hasQuotes) { - return fullEscape(value); + return "\"" + unicodeEscape(value, Arrays.asList('"')) + "\""; } - // only commas return "\"" + value + "\""; } - private String fullEscape(String value) { + /** escape WordRef.Triple part. 
*/ + private String maybeEscapeRefPart(String value) { + boolean hasDelimiter = hasCh(value, WordRefDelimiter); + boolean hasJoiner = hasCh(value, WordRefJoiner); + if (!hasDelimiter && !hasJoiner) { + return value; + } + return unicodeEscape(value, Arrays.asList(WordRefDelimiter, WordRefJoiner)); + } + + /** escape specified chars as unicode codepoint */ + private String unicodeEscape(String value, List targetChars) { StringBuilder sb = new StringBuilder(value.length() + 10); int len = value.length(); for (int i = 0; i < len; ++i) { char c = value.charAt(i); - if (c != '"' && c != '-' && c != ',' && c != '/') { - sb.append(c); - } else { + if (targetChars.contains(c)) { sb.append("\\u{").append(Integer.toHexString(c)).append('}'); + } else { + sb.append(c); } } return sb.toString(); } - private void printEntries() { - for (int i = 0; i < wordIds.length(); ++i) { - printEntry(wordIds.get(i)); - } - } - - static void printDictionary(String filename, BinaryDictionary systemDict, PrintStream output) throws IOException { - try (BinaryDictionary dictionary = new BinaryDictionary(filename)) { - DictionaryPrinter dp = new DictionaryPrinter(output, dictionary, systemDict); - dp.printHeader(); - dp.printEntries(); - } - } - static char getUnitType(WordInfo info) { if (info.getAunitSplit().length == 0) { return 'A'; @@ -223,6 +248,27 @@ static String splitToString(int[] split) { } } + static void printDictionary(String filename, BinaryDictionary systemDict, PrintStream output) throws IOException { + try (BinaryDictionary dictionary = new BinaryDictionary(filename)) { + DictionaryPrinter dp; + if (dictionary.getDictionaryHeader().isUserDictionary()) { + if (systemDict == null) { + throw new IllegalArgumentException( + "System dictionary (`-s` option) is required to print user dictionary: " + filename); + } + dp = new DictionaryPrinter(output, dictionary, systemDict); + } else if (dictionary.getDictionaryHeader().isSystemDictionary()) { + dp = new DictionaryPrinter(output, 
dictionary, null); + } else { + // should not happen + throw new IllegalStateException("Invalid dictionary"); + } + + dp.printHeader(); + dp.printEntries(); + } + } + /** * Prints the contents of dictionary. * diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/RawLexiconReader.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/RawLexiconReader.java index 5059d632..1c33e23f 100644 --- a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/RawLexiconReader.java +++ b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/RawLexiconReader.java @@ -39,7 +39,7 @@ public class RawLexiconReader { public enum Column { Surface(true), LeftId(true), RightId(true), Cost(true), Writing(false), Pos1(true), Pos2(true), Pos3( true), Pos4(true), Pos5(true), Pos6(true), ReadingForm(true), NormalizedForm(true), DictionaryForm( - true), Mode(true), SplitA(true), SplitB( + true), Mode(false), SplitA(true), SplitB( true), WordStructure(true), SynonymGroups(false), SplitC(false), UserData(false); private final boolean required; diff --git a/src/test/java/com/worksap/nlp/sudachi/dictionary/DictionaryHeaderPrinterTest.kt b/src/test/java/com/worksap/nlp/sudachi/dictionary/DictionaryHeaderPrinterTest.kt new file mode 100644 index 00000000..54e3cc1b --- /dev/null +++ b/src/test/java/com/worksap/nlp/sudachi/dictionary/DictionaryHeaderPrinterTest.kt @@ -0,0 +1,90 @@ +/* + * Copyright (c) 2024 Works Applications Co., Ltd. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.worksap.nlp.sudachi.dictionary + +import com.worksap.nlp.sudachi.TestDictionary +import com.worksap.nlp.sudachi.Utils +import java.io.ByteArrayOutputStream +import java.io.PrintStream +import java.lang.IllegalArgumentException +import java.nio.file.Path +import kotlin.io.path.createTempDirectory +import kotlin.test.BeforeTest +import kotlin.test.Test +import kotlin.test.assertTrue + +class DictionaryHeaderPrinterTest { + lateinit var tempDir: Path + + @BeforeTest + fun setup() { + tempDir = createTempDirectory() + TestDictionary.systemDictData.writeData(tempDir.resolve("system.dic")) + TestDictionary.userDict1Data.writeData(tempDir.resolve("user.dic")) + Utils.copyResource(tempDir, "/unk.def") + } + + @Test + fun printSystemHeader() { + val filename = tempDir.resolve("system.dic").toString() + val output = ByteArrayOutputStream() + val ps = PrintStream(output) + DictionaryHeaderPrinter.printDescription(filename, ps) + val lines = output.toString().split(System.lineSeparator()) + + assertTrue(lines[0].contains(filename)) + assertTrue(lines[1].contains("system")) + assertTrue(lines[2].startsWith("Creation time: ")) + assertTrue(lines[3].equals("Comment: the system dictionary for the unit tests")) + assertTrue(lines[4].startsWith("Signature: ")) + assertTrue(lines[5].equals("Reference: ")) + } + + @Test + fun printUserHeader() { + val filename = tempDir.resolve("user.dic").toString() + val output = ByteArrayOutputStream() + val ps = PrintStream(output) + DictionaryHeaderPrinter.printDescription(filename, ps) + val lines = output.toString().split(System.lineSeparator()) + + assertTrue(lines[0].contains(filename)) + assertTrue(lines[1].contains("user")) + assertTrue(lines[2].startsWith("Creation time: ")) + assertTrue(lines[3].equals("Comment: ")) + assertTrue(lines[4].startsWith("Signature: ")) + assertTrue(lines[5].startsWith("Reference: ")) + 
} + + @Test + fun failToPrintInvalidHeader() { + val filename = tempDir.resolve("unk.def").toString() + val output = ByteArrayOutputStream() + val ps = PrintStream(output) + + var exceptionThrown = false + try { + DictionaryHeaderPrinter.printDescription(filename, ps) + } catch (e: IllegalArgumentException) { + exceptionThrown = true + } + val lines = output.toString().split(System.lineSeparator()) + + assertTrue(lines[0].contains(filename)) + assertTrue(exceptionThrown) + } +} diff --git a/src/test/java/com/worksap/nlp/sudachi/dictionary/DictionaryPrinterTest.kt b/src/test/java/com/worksap/nlp/sudachi/dictionary/DictionaryPrinterTest.kt new file mode 100644 index 00000000..ac8b9c68 --- /dev/null +++ b/src/test/java/com/worksap/nlp/sudachi/dictionary/DictionaryPrinterTest.kt @@ -0,0 +1,92 @@ +/* + * Copyright (c) 2024 Works Applications Co., Ltd. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package com.worksap.nlp.sudachi.dictionary + +import com.worksap.nlp.sudachi.TestDictionary +import com.worksap.nlp.sudachi.Utils +import java.io.ByteArrayOutputStream +import java.io.PrintStream +import java.nio.file.Path +import kotlin.io.path.createTempDirectory +import kotlin.test.BeforeTest +import kotlin.test.Test +import kotlin.test.assertEquals +import kotlin.test.assertFails + +class DictionaryPrinterTest { + lateinit var tempDir: Path + + @BeforeTest + fun setup() { + tempDir = createTempDirectory() + TestDictionary.systemDictData.writeData(tempDir.resolve("system.dic")) + TestDictionary.userDict1Data.writeData(tempDir.resolve("user.dic")) + Utils.copyResource(tempDir, "/unk.def") + } + + @Test + fun printSystemDict() { + val filename = tempDir.resolve("system.dic").toString() + val output = ByteArrayOutputStream() + val ps = PrintStream(output) + DictionaryPrinter.printDictionary(filename, null, ps) + val lines = output.toString().split(System.lineSeparator()) + + assertEquals(41, lines.size) // header + entries + trailing new line + assertEquals( + "Surface,LeftId,RightId,Cost,Pos1,Pos2,Pos3,Pos4,Pos5,Pos6,ReadingForm,NormalizedForm,DictionaryForm,SplitA,SplitB,SplitC,WordStructure,SynonymGroups,UserData", + lines[0]) + assertEquals("た,1,1,8729,助動詞,*,*,*,助動詞-タ,終止形-一般,タ,,,,,,,,", lines[1]) + assertEquals("に,2,2,11406,助詞,接続助詞,*,*,*,*,ニ,,,,,,,,", lines[2]) + } + + @Test + fun printUserDict() { + val filename = tempDir.resolve("user.dic").toString() + val output = ByteArrayOutputStream() + val ps = PrintStream(output) + DictionaryPrinter.printDictionary(filename, TestDictionary.systemDict, ps) + val lines = output.toString().split(System.lineSeparator()) + + assertEquals(6, lines.size) // header + entries + trailing new line + assertEquals( + "Surface,LeftId,RightId,Cost,Pos1,Pos2,Pos3,Pos4,Pos5,Pos6,ReadingForm,NormalizedForm,DictionaryForm,SplitA,SplitB,SplitC,WordStructure,SynonymGroups,UserData", + lines[0]) + assertEquals( + 
"東京府,6,6,2816,名詞,固有名詞,地名,一般,*,*,トウキョウフ,,,\"東京,名詞,固有名詞,地名,一般,*,*,トウキョウ/府,名詞,普通名詞,一般,*,*,*,フ\",,,\"東京,名詞,固有名詞,地名,一般,*,*,トウキョウ/府,名詞,普通名詞,一般,*,*,*,フ\",1/3,", + lines[3]) + assertEquals("すだち,6,6,2816,被子植物門,双子葉植物綱,ムクロジ目,ミカン科,ミカン属,スダチ,スダチ,,,,,,,,", lines[4]) + } + + @Test + fun printUserDictWithoutSystem() { + val filename = tempDir.resolve("user.dic").toString() + val output = ByteArrayOutputStream() + val ps = PrintStream(output) + + assertFails { DictionaryPrinter.printDictionary(filename, null, ps) } + } + + @Test + fun failToPrintInvalidFile() { + val filename = tempDir.resolve("unk.def").toString() + val output = ByteArrayOutputStream() + val ps = PrintStream(output) + + assertFails { DictionaryPrinter.printDictionary(filename, TestDictionary.systemDict, ps) } + } +} diff --git a/src/test/java/com/worksap/nlp/sudachi/dictionary/DictionaryHeaderPrinterTest.java b/src/test/java/com/worksap/nlp/sudachi/dictionary/LegacyDictionaryHeaderPrinterTest.java similarity index 97% rename from src/test/java/com/worksap/nlp/sudachi/dictionary/DictionaryHeaderPrinterTest.java rename to src/test/java/com/worksap/nlp/sudachi/dictionary/LegacyDictionaryHeaderPrinterTest.java index d5db6b8a..b69f6bca 100644 --- a/src/test/java/com/worksap/nlp/sudachi/dictionary/DictionaryHeaderPrinterTest.java +++ b/src/test/java/com/worksap/nlp/sudachi/dictionary/LegacyDictionaryHeaderPrinterTest.java @@ -29,11 +29,12 @@ import com.worksap.nlp.sudachi.Utils; import org.junit.Before; +import org.junit.Ignore; import org.junit.Rule; import org.junit.Test; import org.junit.rules.TemporaryFolder; -public class DictionaryHeaderPrinterTest { +public class LegacyDictionaryHeaderPrinterTest { @Rule public TemporaryFolder temporaryFolder = new TemporaryFolder(); @@ -46,6 +47,7 @@ public void setUp() throws IOException { } @Test + @Ignore public void printHeaderWithSystemDict() throws IOException { File inputFile = new File(temporaryFolder.getRoot(), "system.dic"); String[] actuals; @@ -61,6 +63,7 @@ public 
void printHeaderWithSystemDict() throws IOException { } @Test + @Ignore public void printHeaderWithUserDict() throws IOException { File inputFile = new File(temporaryFolder.getRoot(), "user.dic"); String[] actuals; @@ -76,6 +79,7 @@ public void printHeaderWithUserDict() throws IOException { } @Test + @Ignore public void printHeaderWithInvalidFile() throws IOException { File inputFile = new File(temporaryFolder.getRoot(), "unk.def"); String[] actuals; diff --git a/src/test/java/com/worksap/nlp/sudachi/dictionary/DictionaryPrinterTest.java b/src/test/java/com/worksap/nlp/sudachi/dictionary/LegacyDictionaryPrinterTest.java similarity index 97% rename from src/test/java/com/worksap/nlp/sudachi/dictionary/DictionaryPrinterTest.java rename to src/test/java/com/worksap/nlp/sudachi/dictionary/LegacyDictionaryPrinterTest.java index 9d8b1e79..0c2d618e 100644 --- a/src/test/java/com/worksap/nlp/sudachi/dictionary/DictionaryPrinterTest.java +++ b/src/test/java/com/worksap/nlp/sudachi/dictionary/LegacyDictionaryPrinterTest.java @@ -29,11 +29,12 @@ import com.worksap.nlp.sudachi.Utils; import org.junit.Before; +import org.junit.Ignore; import org.junit.Rule; import org.junit.Test; import org.junit.rules.TemporaryFolder; -public class DictionaryPrinterTest { +public class LegacyDictionaryPrinterTest { @Rule public TemporaryFolder temporaryFolder = new TemporaryFolder(); @@ -48,6 +49,7 @@ public void setUp() throws IOException { } @Test + @Ignore public void printWithSystemDict() throws IOException { File inputFile = new File(temporaryFolder.getRoot(), "system.dic"); String[] actuals; @@ -60,6 +62,7 @@ public void printWithSystemDict() throws IOException { } @Test + @Ignore public void printWithUserDict() throws IOException { File inputFile = new File(temporaryFolder.getRoot(), "user.dic"); File systemDictFile = new File(temporaryFolder.getRoot(), "system.dic"); @@ -76,6 +79,7 @@ public void printWithUserDict() throws IOException { } @Test(expected = 
IllegalArgumentException.class) + @Ignore public void printWithUserDictWithoutGrammar() throws IOException { File inputFile = new File(temporaryFolder.getRoot(), "user.dic"); try (ByteArrayOutputStream output = new ByteArrayOutputStream(); PrintStream ps = new PrintStream(output)) { @@ -84,6 +88,7 @@ public void printWithUserDictWithoutGrammar() throws IOException { } @Test(expected = IOException.class) + @Ignore public void readGrammarWithInvalidFile() throws IOException { File inputFile = new File(temporaryFolder.getRoot(), "unk.def"); BinaryDictionary.loadSystem(inputFile.getPath()); From a13fbabbf827ed5d19340080f6fd84589ae12e66 Mon Sep 17 00:00:00 2001 From: mh-northlander Date: Wed, 31 Jul 2024 10:15:27 +0900 Subject: [PATCH 50/94] rebuild printed dict test (and fix doc) --- .../com/worksap/nlp/sudachi/WordMask.java | 4 +- .../sudachi/dictionary/DictionaryPrinter.java | 11 +--- .../dictionary/build/RawWordEntry.java | 2 +- .../dictionary/DictionaryPrinterTest.kt | 54 +++++++++++++++++++ 4 files changed, 59 insertions(+), 12 deletions(-) diff --git a/src/main/java/com/worksap/nlp/sudachi/WordMask.java b/src/main/java/com/worksap/nlp/sudachi/WordMask.java index bc835afd..2828c86e 100644 --- a/src/main/java/com/worksap/nlp/sudachi/WordMask.java +++ b/src/main/java/com/worksap/nlp/sudachi/WordMask.java @@ -38,8 +38,8 @@ public static long addNth(long positions, int position) { } /** - * Create a word mask with nth position set. If position > 64, set the highest - * bit instead. + * Create a word mask with nth position set. If position larger than 65, set the + * highest bit instead. 
* * @param position * number of set position diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/DictionaryPrinter.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/DictionaryPrinter.java index e8c8f6b9..c9ca4047 100644 --- a/src/main/java/com/worksap/nlp/sudachi/dictionary/DictionaryPrinter.java +++ b/src/main/java/com/worksap/nlp/sudachi/dictionary/DictionaryPrinter.java @@ -34,9 +34,6 @@ public class DictionaryPrinter { public final String WordRefJoinerStr = String.valueOf(WordRefJoiner); private final PrintStream output; - private final boolean isUser; - private final BinaryDictionary dic; - private final BinaryDictionary base; private final GrammarImpl grammar; private final LexiconSet lex; @@ -45,15 +42,11 @@ public class DictionaryPrinter { private DictionaryPrinter(PrintStream output, BinaryDictionary dic, BinaryDictionary base) { this.output = output; - this.dic = dic; - this.base = base; if (base == null) { - isUser = false; grammar = dic.getGrammar(); lex = new LexiconSet(dic.getLexicon(), grammar.getSystemPartOfSpeechSize()); } else { - isUser = true; grammar = base.getGrammar(); lex = new LexiconSet(base.getLexicon(), grammar.getSystemPartOfSpeechSize()); @@ -128,8 +121,8 @@ void printEntry(int wordId) { field(wordRefList(info.getBunitSplit())); field(wordRefList(info.getCunitSplit())); field(wordRefList(info.getWordStructure())); - field(intList(info.getSynonymGroupIds())); // synonym groups - lastField(info.getUserData()); // user data + field(intList(info.getSynonymGroupIds())); + lastField(info.getUserData()); output.print("\n"); } diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/RawWordEntry.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/RawWordEntry.java index 0822d289..d2bb68c5 100644 --- a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/RawWordEntry.java +++ b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/RawWordEntry.java @@ -52,7 +52,7 @@ public class RawWordEntry implements 
Lookup2.Entry { * Compute expected size of word entry when put in the binary dictionary. This * function additionally validates length of split entries. * - * @return expected binary size of this entry, in bytes, will be always >=32 + * @return expected binary size of this entry, in bytes (minimum 32). */ public int computeExpectedSize() { int size = 32; diff --git a/src/test/java/com/worksap/nlp/sudachi/dictionary/DictionaryPrinterTest.kt b/src/test/java/com/worksap/nlp/sudachi/dictionary/DictionaryPrinterTest.kt index ac8b9c68..0f43d753 100644 --- a/src/test/java/com/worksap/nlp/sudachi/dictionary/DictionaryPrinterTest.kt +++ b/src/test/java/com/worksap/nlp/sudachi/dictionary/DictionaryPrinterTest.kt @@ -18,12 +18,18 @@ package com.worksap.nlp.sudachi.dictionary import com.worksap.nlp.sudachi.TestDictionary import com.worksap.nlp.sudachi.Utils +import com.worksap.nlp.sudachi.dictionary.build.DicBuilder +import com.worksap.nlp.sudachi.dictionary.build.MemChannel +import com.worksap.nlp.sudachi.res import java.io.ByteArrayOutputStream +import java.io.FileOutputStream import java.io.PrintStream +import java.nio.file.Files import java.nio.file.Path import kotlin.io.path.createTempDirectory import kotlin.test.BeforeTest import kotlin.test.Test +import kotlin.test.assertContentEquals import kotlin.test.assertEquals import kotlin.test.assertFails @@ -89,4 +95,52 @@ class DictionaryPrinterTest { assertFails { DictionaryPrinter.printDictionary(filename, TestDictionary.systemDict, ps) } } + + @Test + fun rebuildAndReprintSystem() { + val dicfile = tempDir.resolve("system.dic").toString() + + val lexfile = tempDir.resolve("system_lex.csv") + val output1 = FileOutputStream(lexfile.toFile()) + val ps1 = PrintStream(output1) + DictionaryPrinter.printDictionary(dicfile, null, ps1) + output1.close() + val printed = Files.readString(lexfile).split(System.lineSeparator()) + + val dicfile2 = tempDir.resolve("system.dic2") + val reload = MemChannel() + 
DicBuilder.system().matrix(res("/dict/matrix.def")).lexicon(lexfile).build(reload) + reload.writeData(dicfile2) + + val output2 = ByteArrayOutputStream() + val ps2 = PrintStream(output2) + DictionaryPrinter.printDictionary(dicfile2.toString(), null, ps2) + val reprinted = output2.toString().split(System.lineSeparator()) + + assertContentEquals(printed, reprinted) + } + + @Test + fun rebuildAndReprintUser() { + val dicfile = tempDir.resolve("user.dic").toString() + + val lexfile = tempDir.resolve("user_lex.csv") + val output1 = FileOutputStream(lexfile.toFile()) + val ps1 = PrintStream(output1) + DictionaryPrinter.printDictionary(dicfile, TestDictionary.systemDict, ps1) + output1.close() + val printed = Files.readString(lexfile).split(System.lineSeparator()) + + val dicfile2 = tempDir.resolve("user.dic2") + val reload = MemChannel() + DicBuilder.user().system(TestDictionary.systemDict).lexicon(lexfile).build(reload) + reload.writeData(dicfile2) + + val output2 = ByteArrayOutputStream() + val ps2 = PrintStream(output2) + DictionaryPrinter.printDictionary(dicfile2.toString(), TestDictionary.systemDict, ps2) + val reprinted = output2.toString().split(System.lineSeparator()) + + assertContentEquals(printed, reprinted) + } } From 01d7c2b64aa188bdf9070ce4b8b1fed09bb2996d Mon Sep 17 00:00:00 2001 From: mh-northlander Date: Thu, 1 Aug 2024 13:17:17 +0900 Subject: [PATCH 51/94] show print progress and fix normalizedForm print/parse --- .../sudachi/dictionary/DictionaryPrinter.java | 21 +++++++++++++++++-- .../sudachi/dictionary/build/DicBuilder.java | 5 +++-- .../dictionary/build/RawLexiconReader.java | 13 +++++++++--- .../nlp/sudachi/dictionary/build/WordRef.java | 5 ++++- 4 files changed, 36 insertions(+), 8 deletions(-) diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/DictionaryPrinter.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/DictionaryPrinter.java index c9ca4047..334eeccb 100644 --- 
a/src/main/java/com/worksap/nlp/sudachi/dictionary/DictionaryPrinter.java +++ b/src/main/java/com/worksap/nlp/sudachi/dictionary/DictionaryPrinter.java @@ -17,6 +17,8 @@ package com.worksap.nlp.sudachi.dictionary; import com.worksap.nlp.sudachi.WordId; +import com.worksap.nlp.sudachi.dictionary.build.Progress; +import com.worksap.nlp.sudachi.dictionary.DictionaryBuilder.StderrProgress; import com.worksap.nlp.sudachi.dictionary.build.RawLexiconReader.Column; import java.io.IOException; @@ -34,6 +36,7 @@ public class DictionaryPrinter { public final String WordRefJoinerStr = String.valueOf(WordRefJoiner); private final PrintStream output; + private final Progress progress = new Progress(20, new StderrProgress()); private final GrammarImpl grammar; private final LexiconSet lex; @@ -90,10 +93,13 @@ void printColumnHeaders(Column... headers) { } private void printEntries() { - // id of the target dic in LexiconSet + progress.startBlock("Entries", System.nanoTime(), Progress.Kind.ENTRY); + long size = wordIds.length(); for (int i = 0; i < wordIds.length(); ++i) { printEntry(wordIds.get(i)); + progress.progress(i, size); } + progress.endBlock(size, System.nanoTime()); } void printEntry(int wordId) { @@ -115,7 +121,7 @@ void printEntry(int wordId) { field(pos.get(4)); field(pos.get(5)); field(lex.string(dic, info.getReadingForm())); - field(wordRef(info.getNormalizedForm(), wordId)); + field(wordRefHeadword(info.getNormalizedForm(), wordId)); field(wordRef(info.getDictionaryForm(), wordId)); field(wordRefList(info.getAunitSplit())); field(wordRefList(info.getBunitSplit())); @@ -170,6 +176,17 @@ String wordRef(int wordId) { return wordRefTriple; } + /** encode word entry pointed by the wordId as WordRef.Headword. 
*/ + String wordRefHeadword(int wordId, int reference) { + if (wordId == reference) { + return ""; + } + int dic = WordId.dic(wordId); + WordInfo info = lex.getWordInfo(wordId); + String surface = lex.string(dic, info.getSurface()); + return surface; + } + String wordRefList(int[] wordIds) { return String.join(WordRefDelimiterStr, Arrays.stream(wordIds).boxed().map(wi -> wordRef(wi)).collect(Collectors.toList())); diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/DicBuilder.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/DicBuilder.java index 1520ad7f..9d4869ae 100644 --- a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/DicBuilder.java +++ b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/DicBuilder.java @@ -71,14 +71,15 @@ private T self() { * when IO fails */ public T lexicon(String name, IOSupplier input, long size) throws IOException { - progress.startBlock(name, nanoTime(), Progress.Kind.BYTE); + int numEntryBefore = lexicon.getTotalEntries(); + progress.startBlock(name, nanoTime(), Progress.Kind.ENTRY); short numLeft = connection.nonEmpty() ? connection.getNumLeft() : Short.MAX_VALUE; short numRight = connection.nonEmpty() ? 
connection.getNumRight() : Short.MAX_VALUE; try (InputStream is = input.get()) { InputStream stream = new ProgressInputStream(is, size, progress); lexicon.read(name, stream, pos, numLeft, numRight); } - progress.endBlock(size, nanoTime()); + progress.endBlock(lexicon.getTotalEntries() - numEntryBefore, nanoTime()); return self(); } diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/RawLexiconReader.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/RawLexiconReader.java index 1c33e23f..3de326e0 100644 --- a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/RawLexiconReader.java +++ b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/RawLexiconReader.java @@ -63,10 +63,10 @@ public RawLexiconReader(CSVParser parser, POSTable pos, boolean user) throws IOE resolveColumnLayout(); if (isLegacyColumnLayout()) { normRefParser = WordRef.parser(pos, false, true, false); - dictRefParser = WordRef.parser(pos, true, true, true); + dictRefParser = WordRef.parser(pos, true, false, true); splitParser = WordRef.parser(pos, true, false, false); } else { - normRefParser = WordRef.parser(pos, false, false, false); + normRefParser = WordRef.parser(pos, false, true, false); dictRefParser = WordRef.parser(pos, !user, false, false); splitParser = WordRef.parser(pos, !user, false, false); } @@ -193,7 +193,14 @@ private RawWordEntry convertEntry(List data) { entry.cost = getShort(data, Column.Cost); entry.reading = get(data, Column.ReadingForm, true); - entry.normalizedForm = normRefParser.parse(get(data, Column.NormalizedForm, false)); + WordRef normalizedForm = normRefParser.parse(get(data, Column.NormalizedForm, false)); + if (normalizedForm instanceof WordRef.Headword + && ((WordRef.Headword) normalizedForm).getHeadword().equals(entry.headword)) { + // mark as self-reference (headword ref may point different entry) + entry.normalizedForm = null; + } else { + entry.normalizedForm = normalizedForm; + } entry.dictionaryForm = 
dictRefParser.parse(get(data, Column.DictionaryForm, false)); POS pos = new POS( diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/WordRef.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/WordRef.java index ba093b00..9d83dfac 100644 --- a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/WordRef.java +++ b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/WordRef.java @@ -148,6 +148,9 @@ public String getReading() { @Override public int resolve(Lookup2 resolver) { List entries = resolver.byHeadword(headword); + if (entries == null) { + throw new IllegalArgumentException("matching entry not found for the " + this.toString()); + } for (Lookup2.EntryWithFlag entry : entries) { if (entry.matches(posId, reading)) { return intoWordRef(entry); @@ -221,7 +224,7 @@ public WordRef parse(String text) { } if (allowHeadword) { - return new Headword(text); + return new Headword(Unescape.unescape(text)); } else { throw new CsvFieldException( String.format("invalid word reference: %s, it must contain POS tag and reading", text)); From 875e0c9fa1bcae1b9f8239c92319149067b6fea7 Mon Sep 17 00:00:00 2001 From: mh-northlander Date: Thu, 1 Aug 2024 16:04:55 +0900 Subject: [PATCH 52/94] remove legacy dictionary codes --- .../sudachi/dictionary/DictionaryHeader.java | 124 ---------- .../dictionary/DictionaryHeaderPrinter.java | 26 --- .../sudachi/dictionary/WordParameterList.java | 82 ------- .../sudachi/dictionary/build/BlockLayout.java | 18 +- .../sudachi/dictionary/build/BlockOutput.java | 7 +- .../sudachi/dictionary/build/BuildStats.java | 37 --- .../dictionary/build/ConnectionMatrix.java | 7 +- .../sudachi/dictionary/build/CsvLexicon.java | 215 ------------------ .../sudachi/dictionary/build/DicBuffer.java | 125 ---------- .../sudachi/dictionary/build/DicBuilder.java | 1 + .../sudachi/dictionary/build/IOConsumer.java | 29 --- .../nlp/sudachi/dictionary/build/Index.java | 50 +--- .../sudachi/dictionary/build/ModelOutput.java | 149 ------------ 
.../sudachi/dictionary/build/POSTable.java | 20 +- .../sudachi/dictionary/build/Parameters.java | 84 ------- .../dictionary/build/RawLexiconReader.java | 4 + .../nlp/sudachi/dictionary/build/Stats.java | 32 --- .../dictionary/build/StringStorage.java | 60 ----- .../dictionary/build/TrackingInputStream.java | 62 ----- .../dictionary/build/WordIdResolver.java | 31 --- .../sudachi/dictionary/build/WordLookup.java | 27 --- .../dictionary/build/WriteDictionary.java | 23 -- .../dictionary/DictionaryHeaderTest.java | 54 ----- .../LegacyDictionaryHeaderPrinterTest.java | 94 -------- .../LegacyDictionaryPrinterTest.java | 96 -------- .../dictionary/build/CsvLexiconTest.kt | 122 ---------- .../sudachi/dictionary/build/DicBufferTest.kt | 97 -------- .../sudachi/dictionary/build/GrammarTest.kt | 39 +++- .../dictionary/build/InMemoryChannel.java | 0 .../dictionary/build/ParametersTest.kt | 41 ---- .../dictionary/build/RawLexiconReaderTest.kt | 137 ++++++++++- .../sudachi/dictionary/build/UnescapeTest.kt | 42 ++++ 32 files changed, 233 insertions(+), 1702 deletions(-) delete mode 100644 src/main/java/com/worksap/nlp/sudachi/dictionary/DictionaryHeader.java delete mode 100644 src/main/java/com/worksap/nlp/sudachi/dictionary/WordParameterList.java delete mode 100644 src/main/java/com/worksap/nlp/sudachi/dictionary/build/BuildStats.java delete mode 100644 src/main/java/com/worksap/nlp/sudachi/dictionary/build/CsvLexicon.java delete mode 100644 src/main/java/com/worksap/nlp/sudachi/dictionary/build/DicBuffer.java delete mode 100644 src/main/java/com/worksap/nlp/sudachi/dictionary/build/IOConsumer.java delete mode 100644 src/main/java/com/worksap/nlp/sudachi/dictionary/build/ModelOutput.java delete mode 100644 src/main/java/com/worksap/nlp/sudachi/dictionary/build/Parameters.java delete mode 100644 src/main/java/com/worksap/nlp/sudachi/dictionary/build/Stats.java delete mode 100644 src/main/java/com/worksap/nlp/sudachi/dictionary/build/TrackingInputStream.java delete mode 100644 
src/main/java/com/worksap/nlp/sudachi/dictionary/build/WordIdResolver.java delete mode 100644 src/main/java/com/worksap/nlp/sudachi/dictionary/build/WordLookup.java delete mode 100644 src/main/java/com/worksap/nlp/sudachi/dictionary/build/WriteDictionary.java delete mode 100644 src/test/java/com/worksap/nlp/sudachi/dictionary/DictionaryHeaderTest.java delete mode 100644 src/test/java/com/worksap/nlp/sudachi/dictionary/LegacyDictionaryHeaderPrinterTest.java delete mode 100644 src/test/java/com/worksap/nlp/sudachi/dictionary/LegacyDictionaryPrinterTest.java delete mode 100644 src/test/java/com/worksap/nlp/sudachi/dictionary/build/CsvLexiconTest.kt delete mode 100644 src/test/java/com/worksap/nlp/sudachi/dictionary/build/DicBufferTest.kt rename src/{main => test}/java/com/worksap/nlp/sudachi/dictionary/build/InMemoryChannel.java (100%) delete mode 100644 src/test/java/com/worksap/nlp/sudachi/dictionary/build/ParametersTest.kt create mode 100644 src/test/java/com/worksap/nlp/sudachi/dictionary/build/UnescapeTest.kt diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/DictionaryHeader.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/DictionaryHeader.java deleted file mode 100644 index 6277e151..00000000 --- a/src/main/java/com/worksap/nlp/sudachi/dictionary/DictionaryHeader.java +++ /dev/null @@ -1,124 +0,0 @@ -/* - * Copyright (c) 2021 Works Applications Co., Ltd. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package com.worksap.nlp.sudachi.dictionary; - -import java.nio.ByteBuffer; -import java.nio.ByteOrder; -import java.nio.charset.StandardCharsets; - -/** - * A header of a dictionary file. - */ -public class DictionaryHeader { - - private final long version; - private final long createTime; - private final String description; - - private static final int DESCRIPTION_SIZE = 256; - private static final int STORAGE_SIZE = 8 + 8 + DESCRIPTION_SIZE; - - public DictionaryHeader(long version, long createTime, String description) { - this.version = version; - this.createTime = createTime; - this.description = description; - } - - public DictionaryHeader(ByteBuffer input, int offset) { - version = input.getLong(offset); - offset += 8; - createTime = input.getLong(offset); - offset += 8; - byte[] byteDescription = new byte[DESCRIPTION_SIZE]; - int length; - for (length = 0; length < DESCRIPTION_SIZE; length++) { - byteDescription[length] = input.get(offset++); - if (byteDescription[length] == 0) { - break; - } - } - description = new String(byteDescription, 0, length, StandardCharsets.UTF_8); - } - - public int storageSize() { - return STORAGE_SIZE; - } - - public byte[] toByte() { - byte[] output = new byte[STORAGE_SIZE]; - ByteBuffer buffer = ByteBuffer.wrap(output); - buffer.order(ByteOrder.LITTLE_ENDIAN); - buffer.putLong(version); - buffer.putLong(createTime); - byte[] byteDescription = description.getBytes(StandardCharsets.UTF_8); - if (byteDescription.length > DESCRIPTION_SIZE) { - throw new IllegalArgumentException("description is too long"); - } - buffer.put(byteDescription); - - return output; - } - - /** - * Returns the version of the dictionary. - * - * The version is {@code DictionaryVersion#SYSTEM_DICT_VERSION_*} or - * {@code DictionaryVersion#USER_DICT_VERSION_*}. If the file is not a - * dictionary, returns an other value. 
- * - * @return the version of the dictionary - */ - public long getVersion() { - return version; - } - - /** - * Returns the epoch seconds at which the dictionary is created. - * - * @return the epoch seconds - */ - public long getCreateTime() { - return createTime; - } - - /** - * Returns the description of the dictionary which is specified at creating. - * - * @return the description of the dictionary - */ - public String getDescription() { - return description; - } - - /** - * Resturns {@code true} if, and only if, the file is a system dictionary. - * - * @return true if the file is a system dictionary, otherwise false - */ - public boolean isSystemDictionary() { - return DictionaryVersion.isSystemDictionary(version); - } - - /** - * Resturns {@code true} if, and only if, the file is a user dictionary. - * - * @return true if the file is a user dictionary, otherwise false - */ - public boolean isUserDictionary() { - return DictionaryVersion.isUserDictionary(version); - } -} diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/DictionaryHeaderPrinter.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/DictionaryHeaderPrinter.java index d5c30df7..fc531bd4 100644 --- a/src/main/java/com/worksap/nlp/sudachi/dictionary/DictionaryHeaderPrinter.java +++ b/src/main/java/com/worksap/nlp/sudachi/dictionary/DictionaryHeaderPrinter.java @@ -22,8 +22,6 @@ import java.nio.ByteBuffer; import java.nio.ByteOrder; import java.nio.channels.FileChannel; -import java.time.Instant; -import java.time.ZoneId; /** * A dictionary header printing tool. 
@@ -66,30 +64,6 @@ static void printDescription(String filename, PrintStream output) throws IOExcep output.printf("Flag isRuntimeCosts: %s%n", desc.isRuntimeCosts()); } - static void printHeader(String filename, PrintStream output) throws IOException { - ByteBuffer bytes; - try (FileInputStream input = new FileInputStream(filename); FileChannel inputFile = input.getChannel()) { - bytes = inputFile.map(FileChannel.MapMode.READ_ONLY, 0, inputFile.size()); - bytes.order(ByteOrder.LITTLE_ENDIAN); - } - DictionaryHeader header = new DictionaryHeader(bytes, 0); - - output.println("filename: " + filename); - - if (header.isSystemDictionary()) { - output.println("type: system dictionary"); - } else if (header.isUserDictionary()) { - output.println("type: user dictionary"); - } else { - output.println("invalid file"); - return; - } - - output.println("createTime: " - + Instant.ofEpochSecond(header.getCreateTime()).atZone(ZoneId.systemDefault()).toString()); - output.println("description: " + header.getDescription()); - } - /** * Prints the contents of dictionary header. * diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/WordParameterList.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/WordParameterList.java deleted file mode 100644 index 7039cef9..00000000 --- a/src/main/java/com/worksap/nlp/sudachi/dictionary/WordParameterList.java +++ /dev/null @@ -1,82 +0,0 @@ -/* - * Copyright (c) 2021 Works Applications Co., Ltd. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package com.worksap.nlp.sudachi.dictionary; - -import java.nio.Buffer; -import java.nio.ByteBuffer; -import java.nio.ByteOrder; - -class WordParameterList { - - private static final int ELEMENT_SIZE = 2 * 3; - - private ByteBuffer bytes; - private final int size; - private int offset; - private boolean isCopied; - - WordParameterList(ByteBuffer bytes, int offset) { - this.bytes = bytes; - size = bytes.getInt(offset); - this.offset = offset + 4; - isCopied = false; - } - - int storageSize() { - return 4 + ELEMENT_SIZE * size; - } - - int size() { - return size; - } - - short getLeftId(int wordId) { - return bytes.getShort(offset + ELEMENT_SIZE * wordId); - } - - short getRightId(int wordId) { - return bytes.getShort(offset + ELEMENT_SIZE * wordId + 2); - } - - short getCost(int wordId) { - return bytes.getShort(offset + ELEMENT_SIZE * wordId + 4); - } - - void setCost(int wordId, short cost) { - if (!isCopied) { - copyBuffer(); - } - bytes.putShort(offset + ELEMENT_SIZE * wordId + 4, cost); - } - - int endOffset() { - return offset + 4 + ELEMENT_SIZE * size; - } - - synchronized void copyBuffer() { - ByteBuffer newBuffer = ByteBuffer.allocate(ELEMENT_SIZE * size); - newBuffer.order(ByteOrder.LITTLE_ENDIAN); - ByteBuffer srcBuffer = bytes.duplicate(); - Buffer buffer = srcBuffer; // a kludge for Java 9 - buffer.position(offset); - buffer.limit(offset + ELEMENT_SIZE * size); - newBuffer.put(srcBuffer); - bytes = newBuffer; - offset = 0; - isCopied = true; - } -} diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/BlockLayout.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/BlockLayout.java index a91893bf..9d228e24 100644 --- a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/BlockLayout.java +++ b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/BlockLayout.java @@ -37,7 +37,11 @@ public class BlockLayout { public 
BlockLayout(SeekableByteChannel channel, Progress progress) throws IOException { this.channel = channel; this.progress = progress; - channel.position(BLOCK_SIZE); // keep first block for the description + } + + public BlockLayout(SeekableByteChannel channel) throws IOException { + this.channel = channel; + this.progress = Progress.NOOP; } /** @@ -53,6 +57,18 @@ private long alignPosition() throws IOException { return newPosition; } + /** + * Keep space for the specified number of blocks for the later use. + * + * @return start position of keeped blocks. + */ + public long keepBlocks(int numBlocks) throws IOException { + long blockSize = numBlocks * BLOCK_SIZE; + long startPosition = Align.align(channel.position(), BLOCK_SIZE); + channel.position(startPosition + blockSize); + return startPosition; + } + /** Function that works with BlockOutput */ public interface BlockHandler { T apply(BlockOutput output) throws IOException; diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/BlockOutput.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/BlockOutput.java index 1c45b243..c5e66297 100644 --- a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/BlockOutput.java +++ b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/BlockOutput.java @@ -26,13 +26,16 @@ public class BlockOutput { private SeekableByteChannel chan; private Progress progress; - private Stats stats; - public BlockOutput(SeekableByteChannel chan, Progress progress) { this.chan = chan; this.progress = progress; } + public BlockOutput(SeekableByteChannel chan) { + this.chan = chan; + this.progress = Progress.NOOP; + } + public SeekableByteChannel getChannel() { return chan; } diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/BuildStats.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/BuildStats.java deleted file mode 100644 index f1007527..00000000 --- a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/BuildStats.java +++ /dev/null @@ 
-1,37 +0,0 @@ -/* - * Copyright (c) 2021 Works Applications Co., Ltd. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package com.worksap.nlp.sudachi.dictionary.build; - -import java.util.List; - -public class BuildStats { - private final List inputs; - private final List parts; - - public BuildStats(List inputs, List parts) { - this.inputs = inputs; - this.parts = parts; - } - - public List getInputs() { - return inputs; - } - - public List getParts() { - return parts; - } -} diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/ConnectionMatrix.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/ConnectionMatrix.java index da9d6122..ab557865 100644 --- a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/ConnectionMatrix.java +++ b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/ConnectionMatrix.java @@ -28,7 +28,7 @@ /** * Dictionary parts: left/right id connection cost matrix. 
*/ -public class ConnectionMatrix implements WriteDictionary { +public class ConnectionMatrix { private short numLeft; private short numRight; private ByteBuffer compiled; @@ -139,11 +139,6 @@ public void makeEmpty() { compiled = data; } - @Override - public void writeTo(ModelOutput output) throws IOException { - output.write(compiled); - } - /** @return number of left id */ public short getNumLeft() { return numLeft; diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/CsvLexicon.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/CsvLexicon.java deleted file mode 100644 index daa4597c..00000000 --- a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/CsvLexicon.java +++ /dev/null @@ -1,215 +0,0 @@ -/* - * Copyright (c) 2021 Works Applications Co., Ltd. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package com.worksap.nlp.sudachi.dictionary.build; - -import com.worksap.nlp.sudachi.StringUtil; -import com.worksap.nlp.sudachi.WordId; -import com.worksap.nlp.sudachi.dictionary.Ints; -import com.worksap.nlp.sudachi.dictionary.POS; -import com.worksap.nlp.sudachi.dictionary.WordInfo; -import com.worksap.nlp.sudachi.dictionary.build.RawLexiconReader.Column; - -import java.io.IOException; -import java.nio.ByteBuffer; -import java.nio.ByteOrder; -import java.nio.charset.StandardCharsets; -import java.util.ArrayList; -import java.util.Arrays; -import java.util.List; -import java.util.regex.Pattern; - -public class CsvLexicon implements WriteDictionary { - static final int ARRAY_MAX_LENGTH = Byte.MAX_VALUE; - static final int MIN_REQUIRED_NUMBER_OF_COLUMNS = 18; - static final Pattern unicodeLiteral = Pattern.compile("\\\\u([0-9a-fA-F]{4}|\\{[0-9a-fA-F]+})"); - private static final Pattern PATTERN_ID = Pattern.compile("U?\\d+"); - private final Parameters parameters = new Parameters(); - private final POSTable posTable; - private final List entries = new ArrayList<>(); - private WordIdResolver widResolver = null; - - // temporal fix - private WordRef.Parser parser; - - public CsvLexicon(POSTable pos) { - posTable = pos; - parser = WordRef.parser(pos, false, true, false); - } - - public void setResolver(WordIdResolver widResolver) { - this.widResolver = widResolver; - } - - public List getEntries() { - return entries; - } - - RawWordEntry parseLine(List cols) { - if (cols.size() < MIN_REQUIRED_NUMBER_OF_COLUMNS) { - throw new IllegalArgumentException("invalid format"); - } - for (int i = 0; i < 15; i++) { - cols.set(i, Unescape.unescape(cols.get(i))); - } - - if (cols.get(0).getBytes(StandardCharsets.UTF_8).length > DicBuffer.MAX_STRING - || !DicBuffer.isValidLength(cols.get(4)) || !DicBuffer.isValidLength(cols.get(11)) - || !DicBuffer.isValidLength(cols.get(12))) { - throw new IllegalArgumentException("string is too long"); - } - - if (cols.get(0).isEmpty()) { 
- throw new IllegalArgumentException("headword is empty"); - } - - RawWordEntry entry = new RawWordEntry(); - - // headword for trie - if (!cols.get(1).equals("-1")) { - entry.headword = cols.get(0); - } - - // left-id, right-id, cost - short leftId = Short.parseShort(cols.get(1)); - short rightId = Short.parseShort(cols.get(2)); - short cost = Short.parseShort(cols.get(3)); - parameters.add(leftId, rightId, cost); - entry.leftId = leftId; - entry.rightId = rightId; - entry.cost = cost; - - // part of speech - POS pos = new POS(cols.get(5), cols.get(6), cols.get(7), cols.get(8), cols.get(9), cols.get(10)); - short posId = posTable.getId(pos); - - entry.aUnitSplit = parseWordRefs(cols.get(15)); - entry.bUnitSplit = parseWordRefs(cols.get(16)); - entry.wordStructure = parseWordRefs(cols.get(17)); - checkSplitInfoFormat(entry.aUnitSplit); - checkSplitInfoFormat(entry.bUnitSplit); - checkSplitInfoFormat(entry.wordStructure); - if (cols.get(14).equals("A") && (!entry.aUnitSplit.isEmpty() || !entry.bUnitSplit.isEmpty())) { - throw new IllegalArgumentException("invalid splitting"); - } - return entry; - } - - private List parseWordRefs(String value) { - if (value == null || value.isEmpty() || "*".equals(value)) { - return new ArrayList<>(); - } - String[] parts = value.split("/"); - if (parts.length > Byte.MAX_VALUE) { - throw new IllegalArgumentException("reference list contained more than 127 entries: " + value); - } - List result = new ArrayList<>(parts.length); - for (String part : parts) { - result.add(parser.parse(part)); - } - return result; - } - - int[] parseSynonymGids(String str) { - if (str.equals("*")) { - return new int[0]; - } - String[] ids = str.split("/"); - if (ids.length > ARRAY_MAX_LENGTH) { - throw new IllegalArgumentException("too many units"); - } - int[] ret = new int[ids.length]; - for (int i = 0; i < ids.length; i++) { - ret[i] = Integer.parseInt(ids[i]); - } - return ret; - } - - int wordToId(String text) { - String[] cols = text.split(",", 8); 
- if (cols.length < 8) { - throw new IllegalArgumentException("too few columns"); - } - String headword = Unescape.unescape(cols[0]); - POS pos = new POS(Arrays.copyOfRange(cols, 1, 7)); - short posId = posTable.getId(pos); - String reading = Unescape.unescape(cols[7]); - return widResolver.lookup(headword, posId, reading); - } - - void checkSplitInfoFormat(List info) { - if (info.size() > ARRAY_MAX_LENGTH) { - throw new IllegalArgumentException("too many units"); - } - } - - boolean isId(String text) { - return PATTERN_ID.matcher(text).matches(); - } - - int[] parseSplitInfo(String info) { - if (info.equals("*")) { - return new int[0]; - } - String[] words = info.split("/"); - if (words.length > ARRAY_MAX_LENGTH) { - throw new IllegalArgumentException("too many units"); - } - int[] ret = new int[words.length]; - for (int i = 0; i < words.length; i++) { - String ref = words[i]; - if (isId(ref)) { - ret[i] = parseId(ref); - } else { - ret[i] = wordToId(ref); - if (ret[i] < 0) { - throw new IllegalArgumentException("couldn't find " + ref + " in the dictionaries"); - } - } - } - return ret; - } - - int parseId(String text) { - int id = 0; - if (text.startsWith("U")) { - id = Integer.parseInt(text.substring(1)); - if (widResolver.isUser()) { - id = WordId.make(1, id); - } - } else { - id = Integer.parseInt(text); - } - widResolver.validate(id); - return id; - } - - @Override - public void writeTo(ModelOutput output) throws IOException { - - } - - public int addEntry(RawWordEntry e) { - int id = entries.size(); - entries.add(e); - return id; - } - - public void setLimits(int left, int right) { - parameters.setLimits(left, right); - } - -} diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/DicBuffer.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/DicBuffer.java deleted file mode 100644 index fe5c3c35..00000000 --- a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/DicBuffer.java +++ /dev/null @@ -1,125 +0,0 @@ -/* - * Copyright (c) 
2021 Works Applications Co., Ltd. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package com.worksap.nlp.sudachi.dictionary.build; - -import java.io.IOException; -import java.nio.ByteBuffer; -import java.nio.ByteOrder; - -/** - * Buffers dictionary data for writing into channels. Wrapper over a ByteBuffer. - * - * @see java.nio.ByteBuffer - */ -public class DicBuffer { - public static final int MAX_STRING = Short.MAX_VALUE; - private final ByteBuffer buffer; - - public DicBuffer(int length, int number) { - this(length * number * 2 + number * 2); - } - - public DicBuffer(int size) { - buffer = ByteBuffer.allocate(size); - buffer.order(ByteOrder.LITTLE_ENDIAN); - } - - public static boolean isValidLength(String text) { - return text.length() <= MAX_STRING; - } - - /** - * Tries to put the string s into the buffer - * - * @param s - * the string to put into buffer - * @return true if successful, false when the buffer does not have enough size. - * The buffer is not modified in that case. - */ - public boolean put(String s) { - int length = s.length(); - if (!putLength(length)) { - return false; - } - s.chars().forEach(c -> buffer.putChar((char) c)); - return true; - } - - /** - * Tries to put the length of a string into the buffer - * - * @param length - * the length of the string to put into buffer - * @return true if successful, false when the buffer does not have enough size. - * The buffer is not modified in that case. 
- */ - public boolean putLength(int length) { - if (length >= MAX_STRING) { - throw new IllegalArgumentException("can't handle string with length >= " + MAX_STRING); - } - int addLen = (length > Byte.MAX_VALUE) ? 2 : 1; - if (wontFit(length * 2 + addLen)) { - return false; - } - if (length <= Byte.MAX_VALUE) { - buffer.put((byte) length); - } else { - buffer.put((byte) ((length >> 8) | 0x80)); - buffer.put((byte) (length & 0xFF)); - } - return true; - } - - public T consume(IOConsumer consumer) throws IOException { - buffer.flip(); - T result = consumer.accept(buffer); - buffer.clear(); - return result; - } - - public void putShort(short val) { - buffer.putShort(val); - } - - public void putInt(int val) { - buffer.putInt(val); - } - - public boolean wontFit(int space) { - return buffer.remaining() < space; - } - - public int position() { - return buffer.position(); - } - - public void putEmptyIfEqual(String field, String surface) { - if (field.equals(surface)) { - put(""); - } else { - put(field); - } - } - - public void putInts(int[] data) { - buffer.put((byte) data.length); - for (int v : data) { - buffer.putInt(v); - } - } - -} diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/DicBuilder.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/DicBuilder.java index 9d4869ae..addf0c5c 100644 --- a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/DicBuilder.java +++ b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/DicBuilder.java @@ -164,6 +164,7 @@ public T compilationTime(Instant instant) { */ public void build(SeekableByteChannel channel) throws IOException { BlockLayout layout = new BlockLayout(channel, progress); + layout.keepBlocks(1); // keep space for the Description. 
if (connection.nonEmpty()) { layout.block(Blocks.CONNECTION_MATRIX, connection::compile); } diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/IOConsumer.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/IOConsumer.java deleted file mode 100644 index 6d9e062b..00000000 --- a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/IOConsumer.java +++ /dev/null @@ -1,29 +0,0 @@ -/* - * Copyright (c) 2022 Works Applications Co., Ltd. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package com.worksap.nlp.sudachi.dictionary.build; - -import java.io.IOException; -import java.nio.ByteBuffer; - -/** - * A version of {@link java.util.function.Consumer} which allows throwing - * IOException - */ -@FunctionalInterface -public interface IOConsumer { - T accept(ByteBuffer arg) throws IOException; -} diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/Index.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/Index.java index 2491ca91..ee0a0a9f 100644 --- a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/Index.java +++ b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/Index.java @@ -22,7 +22,6 @@ import java.io.IOException; import java.nio.ByteBuffer; -import java.nio.ByteOrder; import java.nio.charset.StandardCharsets; import java.util.*; @@ -36,7 +35,7 @@ * WordIdTable also contins word-ids that are not indexed in TRIE, so that we * can iterate over all word entries. 
*/ -public class Index implements WriteDictionary { +public class Index { private final SortedMap elements = new TreeMap<>((byte[] l, byte[] r) -> { int llen = l.length; int rlen = r.length; @@ -48,8 +47,6 @@ public class Index implements WriteDictionary { return l.length - r.length; }); - private int count = 0; - /** * Add a (headword, wordid) pair to the index * @@ -61,54 +58,9 @@ public int add(String key, int wordId) { byte[] bytes = key.getBytes(StandardCharsets.UTF_8); Ints entries = elements.computeIfAbsent(bytes, k -> new Ints(4)); entries.append(wordId); - count += 1; return bytes.length; } - public void writeTo(ModelOutput output) throws IOException { - DoubleArray trie = new DoubleArray(); - - int size = this.elements.size(); - - byte[][] keys = new byte[size][]; - int[] values = new int[size]; - ByteBuffer wordIdTable = ByteBuffer.allocate(count * (4 + 2)); - wordIdTable.order(ByteOrder.LITTLE_ENDIAN); - - output.withSizedPart("WordId table", () -> { - int i = 0; - int numEntries = this.elements.entrySet().size(); - for (Map.Entry entry : this.elements.entrySet()) { - keys[i] = entry.getKey(); - values[i] = wordIdTable.position(); - i++; - Ints wordIds = entry.getValue(); - int length = wordIds.length(); - wordIdTable.put((byte) length); - for (int word = 0; word < length; ++word) { - int wid = wordIds.get(word); - wordIdTable.putInt(wid); - } - output.progress(i, numEntries); - } - return wordIdTable.position() + 4; - }); - - DicBuffer buffer = new DicBuffer(4); - output.withPart("double array Trie", () -> { - trie.build(keys, values, output::progress); - buffer.putInt(trie.size()); - buffer.consume(output::write); - output.write(trie.byteArray()); - }); - - buffer.putInt(wordIdTable.position()); - buffer.consume(output::write); - - wordIdTable.flip(); - output.write(wordIdTable); - } - /** * Write word id table and trie to the provided block layout. 
* diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/ModelOutput.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/ModelOutput.java deleted file mode 100644 index 1d215acd..00000000 --- a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/ModelOutput.java +++ /dev/null @@ -1,149 +0,0 @@ -/* - * Copyright (c) 2021 Works Applications Co., Ltd. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package com.worksap.nlp.sudachi.dictionary.build; - -import java.io.IOException; -import java.nio.ByteBuffer; -import java.nio.channels.SeekableByteChannel; -import java.util.ArrayList; -import java.util.List; - -public class ModelOutput implements SeekableByteChannel { - @FunctionalInterface - interface IORunnable { - void run() throws IOException; - } - - @FunctionalInterface - interface SizedRunnable { - long run() throws IOException; - } - - public static class Part { - private final String name; - private final long time; - private final long size; - - public Part(String name, long time, long size) { - this.name = name; - this.time = time; - this.size = size; - } - - public String getName() { - return name; - } - - public long getTime() { - return time; - } - - public long getSize() { - return size; - } - } - - private final SeekableByteChannel internal; - private final List parts = new ArrayList<>(); - private Progress progressor; - - public ModelOutput(SeekableByteChannel internal) { - this.internal = internal; - } - - public 
void progressor(Progress progress) { - this.progressor = progress; - } - - @Override - public int read(ByteBuffer byteBuffer) throws IOException { - return internal.read(byteBuffer); - } - - @Override - public int write(ByteBuffer byteBuffer) throws IOException { - return internal.write(byteBuffer); - } - - @Override - public long position() throws IOException { - return internal.position(); - } - - @Override - public SeekableByteChannel position(long l) throws IOException { - return internal.position(l); - } - - @Override - public long size() throws IOException { - return internal.size(); - } - - @Override - public SeekableByteChannel truncate(long l) throws IOException { - return internal.truncate(l); - } - - @Override - public boolean isOpen() { - return internal.isOpen(); - } - - @Override - public void close() throws IOException { - internal.close(); - } - - public void withPart(String name, IORunnable inner) throws IOException { - long pos = position(); - long start = System.nanoTime(); - if (progressor != null) { - progressor.startBlock(name, start, Progress.Kind.BYTE); - } - inner.run(); - long time = System.nanoTime() - start; - long size = position() - pos; - if (progressor != null) { - progressor.endBlock(size, time); - } - parts.add(new Part(name, time, size)); - } - - public void withSizedPart(String name, SizedRunnable inner) throws IOException { - long start = System.nanoTime(); - if (progressor != null) { - progressor.startBlock(name, start, Progress.Kind.BYTE); - } - long size = inner.run(); - long time = System.nanoTime() - start; - if (progressor != null) { - progressor.endBlock(size, time); - } - parts.add(new Part(name, time, size)); - } - - public List getParts() { - return parts; - } - - public void progress(long current, long max) { - if (progressor != null) { - progressor.progress(current, max); - } - } -} diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/POSTable.java 
b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/POSTable.java index e74a11c6..3323e485 100644 --- a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/POSTable.java +++ b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/POSTable.java @@ -27,7 +27,7 @@ /** * Dictionary parts: List of part-of-speeches. */ -public class POSTable implements WriteDictionary { +public class POSTable { private final List table = new ArrayList<>(); private final HashMap lookup = new HashMap<>(); private int builtin = 0; @@ -70,24 +70,6 @@ List getList() { return table; } - @Override - public void writeTo(ModelOutput output) throws IOException { - output.withPart("POS table", () -> { - DicBuffer buffer = new DicBuffer(128 * 1024); - buffer.putShort((short) ownedLength()); - for (int i = builtin; i < table.size(); ++i) { - for (String s : table.get(i)) { - if (!buffer.put(s)) { - // handle buffer overflow, this should be extremely rare - buffer.consume(output::write); - buffer.put(s); - } - } - } - buffer.consume(output::write); - }); - } - /** * @return number of non-builtin POSs. */ diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/Parameters.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/Parameters.java deleted file mode 100644 index 5769a301..00000000 --- a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/Parameters.java +++ /dev/null @@ -1,84 +0,0 @@ -/* - * Copyright (c) 2021 Works Applications Co., Ltd. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package com.worksap.nlp.sudachi.dictionary.build; - -import java.io.IOException; -import java.nio.ByteBuffer; -import java.nio.ByteOrder; -import java.nio.ShortBuffer; - -/** - * Compiles model parameters into the binary format - */ -public class Parameters implements WriteDictionary { - private ByteBuffer data; - private ShortBuffer params; - private int maxLeft = Integer.MAX_VALUE; - private int maxRight = Integer.MAX_VALUE; - - public Parameters(int initialSize) { - data = ByteBuffer.allocate(initialSize); - data.order(ByteOrder.LITTLE_ENDIAN); - params = data.asShortBuffer(); - } - - public Parameters() { - this(1024 * 1024); // default 1M - } - - public void add(short left, short right, short cost) { - maybeResize(); - if (left >= maxLeft) { - throw new IllegalArgumentException(String.format("left %d is larger than max value %d", left, maxLeft)); - } - if (right >= maxRight) { - throw new IllegalArgumentException(String.format("right %d is larger than max value %d", right, maxRight)); - } - params.put(left); - params.put(right); - params.put(cost); - } - - public void setLimits(int left, int right) { - this.maxLeft = left; - this.maxRight = right; - } - - private void maybeResize() { - if (params.remaining() < 3) { - ByteBuffer newData = ByteBuffer.allocate(data.capacity() * 2); - newData.order(ByteOrder.LITTLE_ENDIAN); - int position = params.position(); - data.position(0); - data.limit(position * 2); - newData.put(data); - newData.clear(); - data = newData; - params = newData.asShortBuffer(); - params.position(position); - assert params.remaining() > 3; - } - } - - @Override - public void writeTo(ModelOutput output) throws IOException { - output.withPart("word parameters", () -> { - data.limit(params.position() * 2); - output.write(data); - }); - } -} diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/RawLexiconReader.java 
b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/RawLexiconReader.java index 3de326e0..9b9e69e4 100644 --- a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/RawLexiconReader.java +++ b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/RawLexiconReader.java @@ -188,6 +188,10 @@ private List getWordRefs(List data, Column column, WordRef.Pars private RawWordEntry convertEntry(List data) { RawWordEntry entry = new RawWordEntry(); entry.headword = get(data, Column.Surface, true); + if (entry.headword.isEmpty()) { + throw new IllegalArgumentException("headword cannot be empty"); + } + entry.leftId = getShort(data, Column.LeftId); entry.rightId = getShort(data, Column.RightId); entry.cost = getShort(data, Column.Cost); diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/Stats.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/Stats.java deleted file mode 100644 index 3c918609..00000000 --- a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/Stats.java +++ /dev/null @@ -1,32 +0,0 @@ -/* - * Copyright (c) 2022 Works Applications Co., Ltd. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package com.worksap.nlp.sudachi.dictionary.build; - -import java.time.Duration; - -public class Stats { - public enum Kind { - Input, Output - } - - public static class Element { - public Kind kind; - public String name; - public long size; - public Duration duration; - } -} diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/StringStorage.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/StringStorage.java index ce7d97fa..4d55187f 100644 --- a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/StringStorage.java +++ b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/StringStorage.java @@ -16,17 +16,10 @@ package com.worksap.nlp.sudachi.dictionary.build; -import com.worksap.nlp.sudachi.dictionary.CSVParser; import com.worksap.nlp.sudachi.dictionary.StringPtr; -import java.io.BufferedReader; import java.io.IOException; -import java.nio.channels.SeekableByteChannel; import java.nio.channels.WritableByteChannel; -import java.nio.file.Files; -import java.nio.file.Path; -import java.nio.file.Paths; -import java.nio.file.StandardOpenOption; import java.util.*; /** @@ -149,25 +142,6 @@ public void writeCompact(WritableByteChannel channel) throws IOException { layout.write(channel); } - /** - * legacy string compilation. only for comparison purpose. - * - * @param channel - * @throws IOException - */ - public void writeLengthPrefixedCompact(SeekableByteChannel channel) throws IOException { - DicBuffer buf = new DicBuffer(64 * 1024); - for (Map.Entry item : strings.entrySet()) { - Item value = item.getValue(); - String sub = value.data.substring(value.start, value.end); - if (buf.wontFit(sub.length() * 2)) { - buf.consume(channel::write); - } - buf.put(sub); - } - buf.consume(channel::write); - } - /** * Data class of string and its pointer. */ @@ -205,38 +179,4 @@ public int getLength() { } } - /** - * Save strings in the lexicon csv (first arg) with legacy/compressed format - * with given name (second arg). 
- * - * Use this to compare output size of each format. - */ - public static void main(String[] args) throws IOException { - StringStorage strings = new StringStorage(); - try (BufferedReader reader = Files.newBufferedReader(Paths.get(args[0]))) { - CSVParser parser = new CSVParser(reader); - List record; - while ((record = parser.getNextRecord()) != null) { - strings.add(record.get(0)); - strings.add(record.get(4)); - strings.add(record.get(11)); - strings.add(record.get(12)); - } - parser.close(); - } - strings.compile(null); - - Path fullName = Paths.get(args[1] + ".lpf"); - try (SeekableByteChannel chan = Files.newByteChannel(fullName, StandardOpenOption.CREATE, - StandardOpenOption.WRITE, StandardOpenOption.TRUNCATE_EXISTING)) { - strings.writeLengthPrefixedCompact(chan); - } - - Path compactName = Paths.get(args[1] + ".cmp"); - try (SeekableByteChannel chan = Files.newByteChannel(compactName, StandardOpenOption.CREATE, - StandardOpenOption.WRITE, StandardOpenOption.TRUNCATE_EXISTING)) { - strings.writeCompact(chan); - } - System.out.printf("wasted bytes=%d, slots=%d%n", strings.layout.wastedBytes(), strings.layout.numSlots()); - } } diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/TrackingInputStream.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/TrackingInputStream.java deleted file mode 100644 index a83bbd25..00000000 --- a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/TrackingInputStream.java +++ /dev/null @@ -1,62 +0,0 @@ -/* - * Copyright (c) 2021 Works Applications Co., Ltd. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package com.worksap.nlp.sudachi.dictionary.build; - -import java.io.IOException; -import java.io.InputStream; - -public class TrackingInputStream extends InputStream { - private final InputStream inner; - private long position; - - public TrackingInputStream(InputStream inner) { - this.inner = inner; - } - - @Override - public int read() throws IOException { - return inner.read(); - } - - @Override - public int read(byte[] b) throws IOException { - int read = inner.read(b); - if (read != -1) { - position += read; - } - return read; - } - - @Override - public int read(byte[] b, int off, int len) throws IOException { - int read = inner.read(b, off, len); - if (read != -1) { - position += read; - } - return read; - } - - @Override - public long skip(long n) throws IOException { - position += n; - return super.skip(n); - } - - public long getPosition() { - return position; - } -} diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/WordIdResolver.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/WordIdResolver.java deleted file mode 100644 index 804f8e89..00000000 --- a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/WordIdResolver.java +++ /dev/null @@ -1,31 +0,0 @@ -/* - * Copyright (c) 2021 Works Applications Co., Ltd. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package com.worksap.nlp.sudachi.dictionary.build; - -import com.worksap.nlp.sudachi.dictionary.Ints; - -public interface WordIdResolver { - int lookup(String headword, short posId, String reading); - - void validate(int wordId); - - boolean isUser(); - - default byte parseList(String data, Ints result) { - return 0; - } -} diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/WordLookup.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/WordLookup.java deleted file mode 100644 index f2be0dce..00000000 --- a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/WordLookup.java +++ /dev/null @@ -1,27 +0,0 @@ -/* - * Copyright (c) 2021 Works Applications Co., Ltd. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package com.worksap.nlp.sudachi.dictionary.build; - -import com.worksap.nlp.sudachi.WordId; -import com.worksap.nlp.sudachi.dictionary.Lexicon; - -import java.util.List; - -public class WordLookup { - private WordLookup() { - } -} diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/WriteDictionary.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/WriteDictionary.java deleted file mode 100644 index addcd266..00000000 --- a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/WriteDictionary.java +++ /dev/null @@ -1,23 +0,0 @@ -/* - * Copyright (c) 2021 Works Applications Co., Ltd. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package com.worksap.nlp.sudachi.dictionary.build; - -import java.io.IOException; - -public interface WriteDictionary { - void writeTo(ModelOutput output) throws IOException; -} diff --git a/src/test/java/com/worksap/nlp/sudachi/dictionary/DictionaryHeaderTest.java b/src/test/java/com/worksap/nlp/sudachi/dictionary/DictionaryHeaderTest.java deleted file mode 100644 index 0af83c99..00000000 --- a/src/test/java/com/worksap/nlp/sudachi/dictionary/DictionaryHeaderTest.java +++ /dev/null @@ -1,54 +0,0 @@ -/* - * Copyright (c) 2017-2022 Works Applications Co., Ltd. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package com.worksap.nlp.sudachi.dictionary; - -import static org.junit.Assert.assertEquals; -import static org.junit.Assert.assertTrue; - -import java.io.IOException; - -import com.worksap.nlp.sudachi.TestDictionary; -import org.junit.Before; -import org.junit.Ignore; -import org.junit.Test; - -public class DictionaryHeaderTest { - DictionaryHeader header; - - @Before - public void setUp() throws IOException { - header = new DictionaryHeader(TestDictionary.INSTANCE.getSystemDictData().buffer(), 0); - } - - @Test - @Ignore - public void getVersion() { - assertEquals(DictionaryVersion.SYSTEM_DICT_VERSION_2, header.getVersion()); - } - - @Test - @Ignore - public void getCreateTime() { - assertTrue(header.getCreateTime() > 0); - } - - @Test - @Ignore - public void getDescription() { - assertEquals("the system dictionary for the unit tests", header.getDescription()); - } -} diff --git a/src/test/java/com/worksap/nlp/sudachi/dictionary/LegacyDictionaryHeaderPrinterTest.java b/src/test/java/com/worksap/nlp/sudachi/dictionary/LegacyDictionaryHeaderPrinterTest.java deleted file mode 100644 index b69f6bca..00000000 --- a/src/test/java/com/worksap/nlp/sudachi/dictionary/LegacyDictionaryHeaderPrinterTest.java +++ /dev/null @@ -1,94 +0,0 @@ -/* - * Copyright (c) 2017-2022 Works Applications Co., Ltd. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package com.worksap.nlp.sudachi.dictionary; - -import static org.hamcrest.core.StringStartsWith.startsWith; -import static org.hamcrest.CoreMatchers.is; -import static org.hamcrest.MatcherAssert.assertThat; - -import java.io.ByteArrayOutputStream; -import java.io.File; -import java.io.IOException; -import java.io.PrintStream; - -import com.worksap.nlp.sudachi.TestDictionary; -import com.worksap.nlp.sudachi.Utils; - -import org.junit.Before; -import org.junit.Ignore; -import org.junit.Rule; -import org.junit.Test; -import org.junit.rules.TemporaryFolder; - -public class LegacyDictionaryHeaderPrinterTest { - - @Rule - public TemporaryFolder temporaryFolder = new TemporaryFolder(); - - @Before - public void setUp() throws IOException { - TestDictionary.INSTANCE.getSystemDictData().writeData(temporaryFolder.getRoot().toPath().resolve("system.dic")); - TestDictionary.INSTANCE.getUserDict1Data().writeData(temporaryFolder.getRoot().toPath().resolve("user.dic")); - Utils.copyResource(temporaryFolder.getRoot().toPath(), "/unk.def"); - } - - @Test - @Ignore - public void printHeaderWithSystemDict() throws IOException { - File inputFile = new File(temporaryFolder.getRoot(), "system.dic"); - String[] actuals; - try (ByteArrayOutputStream output = new ByteArrayOutputStream(); PrintStream ps = new PrintStream(output)) { - DictionaryHeaderPrinter.printHeader(inputFile.getPath(), ps); - actuals = output.toString().split(System.lineSeparator()); - } - assertThat(actuals.length, is(4)); - assertThat(actuals[0], is("filename: " + inputFile.getPath())); - 
assertThat(actuals[1], is("type: system dictionary")); - assertThat(actuals[2], is(startsWith("createTime: "))); - assertThat(actuals[3], is("description: the system dictionary for the unit tests")); - } - - @Test - @Ignore - public void printHeaderWithUserDict() throws IOException { - File inputFile = new File(temporaryFolder.getRoot(), "user.dic"); - String[] actuals; - try (ByteArrayOutputStream output = new ByteArrayOutputStream(); PrintStream ps = new PrintStream(output)) { - DictionaryHeaderPrinter.printHeader(inputFile.getPath(), ps); - actuals = output.toString().split(System.lineSeparator()); - } - assertThat(actuals.length, is(4)); - assertThat(actuals[0], is("filename: " + inputFile.getPath())); - assertThat(actuals[1], is("type: user dictionary")); - assertThat(actuals[2], is(startsWith("createTime: "))); - assertThat(actuals[3], is("description: ")); - } - - @Test - @Ignore - public void printHeaderWithInvalidFile() throws IOException { - File inputFile = new File(temporaryFolder.getRoot(), "unk.def"); - String[] actuals; - try (ByteArrayOutputStream output = new ByteArrayOutputStream(); PrintStream ps = new PrintStream(output)) { - DictionaryHeaderPrinter.printHeader(inputFile.getPath(), ps); - actuals = output.toString().split(System.lineSeparator()); - } - assertThat(actuals.length, is(2)); - assertThat(actuals[0], is("filename: " + inputFile.getPath().replaceAll("\r", ""))); - assertThat(actuals[1], is("invalid file")); - } -} \ No newline at end of file diff --git a/src/test/java/com/worksap/nlp/sudachi/dictionary/LegacyDictionaryPrinterTest.java b/src/test/java/com/worksap/nlp/sudachi/dictionary/LegacyDictionaryPrinterTest.java deleted file mode 100644 index 0c2d618e..00000000 --- a/src/test/java/com/worksap/nlp/sudachi/dictionary/LegacyDictionaryPrinterTest.java +++ /dev/null @@ -1,96 +0,0 @@ -/* - * Copyright (c) 2017-2022 Works Applications Co., Ltd. 
- * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package com.worksap.nlp.sudachi.dictionary; - -import static org.hamcrest.CoreMatchers.is; -import static org.hamcrest.MatcherAssert.assertThat; - -import java.io.ByteArrayOutputStream; -import java.io.File; -import java.io.IOException; -import java.io.PrintStream; -import java.nio.file.Path; - -import com.worksap.nlp.sudachi.TestDictionary; -import com.worksap.nlp.sudachi.Utils; - -import org.junit.Before; -import org.junit.Ignore; -import org.junit.Rule; -import org.junit.Test; -import org.junit.rules.TemporaryFolder; - -public class LegacyDictionaryPrinterTest { - - @Rule - public TemporaryFolder temporaryFolder = new TemporaryFolder(); - - @Before - public void setUp() throws IOException { - TestDictionary td = TestDictionary.INSTANCE; - Path folder = temporaryFolder.getRoot().toPath(); - td.getSystemDictData().writeData(folder.resolve("system.dic")); - td.getUserDict1Data().writeData(folder.resolve("user.dic")); - Utils.copyResource(folder, "/unk.def"); - } - - @Test - @Ignore - public void printWithSystemDict() throws IOException { - File inputFile = new File(temporaryFolder.getRoot(), "system.dic"); - String[] actuals; - try (ByteArrayOutputStream output = new ByteArrayOutputStream(); PrintStream ps = new PrintStream(output)) { - DictionaryPrinter.printDictionary(inputFile.getPath(), null, ps); - actuals = output.toString().split(System.lineSeparator()); - } - assertThat(actuals.length, is(39)); 
- assertThat(actuals[0], is("た,1,1,8729,た,助動詞,*,*,*,助動詞-タ,終止形-一般,タ,た,*,A,*,*,*")); - } - - @Test - @Ignore - public void printWithUserDict() throws IOException { - File inputFile = new File(temporaryFolder.getRoot(), "user.dic"); - File systemDictFile = new File(temporaryFolder.getRoot(), "system.dic"); - try (BinaryDictionary systemDict = BinaryDictionary.loadSystem(systemDictFile.getPath())) { - String[] actuals; - try (ByteArrayOutputStream output = new ByteArrayOutputStream(); PrintStream ps = new PrintStream(output)) { - DictionaryPrinter.printDictionary(inputFile.getPath(), systemDict, ps); - actuals = output.toString().split(System.lineSeparator()); - } - assertThat(actuals.length, is(4)); - assertThat(actuals[2], is("東京府,6,6,2816,東京府,名詞,固有名詞,地名,一般,*,*,トウキョウフ,東京府,*,B,5/U1,*,5/U1")); - assertThat(actuals[3], is("すだち,6,6,2816,すだち,被子植物門,双子葉植物綱,ムクロジ目,ミカン科,ミカン属,スダチ,スダチ,すだち,*,A,*,*,*")); - } - } - - @Test(expected = IllegalArgumentException.class) - @Ignore - public void printWithUserDictWithoutGrammar() throws IOException { - File inputFile = new File(temporaryFolder.getRoot(), "user.dic"); - try (ByteArrayOutputStream output = new ByteArrayOutputStream(); PrintStream ps = new PrintStream(output)) { - DictionaryPrinter.printDictionary(inputFile.getPath(), null, ps); - } - } - - @Test(expected = IOException.class) - @Ignore - public void readGrammarWithInvalidFile() throws IOException { - File inputFile = new File(temporaryFolder.getRoot(), "unk.def"); - BinaryDictionary.loadSystem(inputFile.getPath()); - } -} \ No newline at end of file diff --git a/src/test/java/com/worksap/nlp/sudachi/dictionary/build/CsvLexiconTest.kt b/src/test/java/com/worksap/nlp/sudachi/dictionary/build/CsvLexiconTest.kt deleted file mode 100644 index d255ff6d..00000000 --- a/src/test/java/com/worksap/nlp/sudachi/dictionary/build/CsvLexiconTest.kt +++ /dev/null @@ -1,122 +0,0 @@ -/* - * Copyright (c) 2022 Works Applications Co., Ltd. 
- * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package com.worksap.nlp.sudachi.dictionary.build - -import kotlin.test.Ignore -import kotlin.test.Test -import kotlin.test.assertEquals -import kotlin.test.assertFails - -class CsvLexiconTest { - @Test - fun failEntryIsSmall() { - val clex = CsvLexicon(POSTable()) - (0..18).forEach { - val data = generateSequence { "a" }.take(it).toList() - assertFails { clex.parseLine(data) } - } - } - - @Test - fun failEntryHasTooLongString() { - val clex = CsvLexicon(POSTable()) - val data = "東京,1,1,2816,東京,名詞,固有名詞,地名,一般,*,*,トウキョウ,東京,*,A,*,*,*,*".split(",") - assertFails { - val copy = data.toList().toMutableList() - copy[0] = "a".repeat(DicBuffer.MAX_STRING + 1) - clex.parseLine(copy) - } - assertFails { - val copy = data.toList().toMutableList() - copy[4] = "a".repeat(DicBuffer.MAX_STRING + 1) - clex.parseLine(copy) - } - assertFails { - val copy = data.toList().toMutableList() - copy[11] = "a".repeat(DicBuffer.MAX_STRING + 1) - clex.parseLine(copy) - } - assertFails { - val copy = data.toList().toMutableList() - copy[12] = "a".repeat(DicBuffer.MAX_STRING + 1) - clex.parseLine(copy) - } - } - - @Test - fun failEmptyHeadword() { - val clex = CsvLexicon(POSTable()) - val data = ",1,1,2816,東京,名詞,固有名詞,地名,一般,*,*,トウキョウ,東京,*,A,*,*,*,*".split(",") - assertFails { clex.parseLine(data) } - } - - @Test - fun failInvalidSplitting() { - val clex = CsvLexicon(POSTable()) - assertFails { - 
clex.parseLine("a,1,1,2816,東京,名詞,固有名詞,地名,一般,*,*,トウキョウ,東京,*,A,1,*,*,*".split(",")) - } - assertFails { - clex.parseLine("a,1,1,2816,東京,名詞,固有名詞,地名,一般,*,*,トウキョウ,東京,*,A,*,1,*,*".split(",")) - } - } - - @Test - @Ignore - fun failTooManyUnits() { - val clex = CsvLexicon(POSTable()) - val data = "東京,1,1,2816,東京,名詞,固有名詞,地名,一般,*,*,トウキョウ,東京,*,C,*,*,*,*".split(",") - assertFails { - val copy = data.toList().toMutableList() - copy[15] = (0..256).joinToString("/") { it.toString() } - clex.parseLine(copy) - } - assertFails { - val copy = data.toList().toMutableList() - copy[16] = (0..256).joinToString("/") { it.toString() } - clex.parseLine(copy) - } - assertFails { - val copy = data.toList().toMutableList() - copy[17] = (0..256).joinToString("/") { it.toString() } - clex.parseLine(copy) - } - assertFails { - val copy = data.toList().toMutableList() - copy[18] = (0..256).joinToString("/") { it.toString() } - clex.parseLine(copy) - } - } - - @Test - fun unescape() { - assertEquals("test", Unescape.unescape("""test""")) - assertEquals("\u0000", Unescape.unescape("""\u0000""")) - assertEquals("a\u0000a", Unescape.unescape("""a\u0000a""")) - assertEquals("あ", Unescape.unescape("""\u3042""")) - assertEquals("あ5", Unescape.unescape("""\u30425""")) - assertEquals("💕", Unescape.unescape("""\u{1f495}""")) - assertEquals("a💕x", Unescape.unescape("""a\u{1f495}x""")) - assertEquals("\udbff\udfff", Unescape.unescape("""\u{10ffff}""")) - } - - @Test - fun unescapeFails() { - assertFails { Unescape.unescape("""\u{FFFFFF}""") } - assertFails { Unescape.unescape("""\u{110000}""") } // 0x10ffff is the largest codepoint - } -} diff --git a/src/test/java/com/worksap/nlp/sudachi/dictionary/build/DicBufferTest.kt b/src/test/java/com/worksap/nlp/sudachi/dictionary/build/DicBufferTest.kt deleted file mode 100644 index 757fd408..00000000 --- a/src/test/java/com/worksap/nlp/sudachi/dictionary/build/DicBufferTest.kt +++ /dev/null @@ -1,97 +0,0 @@ -/* - * Copyright (c) 2022 Works Applications Co., Ltd. 
- * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package com.worksap.nlp.sudachi.dictionary.build - -import java.nio.ByteOrder -import kotlin.test.* - -class DicBufferTest { - @Test - fun writeEmptyIntArray() { - val s = DicBuffer(1024) - s.putInts(intArrayOf()) - val bb = s.consume { it.duplicate() } - assertEquals(bb.remaining(), 1) - assertEquals(bb.get(), 0) - assertEquals(bb.remaining(), 0) - } - - @Test - fun writeIntArray() { - val s = DicBuffer(1024) - s.putInts(intArrayOf(1, 2, 3)) - val bb = s.consume { it.duplicate() } - bb.order(ByteOrder.LITTLE_ENDIAN) - assertEquals(bb.remaining(), 4 * 3 + 1) - assertEquals(bb.get(), 3) - assertEquals(bb.getInt(), 1) - assertEquals(bb.getInt(), 2) - assertEquals(bb.getInt(), 3) - assertEquals(bb.remaining(), 0) - } - - @Test - fun writeEmptyString() { - val s = DicBuffer(1024) - s.put("") - val bb = s.consume { it.duplicate() } - assertEquals(bb.remaining(), 1) - assertEquals(bb.get(), 0) - assertEquals(bb.remaining(), 0) - } - - @Test - fun writeSmallString() { - val s = DicBuffer(1024) - s.put("あ𠮟") - val bb = s.consume { it.duplicate() } - bb.order(ByteOrder.LITTLE_ENDIAN) - assertEquals(bb.remaining(), 1 + 2 * 3) - assertEquals(bb.get(), 3) - assertEquals(bb.getChar(), 'あ') - assertEquals(bb.getChar(), '\uD842') - assertEquals(bb.getChar(), '\uDF9F') - assertEquals(bb.remaining(), 0) - } - - @Test - fun writeLargeString() { - val s = DicBuffer(1024) - val str = "0123456789".repeat(20) - s.put(str) - 
val bb = s.consume { it.duplicate() } - bb.order(ByteOrder.LITTLE_ENDIAN) - val length = str.length - assertEquals(bb.remaining(), 2 + length * 2) - assertEquals(bb.get(), (length shr 8 or 0x80).toByte()) - assertEquals(bb.get(), (length and 0xff).toByte()) - } - - @Test - fun failWriteHugeString() { - val s = DicBuffer(1024) - val str = "0123456789".repeat(DicBuffer.MAX_STRING / 10 + 1) - assertFails { s.put(str) } - } - - @Test - fun checkedPut() { - val s = DicBuffer(10) - assertTrue { s.put("asdf") } - assertFalse { s.put("asdf") } - } -} diff --git a/src/test/java/com/worksap/nlp/sudachi/dictionary/build/GrammarTest.kt b/src/test/java/com/worksap/nlp/sudachi/dictionary/build/GrammarTest.kt index 50815866..b999c287 100644 --- a/src/test/java/com/worksap/nlp/sudachi/dictionary/build/GrammarTest.kt +++ b/src/test/java/com/worksap/nlp/sudachi/dictionary/build/GrammarTest.kt @@ -16,6 +16,8 @@ package com.worksap.nlp.sudachi.dictionary.build +import com.worksap.nlp.sudachi.dictionary.Blocks +import com.worksap.nlp.sudachi.dictionary.Description import com.worksap.nlp.sudachi.dictionary.GrammarImpl import com.worksap.nlp.sudachi.dictionary.POS import kotlin.test.assertEquals @@ -30,11 +32,17 @@ class GrammarTest { val pos = POSTable() assertEquals(0, pos.getId(POS("a", "b", "c", "d", "e", "f"))) val outbuf = MemChannel() - val out = ModelOutput(outbuf) - pos.writeTo(out) - cm.writeTo(out) - val gram = GrammarImpl(outbuf.buffer(), 0) - assertEquals(gram.getPartOfSpeechString(0), POS("a", "b", "c", "d", "e", "f")) + val layout = BlockLayout(outbuf) + layout.block(Blocks.POS_TABLE, pos::compile) + layout.block(Blocks.CONNECTION_MATRIX, cm::compile) + val description = Description() + description.setBlocks(layout.blocks()) + val grammar = GrammarImpl.load(outbuf.buffer(), description) + // val out = ModelOutput(outbuf) + // pos.writeTo(out) + // cm.writeTo(out) + // val gram = GrammarImpl(outbuf.buffer(), 0) + assertEquals(grammar.getPartOfSpeechString(0), POS("a", "b", 
"c", "d", "e", "f")) } @Test @@ -71,14 +79,23 @@ class GrammarTest { val cm = ConnectionMatrix() Res("test.matrix") { cm.readEntries(it) } val outbuf = MemChannel() - val out = ModelOutput(outbuf) - posTable.writeTo(out) - cm.writeTo(out) - val gram = GrammarImpl(outbuf.buffer(), 0) - assertEquals(gram.partOfSpeechSize, 1024) + + val layout = BlockLayout(outbuf) + layout.block(Blocks.POS_TABLE, posTable::compile) + layout.block(Blocks.CONNECTION_MATRIX, cm::compile) + val description = Description() + description.setBlocks(layout.blocks()) + val grammar = GrammarImpl.load(outbuf.buffer(), description) + + // val out = ModelOutput(outbuf) + // posTable.writeTo(out) + // cm.writeTo(out) + // val gram = GrammarImpl(outbuf.buffer(), 0) + + assertEquals(grammar.partOfSpeechSize, 1024) repeat(1024) { val pos = POS(e, e, e, e, e, it.toString()) - assertEquals(pos, gram.getPartOfSpeechString(it.toShort())) + assertEquals(pos, grammar.getPartOfSpeechString(it.toShort())) } } } diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/InMemoryChannel.java b/src/test/java/com/worksap/nlp/sudachi/dictionary/build/InMemoryChannel.java similarity index 100% rename from src/main/java/com/worksap/nlp/sudachi/dictionary/build/InMemoryChannel.java rename to src/test/java/com/worksap/nlp/sudachi/dictionary/build/InMemoryChannel.java diff --git a/src/test/java/com/worksap/nlp/sudachi/dictionary/build/ParametersTest.kt b/src/test/java/com/worksap/nlp/sudachi/dictionary/build/ParametersTest.kt deleted file mode 100644 index 04a84ba9..00000000 --- a/src/test/java/com/worksap/nlp/sudachi/dictionary/build/ParametersTest.kt +++ /dev/null @@ -1,41 +0,0 @@ -/* - * Copyright (c) 2017-2022 Works Applications Co., Ltd. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package com.worksap.nlp.sudachi.dictionary.build - -import kotlin.test.Test -import kotlin.test.assertEquals - -class ParametersTest { - @Test - fun resizeWorks() { - val params = Parameters(4) - params.add(1, 1, 1) - params.add(2, 2, 2) - val ch = MemChannel() - val out = ModelOutput(ch) - params.writeTo(out) - assertEquals(ch.position(), 12) - val b = ch.buffer() - assertEquals(b.short, 1) - assertEquals(b.short, 1) - assertEquals(b.short, 1) - assertEquals(b.short, 2) - assertEquals(b.short, 2) - assertEquals(b.short, 2) - assertEquals(b.remaining(), 0) - } -} diff --git a/src/test/java/com/worksap/nlp/sudachi/dictionary/build/RawLexiconReaderTest.kt b/src/test/java/com/worksap/nlp/sudachi/dictionary/build/RawLexiconReaderTest.kt index a1a11c16..337ed6ee 100644 --- a/src/test/java/com/worksap/nlp/sudachi/dictionary/build/RawLexiconReaderTest.kt +++ b/src/test/java/com/worksap/nlp/sudachi/dictionary/build/RawLexiconReaderTest.kt @@ -17,24 +17,31 @@ package com.worksap.nlp.sudachi.dictionary.build import com.worksap.nlp.sudachi.dictionary.CSVParser +import com.worksap.nlp.sudachi.dictionary.StringPtr import com.worksap.nlp.sudachi.resStream +import java.io.StringReader import kotlin.test.Test import kotlin.test.assertEquals +import kotlin.test.assertFails import kotlin.test.assertNotNull import kotlin.test.assertNull import kotlin.test.assertTrue class RawLexiconReaderTest { companion object { - fun csv(name: String): CSVParser { + fun csvfile(name: String): CSVParser { val stream = resStream(name) return CSVParser(stream.reader()) } + + 
fun csvtext(content: String): CSVParser { + return CSVParser(StringReader(content)) + } } @Test fun legacyCsvWithMinimumFields() { - val reader = RawLexiconReader(csv("legacy-minimum.csv"), POSTable(), false) + val reader = RawLexiconReader(csvfile("legacy-minimum.csv"), POSTable(), false) assertNotNull(reader.nextEntry()).let { e -> assertEquals("東京都", e.headword) assertEquals("トウキョウト", e.reading) @@ -47,7 +54,7 @@ class RawLexiconReaderTest { @Test fun legacyCsvWithAllFields() { - val reader = RawLexiconReader(csv("legacy-full.csv"), POSTable(), false) + val reader = RawLexiconReader(csvfile("legacy-full.csv"), POSTable(), false) assertNotNull(reader.nextEntry()).let { e -> assertEquals("東京都", e.headword) assertEquals("トウキョウト", e.reading) @@ -62,7 +69,7 @@ class RawLexiconReaderTest { @Test fun headerCsvAllFields() { - val reader = RawLexiconReader(csv("headers-all.csv"), POSTable(), false) + val reader = RawLexiconReader(csvfile("headers-all.csv"), POSTable(), false) assertNotNull(reader.nextEntry()).let { e -> assertEquals("東京都", e.headword) assertEquals("トウキョウト", e.reading) @@ -74,4 +81,126 @@ class RawLexiconReaderTest { } assertNull(reader.nextEntry()) } + + @Test + fun failMissingRequiredEntry() { + val columns = + "Surface,LeftId,RightId,Cost,pos1,pos2,pos3,pos4,pos5,pos6,reading_form,normalized_form,DictionaryForm,splita,splitb,wordstructure".split( + ",") + val values = "東京都,6,8,5320,名詞,固有名詞,地名,一般,*,*,トウキョウト,,,,,".split(",") + + for (i in columns.indices) { + val skipCols = columns.toMutableList() // copy + skipCols.removeAt(i) + val skipVals = values.toMutableList() // copy + skipVals.removeAt(i) + + val text = skipCols.joinToString(",") + "\n" + skipVals.joinToString(",") + assertFails { + val reader = RawLexiconReader(csvtext(text), POSTable(), false) + } + } + } + + @Test + fun failTooLongValue() { + val oversizeWord = "a".repeat(StringPtr.MAX_LENGTH + 1); + { + val text = + 
"""Surface,LeftId,RightId,Cost,pos1,pos2,pos3,pos4,pos5,pos6,reading_form,normalized_form,DictionaryForm,splita,splitb,splitC,wordstructure +${oversizeWord},6,8,5320,名詞,固有名詞,地名,一般,*,*,トウキョウト,,,1,,,""" + val reader = RawLexiconReader(csvtext(text), POSTable(), false) + assertFails { reader.nextEntry() } + } + { + val text = + """Surface,LeftId,RightId,Cost,pos1,pos2,pos3,pos4,pos5,pos6,reading_form,normalized_form,DictionaryForm,splita,splitb,splitC,wordstructure +東京都,6,8,5320,名詞,固有名詞,地名,一般,*,*,${oversizeWord},,,1,,,""" + val reader = RawLexiconReader(csvtext(text), POSTable(), false) + assertFails { reader.nextEntry() } + } + { + val text = + """Surface,LeftId,RightId,Cost,pos1,pos2,pos3,pos4,pos5,pos6,reading_form,normalized_form,DictionaryForm,splita,splitb,splitC,wordstructure +東京都,6,8,5320,名詞,固有名詞,地名,一般,*,*,トウキョウト,${oversizeWord},,1,,,""" + val reader = RawLexiconReader(csvtext(text), POSTable(), false) + assertFails { reader.nextEntry() } + } + } + + @Test + fun failEmptyHeadword() { + val text = + """Surface,LeftId,RightId,Cost,pos1,pos2,pos3,pos4,pos5,pos6,reading_form,normalized_form,DictionaryForm,splita,splitb,wordstructure +,6,8,5320,名詞,固有名詞,地名,一般,*,*,トウキョウト,,,,,""" + val reader = RawLexiconReader(csvtext(text), POSTable(), false) + assertFails { reader.nextEntry() } + } + + @Test + fun failSingleSplit() { + { + val text = + """Surface,LeftId,RightId,Cost,pos1,pos2,pos3,pos4,pos5,pos6,reading_form,normalized_form,DictionaryForm,splita,splitb,splitC,wordstructure +東京都,6,8,5320,名詞,固有名詞,地名,一般,*,*,トウキョウト,,,1,,,""" + val reader = RawLexiconReader(csvtext(text), POSTable(), false) + assertFails { reader.nextEntry() } + } + { + val text = + """Surface,LeftId,RightId,Cost,pos1,pos2,pos3,pos4,pos5,pos6,reading_form,normalized_form,DictionaryForm,splita,splitb,splitC,wordstructure +東京都,6,8,5320,名詞,固有名詞,地名,一般,*,*,トウキョウト,,,,1,,""" + val reader = RawLexiconReader(csvtext(text), POSTable(), false) + assertFails { reader.nextEntry() } + } + { + val text = + 
"""Surface,LeftId,RightId,Cost,pos1,pos2,pos3,pos4,pos5,pos6,reading_form,normalized_form,DictionaryForm,splita,splitb,splitC,wordstructure +東京都,6,8,5320,名詞,固有名詞,地名,一般,*,*,トウキョウト,,,,,1,""" + val reader = RawLexiconReader(csvtext(text), POSTable(), false) + assertFails { reader.nextEntry() } + } + { + val text = + """Surface,LeftId,RightId,Cost,pos1,pos2,pos3,pos4,pos5,pos6,reading_form,normalized_form,DictionaryForm,splita,splitb,splitC,wordstructure +東京都,6,8,5320,名詞,固有名詞,地名,一般,*,*,トウキョウト,,,,,,1""" + val reader = RawLexiconReader(csvtext(text), POSTable(), false) + assertFails { reader.nextEntry() } + } + } + + @Test + fun failTooManySplit() { + val oversizeSplit: String = + generateSequence { "1" }.take(Byte.MAX_VALUE.toInt() + 1).joinToString("/"); + + { + var text = + """Surface,LeftId,RightId,Cost,pos1,pos2,pos3,pos4,pos5,pos6,reading_form,normalized_form,DictionaryForm,splita,splitb,splitC,wordstructure +東京都,6,8,5320,名詞,固有名詞,地名,一般,*,*,トウキョウト,,,${oversizeSplit},,,""" + var reader = RawLexiconReader(csvtext(text), POSTable(), false) + assertFails { reader.nextEntry() } + } + { + val text = + """Surface,LeftId,RightId,Cost,pos1,pos2,pos3,pos4,pos5,pos6,reading_form,normalized_form,DictionaryForm,splita,splitb,splitC,wordstructure +東京都,6,8,5320,名詞,固有名詞,地名,一般,*,*,トウキョウト,,,,${oversizeSplit},,""" + val reader = RawLexiconReader(csvtext(text), POSTable(), false) + assertFails { reader.nextEntry() } + } + { + val text = + """Surface,LeftId,RightId,Cost,pos1,pos2,pos3,pos4,pos5,pos6,reading_form,normalized_form,DictionaryForm,splita,splitb,splitC,wordstructure +東京都,6,8,5320,名詞,固有名詞,地名,一般,*,*,トウキョウト,,,,,${oversizeSplit},""" + val reader = RawLexiconReader(csvtext(text), POSTable(), false) + assertFails { reader.nextEntry() } + } + { + val text = + """Surface,LeftId,RightId,Cost,pos1,pos2,pos3,pos4,pos5,pos6,reading_form,normalized_form,DictionaryForm,splita,splitb,splitC,wordstructure +東京都,6,8,5320,名詞,固有名詞,地名,一般,*,*,トウキョウト,,,,,,${oversizeSplit}""" + val reader = 
RawLexiconReader(csvtext(text), POSTable(), false) + assertFails { reader.nextEntry() } + } + } } diff --git a/src/test/java/com/worksap/nlp/sudachi/dictionary/build/UnescapeTest.kt b/src/test/java/com/worksap/nlp/sudachi/dictionary/build/UnescapeTest.kt new file mode 100644 index 00000000..79dbb79f --- /dev/null +++ b/src/test/java/com/worksap/nlp/sudachi/dictionary/build/UnescapeTest.kt @@ -0,0 +1,42 @@ +/* + * Copyright (c) 2022 Works Applications Co., Ltd. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package com.worksap.nlp.sudachi.dictionary.build + +import kotlin.test.Test +import kotlin.test.assertEquals +import kotlin.test.assertFails + +class UnescapeTest { + + @Test + fun unescape() { + assertEquals("test", Unescape.unescape("""test""")) + assertEquals("\u0000", Unescape.unescape("""\u0000""")) + assertEquals("a\u0000a", Unescape.unescape("""a\u0000a""")) + assertEquals("あ", Unescape.unescape("""\u3042""")) + assertEquals("あ5", Unescape.unescape("""\u30425""")) + assertEquals("💕", Unescape.unescape("""\u{1f495}""")) + assertEquals("a💕x", Unescape.unescape("""a\u{1f495}x""")) + assertEquals("\udbff\udfff", Unescape.unescape("""\u{10ffff}""")) + } + + @Test + fun unescapeFails() { + assertFails { Unescape.unescape("""\u{FFFFFF}""") } + assertFails { Unescape.unescape("""\u{110000}""") } // 0x10ffff is the largest codepoint + } +} From 1ec8cd1eb54ad82ae58f5c15cadc2ac40a487a4e Mon Sep 17 00:00:00 2001 From: mh-northlander Date: Thu, 1 Aug 2024 17:10:32 +0900 Subject: [PATCH 53/94] mv classes and vorbose CSVException --- .../nlp/sudachi/dictionary/Description.java | 4 +- ...sion.java => LegacyDictionaryVersion.java} | 8 ++- .../sudachi/dictionary/build/BlockOutput.java | 5 ++ .../dictionary/{ => build}/CSVParser.java | 2 +- .../dictionary/build/CsvFieldException.java | 4 +- .../sudachi/dictionary/build/DicBuilder.java | 5 ++ .../sudachi/dictionary/build/IOFunction.java | 24 ------- .../sudachi/dictionary/build/IOSupplier.java | 24 ------- .../sudachi/dictionary/build/RawLexicon.java | 1 - .../dictionary/build/RawLexiconReader.java | 64 +++++++++++++------ .../dictionary/build/RawWordEntry.java | 2 +- .../nlp/sudachi/dictionary/build/WordRef.java | 3 +- .../dictionary/{ => build}/CSVParserTest.java | 2 +- .../dictionary/build/RawLexiconReaderTest.kt | 1 - 14 files changed, 67 insertions(+), 82 deletions(-) rename src/main/java/com/worksap/nlp/sudachi/dictionary/{DictionaryVersion.java => LegacyDictionaryVersion.java} (91%) rename 
src/main/java/com/worksap/nlp/sudachi/dictionary/{ => build}/CSVParser.java (99%) delete mode 100644 src/main/java/com/worksap/nlp/sudachi/dictionary/build/IOFunction.java delete mode 100644 src/main/java/com/worksap/nlp/sudachi/dictionary/build/IOSupplier.java rename src/test/java/com/worksap/nlp/sudachi/dictionary/{ => build}/CSVParserTest.java (99%) diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/Description.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/Description.java index 6c478ff5..43eeaf4e 100644 --- a/src/main/java/com/worksap/nlp/sudachi/dictionary/Description.java +++ b/src/main/java/com/worksap/nlp/sudachi/dictionary/Description.java @@ -211,10 +211,10 @@ private static void checkMagic(ByteBuffer raw) { private static void checkLegacyDictionaryFormat(ByteBuffer raw) { long version = raw.getLong(0); - if (DictionaryVersion.isSystemDictionary(version)) { + if (LegacyDictionaryVersion.isSystemDictionary(version)) { throw new IllegalArgumentException("passed dictionary is a legacy system dictionary, please rebuild it"); } - if (DictionaryVersion.isUserDictionary(version)) { + if (LegacyDictionaryVersion.isUserDictionary(version)) { throw new IllegalArgumentException("passed dictionary is a legacy user dictionary, please rebuild it"); } } diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/DictionaryVersion.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/LegacyDictionaryVersion.java similarity index 91% rename from src/main/java/com/worksap/nlp/sudachi/dictionary/DictionaryVersion.java rename to src/main/java/com/worksap/nlp/sudachi/dictionary/LegacyDictionaryVersion.java index 88b99eba..6510f681 100644 --- a/src/main/java/com/worksap/nlp/sudachi/dictionary/DictionaryVersion.java +++ b/src/main/java/com/worksap/nlp/sudachi/dictionary/LegacyDictionaryVersion.java @@ -17,11 +17,13 @@ package com.worksap.nlp.sudachi.dictionary; /** - * Versions of dictionaries. + * Versions of (legacy) dictionaries. 
+ * + * This is kept to detect legacy dictionary binary. */ -public class DictionaryVersion { +public class LegacyDictionaryVersion { - private DictionaryVersion() { + private LegacyDictionaryVersion() { } /** the first version of system dictionries */ diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/BlockOutput.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/BlockOutput.java index c5e66297..7fb7465d 100644 --- a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/BlockOutput.java +++ b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/BlockOutput.java @@ -44,6 +44,11 @@ public Progress getProgress() { return progress; } + @FunctionalInterface + public interface IOFunction { + R apply(T arg) throws IOException; + } + /** * Function decorator to measure output progress. * diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/CSVParser.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/CSVParser.java similarity index 99% rename from src/main/java/com/worksap/nlp/sudachi/dictionary/CSVParser.java rename to src/main/java/com/worksap/nlp/sudachi/dictionary/build/CSVParser.java index debeeec1..839c226a 100644 --- a/src/main/java/com/worksap/nlp/sudachi/dictionary/CSVParser.java +++ b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/CSVParser.java @@ -14,7 +14,7 @@ * limitations under the License. 
*/ -package com.worksap.nlp.sudachi.dictionary; +package com.worksap.nlp.sudachi.dictionary.build; import java.io.BufferedReader; import java.io.Closeable; diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/CsvFieldException.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/CsvFieldException.java index cad55231..5434ec94 100644 --- a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/CsvFieldException.java +++ b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/CsvFieldException.java @@ -17,7 +17,7 @@ package com.worksap.nlp.sudachi.dictionary.build; public class CsvFieldException extends IllegalArgumentException { - public CsvFieldException(String s) { - super(s); + public CsvFieldException(String file, int line, String column, Exception cause) { + super(String.format("[%s line %d, %s]", file, line, column), cause); } } diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/DicBuilder.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/DicBuilder.java index addf0c5c..864d3107 100644 --- a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/DicBuilder.java +++ b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/DicBuilder.java @@ -39,6 +39,11 @@ * for the format of the CSV dictionary. */ public class DicBuilder { + @FunctionalInterface + public interface IOSupplier { + T get() throws IOException; + } + private DicBuilder() { // no instances } diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/IOFunction.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/IOFunction.java deleted file mode 100644 index fdfcda9a..00000000 --- a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/IOFunction.java +++ /dev/null @@ -1,24 +0,0 @@ -/* - * Copyright (c) 2022 Works Applications Co., Ltd. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package com.worksap.nlp.sudachi.dictionary.build; - -import java.io.IOException; - -@FunctionalInterface -public interface IOFunction { - R apply(T arg) throws IOException; -} diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/IOSupplier.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/IOSupplier.java deleted file mode 100644 index ba7351e5..00000000 --- a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/IOSupplier.java +++ /dev/null @@ -1,24 +0,0 @@ -/* - * Copyright (c) 2022 Works Applications Co., Ltd. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package com.worksap.nlp.sudachi.dictionary.build; - -import java.io.IOException; - -@FunctionalInterface -public interface IOSupplier { - T get() throws IOException; -} diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/RawLexicon.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/RawLexicon.java index 50247320..60475099 100644 --- a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/RawLexicon.java +++ b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/RawLexicon.java @@ -17,7 +17,6 @@ package com.worksap.nlp.sudachi.dictionary.build; import com.worksap.nlp.sudachi.dictionary.Blocks; -import com.worksap.nlp.sudachi.dictionary.CSVParser; import com.worksap.nlp.sudachi.dictionary.DoubleArrayLexicon; import com.worksap.nlp.sudachi.dictionary.Ints; import com.worksap.nlp.sudachi.dictionary.Lexicon; diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/RawLexiconReader.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/RawLexiconReader.java index 9b9e69e4..8dfbbe85 100644 --- a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/RawLexiconReader.java +++ b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/RawLexiconReader.java @@ -16,7 +16,6 @@ package com.worksap.nlp.sudachi.dictionary.build; -import com.worksap.nlp.sudachi.dictionary.CSVParser; import com.worksap.nlp.sudachi.dictionary.Ints; import com.worksap.nlp.sudachi.dictionary.POS; @@ -31,7 +30,6 @@ * Reader for the lexicon csv file. */ public class RawLexiconReader { - /** * Enum order is in legacy csv order. If a header is present, fields will be * reordered with respect to the header. 
@@ -103,14 +101,15 @@ private void resolveColumnLayout() throws IOException { continue outer; } } - throw new IllegalArgumentException(String.format("column [%s] is not recognized", field)); + throw new CsvFieldException(parser.getName(), 0, field, + new IllegalArgumentException("Invalid column name")); } for (Column column : remaining) { if (column.required) { StringJoiner joiner = new StringJoiner(", ", "required columns [", "] were not present in the header"); remaining.stream().filter(c -> c.required).forEach(c -> joiner.add(c.name())); - throw new IllegalArgumentException(joiner.toString()); + throw new CsvFieldException(parser.getName(), 0, "", new IllegalArgumentException(joiner.toString())); } } @@ -125,8 +124,8 @@ private String get(List data, Column column, boolean unescape) { } if (index < 0 || index >= data.size()) { if (column.required) { - throw new CsvFieldException( - String.format("column [%s] (index=%d) was not present", column.name(), index)); + throw new CsvFieldException(parser.getName(), parser.getRow(), column.name(), + new IllegalArgumentException(String.format("column [%s] was not present", column.name()))); } else { return ""; } @@ -139,14 +138,23 @@ private String get(List data, Column column, boolean unescape) { } } + private String getNonEmpty(List data, Column column, boolean unescape) { + String value = get(data, column, unescape); + if (value.isEmpty()) { + throw new CsvFieldException(parser.getName(), parser.getRow(), column.name(), + new IllegalArgumentException(String.format("Column %s cannot be empty", column.name()))); + } + return value; + } + /** parse specified column as short */ private short getShort(List data, Column column) { String value = get(data, column, false); try { return Short.parseShort(value); } catch (NumberFormatException e) { - throw new CsvFieldException( - String.format("failed to parse '%s' as a short value in column: %s", value, column.name())); + throw new CsvFieldException(parser.getName(), 
parser.getRow(), column.name(), + new IllegalArgumentException(String.format("failed to parse '%s' as a short value", value))); } } @@ -158,7 +166,8 @@ private Ints getInts(List data, Column column) { } String[] parts = value.split("/"); if (parts.length > Byte.MAX_VALUE) { - throw new IllegalArgumentException("int list contained more than 127 entries: " + value); + throw new CsvFieldException(parser.getName(), parser.getRow(), column.name(), + new IllegalArgumentException("int list contained more than 127 entries: " + value)); } Ints result = new Ints(parts.length); for (String part : parts) { @@ -168,36 +177,48 @@ private Ints getInts(List data, Column column) { } /** parse specified column as WordRef list. */ - private List getWordRefs(List data, Column column, WordRef.Parser parser) { + private List getWordRefs(List data, Column column, WordRef.Parser refParser) { String value = get(data, column, false); if (value == null || value.isEmpty() || "*".equals(value)) { return new ArrayList<>(); } String[] parts = value.split("/"); if (parts.length > Byte.MAX_VALUE) { - throw new IllegalArgumentException("reference list contained more than 127 entries: " + value); + throw new CsvFieldException(parser.getName(), parser.getRow(), column.name(), + new IllegalArgumentException("reference list contained more than 127 entries: " + value)); } List result = new ArrayList<>(parts.length); for (String part : parts) { - result.add(parser.parse(part)); + try { + result.add(refParser.parse(part)); + } catch (IllegalArgumentException e) { + throw new CsvFieldException(parser.getName(), parser.getRow(), column.name(), e); + } } return result; } + /** parse specified column as WordRef. 
*/ + private WordRef getWordRef(List data, Column column, WordRef.Parser refParser) { + String value = get(data, column, false); + try { + return refParser.parse(value); + } catch (IllegalArgumentException e) { + throw new CsvFieldException(parser.getName(), parser.getRow(), column.name(), e); + } + } + /** convert csv row to RawWordEntry */ private RawWordEntry convertEntry(List data) { RawWordEntry entry = new RawWordEntry(); - entry.headword = get(data, Column.Surface, true); - if (entry.headword.isEmpty()) { - throw new IllegalArgumentException("headword cannot be empty"); - } + entry.headword = getNonEmpty(data, Column.Surface, true); entry.leftId = getShort(data, Column.LeftId); entry.rightId = getShort(data, Column.RightId); entry.cost = getShort(data, Column.Cost); entry.reading = get(data, Column.ReadingForm, true); - WordRef normalizedForm = normRefParser.parse(get(data, Column.NormalizedForm, false)); + WordRef normalizedForm = getWordRef(data, Column.NormalizedForm, normRefParser); if (normalizedForm instanceof WordRef.Headword && ((WordRef.Headword) normalizedForm).getHeadword().equals(entry.headword)) { // mark as self-reference (headword ref may point different entry) @@ -205,7 +226,7 @@ private RawWordEntry convertEntry(List data) { } else { entry.normalizedForm = normalizedForm; } - entry.dictionaryForm = dictRefParser.parse(get(data, Column.DictionaryForm, false)); + entry.dictionaryForm = getWordRef(data, Column.DictionaryForm, dictRefParser); POS pos = new POS( // comment for line break @@ -222,8 +243,11 @@ private RawWordEntry convertEntry(List data) { entry.synonymGroups = getInts(data, Column.SynonymGroups); entry.userData = get(data, Column.UserData, true); - entry.validate(); - + try { + entry.validate(); + } catch (IllegalArgumentException e) { + throw new CsvFieldException(parser.getName(), parser.getRow(), "", e); + } return entry; } diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/RawWordEntry.java 
b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/RawWordEntry.java index d2bb68c5..5691aecf 100644 --- a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/RawWordEntry.java +++ b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/RawWordEntry.java @@ -96,7 +96,7 @@ public String headword() { private void checkString(String value, String name) { if (value.length() > StringPtr.MAX_LENGTH) { - throw new CsvFieldException( + throw new IllegalArgumentException( String.format("field %s had value which exceeded the maximum length %d (actual length: %d)", name, StringPtr.MAX_LENGTH, value.length())); } diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/WordRef.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/WordRef.java index 9d83dfac..045cb882 100644 --- a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/WordRef.java +++ b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/WordRef.java @@ -226,8 +226,7 @@ public WordRef parse(String text) { if (allowHeadword) { return new Headword(Unescape.unescape(text)); } else { - throw new CsvFieldException( - String.format("invalid word reference: %s, it must contain POS tag and reading", text)); + throw new IllegalArgumentException(String.format("invalid word reference: %s", text)); } } diff --git a/src/test/java/com/worksap/nlp/sudachi/dictionary/CSVParserTest.java b/src/test/java/com/worksap/nlp/sudachi/dictionary/build/CSVParserTest.java similarity index 99% rename from src/test/java/com/worksap/nlp/sudachi/dictionary/CSVParserTest.java rename to src/test/java/com/worksap/nlp/sudachi/dictionary/build/CSVParserTest.java index ba63f5cd..84f0e647 100644 --- a/src/test/java/com/worksap/nlp/sudachi/dictionary/CSVParserTest.java +++ b/src/test/java/com/worksap/nlp/sudachi/dictionary/build/CSVParserTest.java @@ -14,7 +14,7 @@ * limitations under the License. 
*/ -package com.worksap.nlp.sudachi.dictionary; +package com.worksap.nlp.sudachi.dictionary.build; import static org.hamcrest.Matchers.contains; import static org.hamcrest.MatcherAssert.assertThat; diff --git a/src/test/java/com/worksap/nlp/sudachi/dictionary/build/RawLexiconReaderTest.kt b/src/test/java/com/worksap/nlp/sudachi/dictionary/build/RawLexiconReaderTest.kt index 337ed6ee..cee382cf 100644 --- a/src/test/java/com/worksap/nlp/sudachi/dictionary/build/RawLexiconReaderTest.kt +++ b/src/test/java/com/worksap/nlp/sudachi/dictionary/build/RawLexiconReaderTest.kt @@ -16,7 +16,6 @@ package com.worksap.nlp.sudachi.dictionary.build -import com.worksap.nlp.sudachi.dictionary.CSVParser import com.worksap.nlp.sudachi.dictionary.StringPtr import com.worksap.nlp.sudachi.resStream import java.io.StringReader From f6cdada0da1996bb5e142fe4e206686a100ee513 Mon Sep 17 00:00:00 2001 From: mh-northlander Date: Thu, 1 Aug 2024 17:16:58 +0900 Subject: [PATCH 54/94] mv stderr progress --- .../sudachi/dictionary/DictionaryBuilder.java | 38 +----------------- .../sudachi/dictionary/DictionaryPrinter.java | 3 +- .../dictionary/UserDictionaryBuilder.java | 3 +- .../sudachi/dictionary/build/Progress.java | 40 +++++++++++++++++++ 4 files changed, 44 insertions(+), 40 deletions(-) diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/DictionaryBuilder.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/DictionaryBuilder.java index 3613a46f..715382e8 100644 --- a/src/main/java/com/worksap/nlp/sudachi/dictionary/DictionaryBuilder.java +++ b/src/main/java/com/worksap/nlp/sudachi/dictionary/DictionaryBuilder.java @@ -94,8 +94,8 @@ public static void main(String[] args) throws IOException { List lexiconPaths = Arrays.asList(args).subList(i, args.length); - DicBuilder.System builder = DicBuilder.system().progress(new Progress(20, new StderrProgress())) - .matrix(Paths.get(matrixPath)).comment(description); + DicBuilder.System builder = 
DicBuilder.system().progress(Progress.syserr(20)).matrix(Paths.get(matrixPath)) + .comment(description); if (signature != null) { builder.signature(signature); @@ -111,38 +111,4 @@ public static void main(String[] args) throws IOException { } } - public static class StderrProgress implements Progress.Callback { - float last = 0; - String unit = "bytes"; - - @Override - public void start(String name, Progress.Kind kind) { - System.err.printf("%s\t", name); - last = 0; - switch (kind) { - case BYTE: - unit = "bytes"; - break; - case ENTRY: - unit = "entries"; - break; - } - } - - @Override - public void progress(float progress) { - while (last < progress) { - last += 0.05f; - System.err.print("."); - } - } - - static final double NANOS_PER_SECOND = 1000_000_000; - - @Override - public void end(long size, Duration time) { - double seconds = time.getSeconds() + time.getNano() / NANOS_PER_SECOND; - System.err.printf("\tDone! (%d %s, %.3f sec)%n", size, unit, seconds); - } - } } diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/DictionaryPrinter.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/DictionaryPrinter.java index 334eeccb..e6dc457c 100644 --- a/src/main/java/com/worksap/nlp/sudachi/dictionary/DictionaryPrinter.java +++ b/src/main/java/com/worksap/nlp/sudachi/dictionary/DictionaryPrinter.java @@ -18,7 +18,6 @@ import com.worksap.nlp.sudachi.WordId; import com.worksap.nlp.sudachi.dictionary.build.Progress; -import com.worksap.nlp.sudachi.dictionary.DictionaryBuilder.StderrProgress; import com.worksap.nlp.sudachi.dictionary.build.RawLexiconReader.Column; import java.io.IOException; @@ -36,7 +35,7 @@ public class DictionaryPrinter { public final String WordRefJoinerStr = String.valueOf(WordRefJoiner); private final PrintStream output; - private final Progress progress = new Progress(20, new StderrProgress()); + private final Progress progress = Progress.syserr(20); private final GrammarImpl grammar; private final LexiconSet lex; diff --git 
a/src/main/java/com/worksap/nlp/sudachi/dictionary/UserDictionaryBuilder.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/UserDictionaryBuilder.java index 83d0cc0b..08515af1 100644 --- a/src/main/java/com/worksap/nlp/sudachi/dictionary/UserDictionaryBuilder.java +++ b/src/main/java/com/worksap/nlp/sudachi/dictionary/UserDictionaryBuilder.java @@ -88,8 +88,7 @@ public static void main(String[] args) throws IOException { List lexiconPaths = Arrays.asList(args).subList(i, args.length); try (BinaryDictionary system = new BinaryDictionary(sysDictPath)) { - DicBuilder.User builder = DicBuilder.user() - .progress(new Progress(20, new DictionaryBuilder.StderrProgress())).system(system) + DicBuilder.User builder = DicBuilder.user().progress(Progress.syserr(20)).system(system) .comment(description); for (String lexicon : lexiconPaths) { diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/Progress.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/Progress.java index d7ef6ed4..becc546a 100644 --- a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/Progress.java +++ b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/Progress.java @@ -37,6 +37,11 @@ public class Progress { public static final Progress NOOP = new Progress(1, progress -> { }); + /** Progress with stderr. 
*/ + public static final Progress syserr(int maxUpdates) { + return new Progress(maxUpdates, new StderrProgress()); + } + public Progress(int maxUpdates, Callback callback) { this.maxUpdates = maxUpdates; this.callback = callback; @@ -140,4 +145,39 @@ default void start(String name, Kind kind) { default void end(long size, Duration time) { } } + + public static class StderrProgress implements Callback { + float last = 0; + String unit = "bytes"; + + @Override + public void start(String name, Progress.Kind kind) { + System.err.printf("%s\t", name); + last = 0; + switch (kind) { + case BYTE: + unit = "bytes"; + break; + case ENTRY: + unit = "entries"; + break; + } + } + + @Override + public void progress(float progress) { + while (last < progress) { + last += 0.05f; + System.err.print("."); + } + } + + static final double NANOS_PER_SECOND = 1000_000_000; + + @Override + public void end(long size, Duration time) { + double seconds = time.getSeconds() + time.getNano() / NANOS_PER_SECOND; + System.err.printf("\tDone! 
(%d %s, %.3f sec)%n", size, unit, seconds); + } + } } From b8121a611faf2381cf7f398e5af0e56e0e66d6fe Mon Sep 17 00:00:00 2001 From: mh-northlander Date: Fri, 2 Aug 2024 10:00:54 +0900 Subject: [PATCH 55/94] add pos table printer --- .../nlp/sudachi/SudachiCommandLine.java | 4 +- .../dictionary/DictionaryGrammarPrinter.java | 110 ++++++++++++++++++ .../sudachi/dictionary/DictionaryPrinter.java | 10 +- 3 files changed, 120 insertions(+), 4 deletions(-) create mode 100644 src/main/java/com/worksap/nlp/sudachi/dictionary/DictionaryGrammarPrinter.java diff --git a/src/main/java/com/worksap/nlp/sudachi/SudachiCommandLine.java b/src/main/java/com/worksap/nlp/sudachi/SudachiCommandLine.java index 23f91ad6..2912ad5f 100644 --- a/src/main/java/com/worksap/nlp/sudachi/SudachiCommandLine.java +++ b/src/main/java/com/worksap/nlp/sudachi/SudachiCommandLine.java @@ -268,11 +268,11 @@ public static void main(String[] args) throws IOException { return; } else if (args[i].equals("--userDict")) { Path resolved = anchor.resolve(args[++i]); - logger.fine(() -> "using system dict: " + resolved); + logger.fine(() -> "using user dict: " + resolved); additional = additional.addUserDictionary(resolved); } else if (args[i].equals("--systemDict")) { Path resolved = anchor.resolve(args[++i]); - logger.fine(() -> "using user dict: " + resolved); + logger.fine(() -> "using system dict: " + resolved); additional = additional.systemDictionary(resolved); } else if (args[i].equals("--format")) { formatterKind = args[++i]; diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/DictionaryGrammarPrinter.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/DictionaryGrammarPrinter.java new file mode 100644 index 00000000..482df5cb --- /dev/null +++ b/src/main/java/com/worksap/nlp/sudachi/dictionary/DictionaryGrammarPrinter.java @@ -0,0 +1,110 @@ +/* + * Copyright (c) 2021 Works Applications Co., Ltd. 
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.worksap.nlp.sudachi.dictionary; + +import java.io.Console; +import java.io.IOException; +import java.io.PrintStream; +import java.nio.file.Path; +import java.nio.file.Paths; + +import com.worksap.nlp.sudachi.PathAnchor; +import com.worksap.nlp.sudachi.Config; +import com.worksap.nlp.sudachi.DictionaryFactory; +import com.worksap.nlp.sudachi.Settings; + +/** + * A dictionary grammar printing tool. + */ +public class DictionaryGrammarPrinter { + private DictionaryGrammarPrinter() { + } + + static void printUsage() { + Console console = System.console(); + console.printf("usage: DictionaryGrammarPrinter [-r file] \n"); + console.printf("\t-r file\tread settings from file (overrides -s)\n"); + console.printf("\t-s string\tadditional settings (overrides -r)\n"); + console.printf("\t-p directory\troot directory of resources\n"); + console.printf("\t--systemDict file\tpath to a system dictionary (overrides everything)\n"); + console.printf("\t-u file\tpath to an additional user dictionary (appended to -s)\n"); + } + + static void printPos(GrammarImpl grammar, PrintStream output) throws IOException { + int numPos = grammar.getPartOfSpeechSize(); + for (int i = 0; i < numPos; i++) { + POS pos = grammar.getPartOfSpeechString((short) i); + output.println(pos.toString()); + } + } + + /** + * Prints the contents of dictionary grammar. 
+ * + * Specify the target dictionary in the same way to SudachiCommandline. + * Currently it can only print POS table. + * + * @param args + * the input filenames + * @throws IOException + * if IO fails + */ + public static void main(String[] args) throws IOException { + PathAnchor anchor = PathAnchor.classpath().andThen(PathAnchor.none()); + Settings current = Settings.resolvedBy(anchor) + .read(DictionaryGrammarPrinter.class.getClassLoader().getResource("sudachi.json")); + Config additional = Config.empty(); + + int i; + for (i = 0; i < args.length; i++) { + if (args[i].equals("-h")) { + printUsage(); + return; + } else if (args[i].equals("-r") && i + 1 < args.length) { + Path configPath = Paths.get(args[++i]); + Path parent = configPath.getParent(); + if (parent == null) { // parent directory of file.txt unfortunately is null :( + parent = Paths.get(""); + } + PathAnchor curAnchor = PathAnchor.filesystem(parent).andThen(PathAnchor.classpath()); + additional = Config.fromFile(configPath, curAnchor).withFallback(additional); + } else if (args[i].equals("-p") && i + 1 < args.length) { + String resourcesDirectory = args[++i]; + anchor = PathAnchor.filesystem(Paths.get(resourcesDirectory)).andThen(PathAnchor.classpath()); + // first resolve wrt new directory + current = Settings.resolvedBy(anchor).withFallback(current); + } else if (args[i].equals("-s") && i + 1 < args.length) { + Config other = Config.fromJsonString(args[++i], anchor); + additional = other.withFallback(additional); + } else if (args[i].equals("-u")) { + Path resolved = anchor.resolve(args[++i]); + additional = additional.addUserDictionary(resolved); + } else if (args[i].equals("--systemDict")) { + Path resolved = anchor.resolve(args[++i]); + additional = additional.systemDictionary(resolved); + } else { + break; + } + } + + Config config = additional.withFallback(Config.fromSettings(current)); + DictionaryAccess dict = (DictionaryAccess) new DictionaryFactory().create(config); + GrammarImpl grammar 
= dict.getGrammar(); + + printPos(grammar, System.out); + } +} diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/DictionaryPrinter.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/DictionaryPrinter.java index e6dc457c..bce6fc73 100644 --- a/src/main/java/com/worksap/nlp/sudachi/dictionary/DictionaryPrinter.java +++ b/src/main/java/com/worksap/nlp/sudachi/dictionary/DictionaryPrinter.java @@ -20,6 +20,7 @@ import com.worksap.nlp.sudachi.dictionary.build.Progress; import com.worksap.nlp.sudachi.dictionary.build.RawLexiconReader.Column; +import java.io.Console; import java.io.IOException; import java.io.PrintStream; import java.util.ArrayList; @@ -69,6 +70,12 @@ private DictionaryPrinter(PrintStream output, BinaryDictionary dic, BinaryDictio wordIds = allIds; } + static void printUsage() { + Console console = System.console(); + console.printf("usage: PrintDictionary [-s file] file\n"); + console.printf("\t-s file\tsystem dictionary\n"); + } + void printHeader() { // @formatter:off printColumnHeaders(Column.Surface, Column.LeftId, Column.RightId, Column.Cost, Column.Pos1, Column.Pos2, @@ -306,8 +313,7 @@ public static void main(String[] args) throws IOException { if (args[i].equals("-s") && i + 1 < args.length) { systemDict = BinaryDictionary.loadSystem(args[++i]); } else if (args[i].equals("-h")) { - System.err.println("usage: PrintDictionary [-s file] file"); - System.err.println("\t-s file\tsystem dictionary"); + printUsage(); return; } else { break; From 1f3f350fc1598d60c15e1a3739defcf6302046d6 Mon Sep 17 00:00:00 2001 From: mh-northlander Date: Fri, 2 Aug 2024 10:57:05 +0900 Subject: [PATCH 56/94] enable dict builder to load pos from file --- .../sudachi/dictionary/DictionaryBuilder.java | 10 ++- .../sudachi/dictionary/build/DicBuilder.java | 76 ++++++++++++++----- .../sudachi/dictionary/build/POSTable.java | 63 ++++++++++++--- 3 files changed, 117 insertions(+), 32 deletions(-) diff --git 
a/src/main/java/com/worksap/nlp/sudachi/dictionary/DictionaryBuilder.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/DictionaryBuilder.java index 715382e8..f4020e2d 100644 --- a/src/main/java/com/worksap/nlp/sudachi/dictionary/DictionaryBuilder.java +++ b/src/main/java/com/worksap/nlp/sudachi/dictionary/DictionaryBuilder.java @@ -22,7 +22,6 @@ import java.nio.file.Files; import java.nio.file.Paths; import java.nio.file.StandardOpenOption; -import java.time.Duration; import java.util.Arrays; import java.util.List; @@ -42,6 +41,7 @@ static void printUsage() { console.printf("\t-o file\toutput to file\n"); console.printf("\t-m file\tmatrix file\n"); console.printf("\t-d description\tcomment\n"); + console.printf("\t-p file\tpos file (optional)\n"); console.printf("\t-s signature\tsignature\n"); } @@ -67,6 +67,7 @@ public static void main(String[] args) throws IOException { String description = ""; String outputPath = null; String matrixPath = null; + String posPath = null; String signature = null; int i; @@ -75,6 +76,8 @@ public static void main(String[] args) throws IOException { outputPath = args[++i]; } else if (args[i].equals("-m") && i + 1 < args.length) { matrixPath = args[++i]; + } else if (args[i].equals("-p") && i + 1 < args.length) { + posPath = args[++i]; } else if (args[i].equals("-d") && i + 1 < args.length) { description = args[++i]; } else if (args[i].equals("-s")) { @@ -96,11 +99,12 @@ public static void main(String[] args) throws IOException { DicBuilder.System builder = DicBuilder.system().progress(Progress.syserr(20)).matrix(Paths.get(matrixPath)) .comment(description); - + if (posPath != null) { + builder = builder.posTable(Paths.get(posPath)); + } if (signature != null) { builder.signature(signature); } - for (String lexiconPath : lexiconPaths) { builder = builder.lexicon(Paths.get(lexiconPath)); } diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/DicBuilder.java 
b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/DicBuilder.java index 864d3107..cab76f08 100644 --- a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/DicBuilder.java +++ b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/DicBuilder.java @@ -49,17 +49,62 @@ private DicBuilder() { } private static class Base> { - protected final POSTable pos = new POSTable(); - protected final ConnectionMatrix connection = new ConnectionMatrix(); protected Progress progress = Progress.NOOP; - protected RawLexicon lexicon = new RawLexicon(); protected final Description description = new Description(); + protected final ConnectionMatrix connection = new ConnectionMatrix(); + protected final POSTable pos = new POSTable(); + protected final RawLexicon lexicon = new RawLexicon(); @SuppressWarnings("unchecked") private T self() { return (T) this; } + /** + * Set the progress handler to the provided one. + * + * @param progress + * progress handler + * @return current object + */ + public T progress(Progress progress) { + this.progress = Objects.requireNonNull(progress); + return self(); + } + + /** + * Read POS list from the csv file. + */ + public T posTable(String name, IOSupplier input, long size) throws IOException { + progress.startBlock(name, nanoTime(), Progress.Kind.ENTRY); + int nRead; + try (InputStream is = input.get()) { + InputStream stream = new ProgressInputStream(is, size, progress); + nRead = pos.readEntries(stream); + } + progress.endBlock(nRead, nanoTime()); + return self(); + } + + /** + * Read POS list from the csv file. + */ + public T posTable(URL url) throws IOException { + String name = url.getPath(); + URLConnection conn = url.openConnection(); + long size = conn.getContentLengthLong(); + return posTable(name, conn::getInputStream, size); + } + + /** + * Read POS list from the csv file. 
+ */ + public T posTable(Path path) throws IOException { + String name = path.getFileName().toString(); + long size = Files.size(path); + return posTable(name, () -> Files.newInputStream(path), size); + } + /** * Import words from the csv lexicon into the binary dictionary compiler. * @@ -123,18 +168,6 @@ public T lexicon(Path path) throws IOException { return lexicon(name, () -> Files.newInputStream(path), size); } - /** - * Set the progress handler to the provided one - * - * @param progress - * handler - * @return current object - */ - public T progress(Progress progress) { - this.progress = Objects.requireNonNull(progress); - return self(); - } - /** * Set the comment string in the binary dictionary * @@ -188,6 +221,9 @@ public void build(SeekableByteChannel channel) throws IOException { * Instanciate via SystemNoMatrix. */ public static final class System extends Base { + /** + * Read connection matrix from MeCab matrix.def format text file. + */ private System readMatrix(String name, IOSupplier input, long size) throws IOException { progress.startBlock(name, nanoTime(), Progress.Kind.BYTE); try (InputStream is = input.get()) { @@ -285,7 +321,7 @@ public DicBuilder.System matrix(Path path) throws IOException { * Set the progress handler to the provided one * * @param progress - * handler + * progress handler * @return current object */ public SystemNoMatrix progress(Progress progress) { @@ -297,7 +333,7 @@ public SystemNoMatrix progress(Progress progress) { /** * Create a new system dictionary compiler * - * @return new dictionary compiler object + * @return new system dictionary compiler object */ public static SystemNoMatrix system() { return new SystemNoMatrix(new System()); @@ -309,6 +345,10 @@ public static SystemNoMatrix system() { * Instanciate via UserNoSystem. */ public static final class User extends Base { + /** + * Preload data from given system dictionary. 
+ * + */ public User system(BinaryDictionary system) { progress.startBlock("system dict entries", nanoTime(), Progress.Kind.ENTRY); int nread = lexicon.preloadFrom(system.getLexicon(), progress); @@ -350,7 +390,7 @@ public DicBuilder.User system(BinaryDictionary system) { * Set the progress handler to the provided one * * @param progress - * handler + * progress handler * @return current object */ public UserNoSystem progress(Progress progress) { diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/POSTable.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/POSTable.java index 3323e485..1867f674 100644 --- a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/POSTable.java +++ b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/POSTable.java @@ -19,7 +19,8 @@ import com.worksap.nlp.sudachi.dictionary.Grammar; import com.worksap.nlp.sudachi.dictionary.POS; -import java.io.IOException; +import java.io.*; +import java.nio.charset.StandardCharsets; import java.util.ArrayList; import java.util.HashMap; import java.util.List; @@ -28,8 +29,11 @@ * Dictionary parts: List of part-of-speeches. */ public class POSTable { + final static int MAX_POS_NUMBER = Short.MAX_VALUE; + private final List table = new ArrayList<>(); private final HashMap lookup = new HashMap<>(); + // number of pos loaded from the system dictionary. private int builtin = 0; /** @@ -41,7 +45,7 @@ public class POSTable { short getId(POS s) { return lookup.computeIfAbsent(s, p -> { int next = table.size(); - if (next >= Short.MAX_VALUE) { + if (next >= MAX_POS_NUMBER) { throw new IllegalArgumentException("maximum POS number exceeded by " + s); } table.add(s); @@ -49,13 +53,26 @@ short getId(POS s) { }); } + /** @return full POS list that contains builtin and newly added POSs */ + List getList() { + return table; + } + + /** + * @return number of non-builtin POSs. 
+ */ + public int ownedLength() { + return table.size() - builtin; + } + /** * Load pos table from the grammar (of the system dictionary). They are * considered as built-in pos. * * @param grammar + * @return number read. */ - public void preloadFrom(Grammar grammar) { + public int preloadFrom(Grammar grammar) { int partOfSpeechSize = grammar.getPartOfSpeechSize(); for (short i = 0; i < partOfSpeechSize; ++i) { POS pos = grammar.getPartOfSpeechString(i); @@ -63,18 +80,42 @@ public void preloadFrom(Grammar grammar) { lookup.put(pos, i); } builtin += partOfSpeechSize; - } - - /** @return full POS list that contains builtin and newly added POSs */ - List getList() { - return table; + return partOfSpeechSize; } /** - * @return number of non-builtin POSs. + * Load pos table from the text format. + * + * Assume 6-column csv without header. + * + * @param data + * @return number read. */ - public int ownedLength() { - return table.size() - builtin; + public int readEntries(InputStream data) throws IOException { + LineNumberReader reader = new LineNumberReader(new InputStreamReader(data, StandardCharsets.UTF_8)); + + int baseSize = table.size(); + int numLines = 0; + while (true) { + String line = reader.readLine(); + if (line == null) { + break; + } + + String[] cols = line.split(","); + if (cols.length != 6) { + throw new InputFileException(numLines, line, + new IllegalArgumentException("each POS must have 6 columns.")); + } + + int posid = getId(new POS(cols)); + if (posid != baseSize + numLines) { + throw new InputFileException(numLines, line, + new IllegalArgumentException(String.format("POS already exists (%s).", posid))); + } + numLines += 1; + } + return numLines; } /** From 4c5b3f0f9d23c70675db8758219aba1d5f09cf61 Mon Sep 17 00:00:00 2001 From: mh-northlander Date: Fri, 2 Aug 2024 16:55:40 +0900 Subject: [PATCH 57/94] let lexcon csv have either (or both) pos-id or pos-parts column. 
--- .../dictionary/build/RawLexiconReader.java | 91 ++++++++++++++----- 1 file changed, 68 insertions(+), 23 deletions(-) diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/RawLexiconReader.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/RawLexiconReader.java index 8dfbbe85..b43f99ce 100644 --- a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/RawLexiconReader.java +++ b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/RawLexiconReader.java @@ -35,10 +35,10 @@ public class RawLexiconReader { * reordered with respect to the header. */ public enum Column { - Surface(true), LeftId(true), RightId(true), Cost(true), Writing(false), Pos1(true), Pos2(true), Pos3( - true), Pos4(true), Pos5(true), Pos6(true), ReadingForm(true), NormalizedForm(true), DictionaryForm( - true), Mode(false), SplitA(true), SplitB( - true), WordStructure(true), SynonymGroups(false), SplitC(false), UserData(false); + Surface(true), LeftId(true), RightId(true), Cost(true), Writing(false), Pos1(false), Pos2(false), Pos3( + false), Pos4(false), Pos5(false), Pos6(false), ReadingForm(true), NormalizedForm(true), DictionaryForm( + true), Mode(false), SplitA(true), SplitB(true), WordStructure( + true), SynonymGroups(false), SplitC(false), UserData(false), PosId(false); private final boolean required; @@ -54,6 +54,8 @@ public enum Column { private final WordRef.Parser normRefParser; // for normalized form private final WordRef.Parser dictRefParser; // for dictionary form private final WordRef.Parser splitParser; // for splits + private boolean posIdExists = false; + private boolean posStrExists = true; public RawLexiconReader(CSVParser parser, POSTable pos, boolean user) throws IOException { this.parser = parser; @@ -93,7 +95,7 @@ private void resolveColumnLayout() throws IOException { outer: for (int fieldId = 0; fieldId < record.size(); ++fieldId) { String field = record.get(fieldId).replaceAll("_", ""); - for (int colId = 0; colId < record.size(); ++colId) { + 
for (int colId = 0; colId < remaining.size(); ++colId) { Column col = remaining.get(colId); if (col.name().equalsIgnoreCase(field)) { mapping[col.ordinal()] = fieldId; @@ -113,6 +115,20 @@ private void resolveColumnLayout() throws IOException { } } + this.posIdExists = mapping[Column.PosId.ordinal()] >= 0; + long numPosColumnsFound = Arrays + .asList(Column.Pos1, Column.Pos2, Column.Pos3, Column.Pos4, Column.Pos5, Column.Pos6).stream() + .filter(c -> mapping[c.ordinal()] >= 0).count(); + if (numPosColumnsFound != 0 && numPosColumnsFound != POS.DEPTH) { + throw new CsvFieldException(parser.getName(), 0, "POS", + new IllegalArgumentException("Pos1 ~ Pos6 columns must appear as a set.")); + } + this.posStrExists = numPosColumnsFound == POS.DEPTH; + if (!posIdExists && !posStrExists) { + throw new CsvFieldException(parser.getName(), 0, "POS", + new IllegalArgumentException("Both or either PosId column or Pos1~Pos6 columns are required.")); + } + this.mapping = mapping; } @@ -198,14 +214,54 @@ private List getWordRefs(List data, Column column, WordRef.Pars return result; } - /** parse specified column as WordRef. */ - private WordRef getWordRef(List data, Column column, WordRef.Parser refParser) { + /** parse specified column as WordRef, also checks self-reference. */ + private WordRef getWordRef(List data, Column column, WordRef.Parser refParser, RawWordEntry entry) { String value = get(data, column, false); + WordRef ref; try { - return refParser.parse(value); + ref = refParser.parse(value); } catch (IllegalArgumentException e) { throw new CsvFieldException(parser.getName(), parser.getRow(), column.name(), e); } + + // if parsed ref seems to refering current entry, return self-reference (null), + // because headword/triple ref may resolved to other entry. 
+ if (ref instanceof WordRef.Headword) { + WordRef.Headword headword = (WordRef.Headword) ref; + if (headword.getHeadword().equals(entry.headword)) { + return null; + } + } else if (ref instanceof WordRef.Triple) { + WordRef.Triple triple = (WordRef.Triple) ref; + if (triple.getHeadword().equals(entry.headword) && triple.getPosId() == entry.posId + && triple.getReading().equals(entry.reading)) { + return null; + } + } + return ref; + } + + /** parse POS columns. */ + private short getPos(List data) { + short posId = -1; + short posStrId = -1; + + if (this.posIdExists) { + posId = getShort(data, Column.PosId); + } + if (this.posStrExists) { + POS pos = new POS( + // comment for line break + get(data, Column.Pos1, true), get(data, Column.Pos2, true), get(data, Column.Pos3, true), + get(data, Column.Pos4, true), get(data, Column.Pos5, true), get(data, Column.Pos6, true)); + posStrId = posTable.getId(pos); + } + if (this.posIdExists && this.posStrExists && posId != posStrId) { + throw new CsvFieldException(parser.getName(), parser.getRow(), "POS", new IllegalArgumentException( + String.format("PosId (%d) and id from Pos1-6 (%d) does not match.", posId, posStrId))); + } + + return this.posIdExists ? 
posId : posStrId; } /** convert csv row to RawWordEntry */ @@ -218,22 +274,11 @@ private RawWordEntry convertEntry(List data) { entry.cost = getShort(data, Column.Cost); entry.reading = get(data, Column.ReadingForm, true); - WordRef normalizedForm = getWordRef(data, Column.NormalizedForm, normRefParser); - if (normalizedForm instanceof WordRef.Headword - && ((WordRef.Headword) normalizedForm).getHeadword().equals(entry.headword)) { - // mark as self-reference (headword ref may point different entry) - entry.normalizedForm = null; - } else { - entry.normalizedForm = normalizedForm; - } - entry.dictionaryForm = getWordRef(data, Column.DictionaryForm, dictRefParser); - - POS pos = new POS( - // comment for line break - get(data, Column.Pos1, true), get(data, Column.Pos2, true), get(data, Column.Pos3, true), - get(data, Column.Pos4, true), get(data, Column.Pos5, true), get(data, Column.Pos6, true)); + entry.posId = getPos(data); - entry.posId = posTable.getId(pos); + // headword, pos, reading must be parsed before these. 
+ entry.normalizedForm = getWordRef(data, Column.NormalizedForm, normRefParser, entry); + entry.dictionaryForm = getWordRef(data, Column.DictionaryForm, dictRefParser, entry); entry.mode = get(data, Column.Mode, false); entry.aUnitSplit = getWordRefs(data, Column.SplitA, splitParser); From e0b92625154c956f58b5e6ac48fa6a4569fb7b0f Mon Sep 17 00:00:00 2001 From: mh-northlander Date: Fri, 2 Aug 2024 18:42:56 +0900 Subject: [PATCH 58/94] try to satisfy sonarcloud --- .../nlp/sudachi/JoinKatakanaOovPlugin.java | 44 ++++---- .../com/worksap/nlp/sudachi/LatticeImpl.java | 6 +- .../nlp/sudachi/MeCabOovProviderPlugin.java | 1 - .../nlp/sudachi/OovProviderPlugin.java | 5 +- .../nlp/sudachi/PathRewritePlugin.java | 1 - .../nlp/sudachi/SimpleOovProviderPlugin.java | 1 - .../com/worksap/nlp/sudachi/StringUtil.java | 7 +- .../sudachi/dictionary/BinaryDictionary.java | 2 +- .../nlp/sudachi/dictionary/Blocks.java | 3 + .../nlp/sudachi/dictionary/Description.java | 8 +- .../sudachi/dictionary/DictionaryBuilder.java | 38 ++++--- .../dictionary/DictionaryGrammarPrinter.java | 10 +- .../sudachi/dictionary/DictionaryPrinter.java | 36 +++--- .../worksap/nlp/sudachi/dictionary/Ints.java | 4 +- .../nlp/sudachi/dictionary/StringPtr.java | 3 +- .../nlp/sudachi/dictionary/WordIdTable.java | 9 +- .../nlp/sudachi/dictionary/WordInfo.java | 4 +- .../nlp/sudachi/dictionary/build/Align.java | 9 +- .../sudachi/dictionary/build/BlockLayout.java | 4 +- .../sudachi/dictionary/build/CSVParser.java | 16 +-- .../dictionary/build/ConnectionMatrix.java | 2 +- .../sudachi/dictionary/build/DicBuilder.java | 4 +- .../nlp/sudachi/dictionary/build/Index.java | 6 +- .../sudachi/dictionary/build/POSTable.java | 4 +- .../sudachi/dictionary/build/Progress.java | 9 +- .../sudachi/dictionary/build/RawLexicon.java | 12 +- .../dictionary/build/RawLexiconReader.java | 106 +++++++++--------- .../dictionary/build/StringStorage.java | 8 +- .../sudachi/dictionary/build/Unescape.java | 3 + 
.../build/UnicodeBufferResizeable.java | 2 - .../sudachi/dictionary/build/WordLayout.java | 51 ++++++--- .../nlp/sudachi/dictionary/build/WordRef.java | 17 +++ .../dictionary/DictionaryPrinterTest.kt | 4 +- .../dictionary/build/CSVParserTest.java | 66 +++++------ 34 files changed, 280 insertions(+), 225 deletions(-) diff --git a/src/main/java/com/worksap/nlp/sudachi/JoinKatakanaOovPlugin.java b/src/main/java/com/worksap/nlp/sudachi/JoinKatakanaOovPlugin.java index d0451973..cef00ef1 100644 --- a/src/main/java/com/worksap/nlp/sudachi/JoinKatakanaOovPlugin.java +++ b/src/main/java/com/worksap/nlp/sudachi/JoinKatakanaOovPlugin.java @@ -75,31 +75,33 @@ public void setOovFactory(short oovPosId) { public void rewrite(InputText text, List path, Lattice lattice) { for (int i = 0; i < path.size(); i++) { LatticeNode node = path.get(i); - if ((node.isOOV() || isShorter(minLength, text, node)) && isKatakanaNode(text, node)) { - int begin = i - 1; - for (; begin >= 0; begin--) { - if (!isKatakanaNode(text, path.get(begin))) { - begin++; - break; - } - } - if (begin < 0) { - begin = 0; - } - int end = i + 1; - for (; end < path.size(); end++) { - if (!isKatakanaNode(text, path.get(end))) { - break; - } - } - while (begin != end && !canOovBowNode(text, path.get(begin))) { + if ((!node.isOOV() && !isShorter(minLength, text, node)) || !isKatakanaNode(text, node)) { + continue; + } + + int begin = i - 1; + for (; begin >= 0; begin--) { + if (!isKatakanaNode(text, path.get(begin))) { begin++; + break; } - if (end - begin > 1) { - concatenateOov(path, begin, end, factory, lattice); - i = begin + 1; + } + if (begin < 0) { + begin = 0; + } + int end = i + 1; + for (; end < path.size(); end++) { + if (!isKatakanaNode(text, path.get(end))) { + break; } } + while (begin != end && !canOovBowNode(text, path.get(begin))) { + begin++; + } + if (end - begin > 1) { + concatenateOov(path, begin, end, factory, lattice); + i = begin + 1; + } } } diff --git 
a/src/main/java/com/worksap/nlp/sudachi/LatticeImpl.java b/src/main/java/com/worksap/nlp/sudachi/LatticeImpl.java index 60dd6498..73a5624c 100644 --- a/src/main/java/com/worksap/nlp/sudachi/LatticeImpl.java +++ b/src/main/java/com/worksap/nlp/sudachi/LatticeImpl.java @@ -98,10 +98,8 @@ public LatticeNodeImpl getMinimumNode(int begin, int end) { ArrayList ends = endLists.get(end); LatticeNodeImpl result = null; for (LatticeNodeImpl node : ends) { - if (node.begin == begin) { - if (result == null || result.totalCost >= node.cost) { - result = node; - } + if (node.begin == begin && (result == null || result.totalCost >= node.cost)) { + result = node; } } return result; diff --git a/src/main/java/com/worksap/nlp/sudachi/MeCabOovProviderPlugin.java b/src/main/java/com/worksap/nlp/sudachi/MeCabOovProviderPlugin.java index 7b8dad6a..59647981 100644 --- a/src/main/java/com/worksap/nlp/sudachi/MeCabOovProviderPlugin.java +++ b/src/main/java/com/worksap/nlp/sudachi/MeCabOovProviderPlugin.java @@ -19,7 +19,6 @@ import com.worksap.nlp.sudachi.dictionary.CategoryType; import com.worksap.nlp.sudachi.dictionary.Grammar; import com.worksap.nlp.sudachi.dictionary.POS; -import com.worksap.nlp.sudachi.dictionary.WordInfo; import java.io.IOException; import java.io.InputStream; diff --git a/src/main/java/com/worksap/nlp/sudachi/OovProviderPlugin.java b/src/main/java/com/worksap/nlp/sudachi/OovProviderPlugin.java index bd091493..a74dcf70 100644 --- a/src/main/java/com/worksap/nlp/sudachi/OovProviderPlugin.java +++ b/src/main/java/com/worksap/nlp/sudachi/OovProviderPlugin.java @@ -92,9 +92,10 @@ int getOOV(UTF8InputText inputText, int offset, long otherWords, List lexiconPaths, Path outputPath) throws IOException { + DicBuilder.System builder = DicBuilder.system().progress(Progress.syserr(20)).matrix(matrixPath) + .comment(description); + if (posPath != null) { + builder = builder.posTable(Paths.get(posPath)); + } + if (signature != null) { + builder = builder.signature(signature); + } 
+ for (String lexiconPath : lexiconPaths) { + builder = builder.lexicon(Paths.get(lexiconPath)); + } + + try (SeekableByteChannel ch = Files.newByteChannel(outputPath, StandardOpenOption.WRITE, + StandardOpenOption.CREATE, StandardOpenOption.TRUNCATE_EXISTING)) { + builder.build(ch); + } + } + /** * Builds the system dictionary. *

@@ -97,22 +118,7 @@ public static void main(String[] args) throws IOException { List lexiconPaths = Arrays.asList(args).subList(i, args.length); - DicBuilder.System builder = DicBuilder.system().progress(Progress.syserr(20)).matrix(Paths.get(matrixPath)) - .comment(description); - if (posPath != null) { - builder = builder.posTable(Paths.get(posPath)); - } - if (signature != null) { - builder.signature(signature); - } - for (String lexiconPath : lexiconPaths) { - builder = builder.lexicon(Paths.get(lexiconPath)); - } - - try (SeekableByteChannel ch = Files.newByteChannel(Paths.get(outputPath), StandardOpenOption.WRITE, - StandardOpenOption.CREATE, StandardOpenOption.TRUNCATE_EXISTING)) { - builder.build(ch); - } + build(Paths.get(matrixPath), description, posPath, signature, lexiconPaths, Paths.get(outputPath)); } } diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/DictionaryGrammarPrinter.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/DictionaryGrammarPrinter.java index 482df5cb..c875f997 100644 --- a/src/main/java/com/worksap/nlp/sudachi/dictionary/DictionaryGrammarPrinter.java +++ b/src/main/java/com/worksap/nlp/sudachi/dictionary/DictionaryGrammarPrinter.java @@ -25,6 +25,7 @@ import com.worksap.nlp.sudachi.PathAnchor; import com.worksap.nlp.sudachi.Config; import com.worksap.nlp.sudachi.DictionaryFactory; +import com.worksap.nlp.sudachi.Dictionary; import com.worksap.nlp.sudachi.Settings; /** @@ -44,7 +45,7 @@ static void printUsage() { console.printf("\t-u file\tpath to an additional user dictionary (appended to -s)\n"); } - static void printPos(GrammarImpl grammar, PrintStream output) throws IOException { + static void printPos(GrammarImpl grammar, PrintStream output) { int numPos = grammar.getPartOfSpeechSize(); for (int i = 0; i < numPos; i++) { POS pos = grammar.getPartOfSpeechString((short) i); @@ -102,9 +103,10 @@ public static void main(String[] args) throws IOException { } Config config = 
additional.withFallback(Config.fromSettings(current)); - DictionaryAccess dict = (DictionaryAccess) new DictionaryFactory().create(config); - GrammarImpl grammar = dict.getGrammar(); - printPos(grammar, System.out); + try (Dictionary dict = new DictionaryFactory().create(config)) { + GrammarImpl grammar = ((DictionaryAccess) dict).getGrammar(); + printPos(grammar, System.out); + } } } diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/DictionaryPrinter.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/DictionaryPrinter.java index bce6fc73..a43a6f43 100644 --- a/src/main/java/com/worksap/nlp/sudachi/dictionary/DictionaryPrinter.java +++ b/src/main/java/com/worksap/nlp/sudachi/dictionary/DictionaryPrinter.java @@ -30,10 +30,10 @@ import java.util.stream.Collectors; public class DictionaryPrinter { - public final char WordRefDelimiter = '/'; - public final String WordRefDelimiterStr = String.valueOf(WordRefDelimiter); - public final char WordRefJoiner = ','; - public final String WordRefJoinerStr = String.valueOf(WordRefJoiner); + public static final char wordRefDelimiter = '/'; + public static final String wordRefDelimiterStr = String.valueOf(wordRefDelimiter); + public static final char wordRefJoiner = ','; + public static final String wordRefJoinerStr = String.valueOf(wordRefJoiner); private final PrintStream output; private final Progress progress = Progress.syserr(20); @@ -78,10 +78,10 @@ static void printUsage() { void printHeader() { // @formatter:off - printColumnHeaders(Column.Surface, Column.LeftId, Column.RightId, Column.Cost, Column.Pos1, Column.Pos2, - Column.Pos3, Column.Pos4, Column.Pos5, Column.Pos6, Column.ReadingForm, Column.NormalizedForm, - Column.DictionaryForm, Column.SplitA, Column.SplitB, Column.SplitC, Column.WordStructure, - Column.SynonymGroups, Column.UserData); + printColumnHeaders(Column.SURFACE, Column.LEFT_ID, Column.RIGHT_ID, Column.COST, Column.POS1, Column.POS2, + Column.POS3, Column.POS4, Column.POS5, Column.POS6, 
Column.READING_FORM, Column.NORMALIZED_FORM, + Column.DICTIONARY_FORM, Column.SPLIT_A, Column.SPLIT_B, Column.SPLIT_C, Column.WORD_STRUCTURE, + Column.SYNONYM_GROUPS, Column.USER_DATA); // @formatter:on } @@ -176,10 +176,7 @@ String wordRef(int wordId) { parts.addAll(pos); parts.add(reading); - // escape special chars - String wordRefTriple = String.join(WordRefJoinerStr, - parts.stream().map(p -> maybeEscapeRefPart(p)).collect(Collectors.toList())); - return wordRefTriple; + return String.join(wordRefJoinerStr, parts.stream().map(this::maybeEscapeRefPart).collect(Collectors.toList())); } /** encode word entry pointed by the wordId as WordRef.Headword. */ @@ -189,17 +186,16 @@ String wordRefHeadword(int wordId, int reference) { } int dic = WordId.dic(wordId); WordInfo info = lex.getWordInfo(wordId); - String surface = lex.string(dic, info.getSurface()); - return surface; + return lex.string(dic, info.getSurface()); } String wordRefList(int[] wordIds) { - return String.join(WordRefDelimiterStr, - Arrays.stream(wordIds).boxed().map(wi -> wordRef(wi)).collect(Collectors.toList())); + return String.join(wordRefDelimiterStr, + Arrays.stream(wordIds).boxed().map(this::wordRef).collect(Collectors.toList())); } String intList(int[] ints) { - return String.join("/", Arrays.stream(ints).boxed().map(i -> i.toString()).collect(Collectors.toList())); + return String.join("/", Arrays.stream(ints).boxed().map(Object::toString).collect(Collectors.toList())); } private static boolean hasCh(String value, int ch) { @@ -221,12 +217,12 @@ private String maybeEscapeString(String value) { /** escape WordRef.Triple part. 
*/ private String maybeEscapeRefPart(String value) { - boolean hasDelimiter = hasCh(value, WordRefDelimiter); - boolean hasJoiner = hasCh(value, WordRefJoiner); + boolean hasDelimiter = hasCh(value, wordRefDelimiter); + boolean hasJoiner = hasCh(value, wordRefJoiner); if (!hasDelimiter && !hasJoiner) { return value; } - return unicodeEscape(value, Arrays.asList(WordRefDelimiter, WordRefJoiner)); + return unicodeEscape(value, Arrays.asList(wordRefDelimiter, wordRefJoiner)); } /** escape specified chars as unicode codepoint */ diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/Ints.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/Ints.java index f4a82c9f..5b362d62 100644 --- a/src/main/java/com/worksap/nlp/sudachi/dictionary/Ints.java +++ b/src/main/java/com/worksap/nlp/sudachi/dictionary/Ints.java @@ -40,7 +40,9 @@ public Ints(int capacity) { } public int get(int index) { - assert index < length; + if (index >= length) { + throw new IndexOutOfBoundsException(String.format("index %d is larger than Ints length %d", index, length)); + } return data[index]; } diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/StringPtr.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/StringPtr.java index 5c4b631d..a3cdc788 100644 --- a/src/main/java/com/worksap/nlp/sudachi/dictionary/StringPtr.java +++ b/src/main/java/com/worksap/nlp/sudachi/dictionary/StringPtr.java @@ -152,8 +152,7 @@ public String toString() { public boolean isSubseqValid(int start, int end) { int realStart = offset + start; - int length = end - start; - return isValid(realStart, length); + return isValid(realStart, end - start); } public StringPtr subPtr(int start, int end) { diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/WordIdTable.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/WordIdTable.java index e4fa18a2..b114afc3 100644 --- a/src/main/java/com/worksap/nlp/sudachi/dictionary/WordIdTable.java +++ 
b/src/main/java/com/worksap/nlp/sudachi/dictionary/WordIdTable.java @@ -19,6 +19,8 @@ import com.worksap.nlp.sudachi.WordId; import java.nio.ByteBuffer; +import java.nio.BufferUnderflowException; +import java.util.NoSuchElementException; import java.util.Iterator; class WordIdTable { @@ -99,7 +101,12 @@ public boolean hasNext() { @Override public Ints next() { BufReader r = buf; - int size = r.readVarint32(); + int size; + try { + size = r.readVarint32(); + } catch (BufferUnderflowException e) { + throw new NoSuchElementException(); + } ints.clear(); int[] data = ints.prepare(size); readDeltaCompressed(data, size, dicIdMask, r); diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/WordInfo.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/WordInfo.java index a16c0aba..8f6e94f4 100644 --- a/src/main/java/com/worksap/nlp/sudachi/dictionary/WordInfo.java +++ b/src/main/java/com/worksap/nlp/sudachi/dictionary/WordInfo.java @@ -249,9 +249,7 @@ public static int readingForm(ByteBuffer buffer, int pos) { // see dictionary.build.WordEntryLayout private WordInfo(ByteBuffer buffer, int pos) { - // short leftId = buffer.getShort(pos); - // short rightId = buffer.getShort(pos + 2); - // short cost = buffer.getShort(pos + 4); + // first 2*3 bytes are filled by word paramters. 
// do not modify buffer metadata for better performance posId = buffer.getShort(pos + 6); diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/Align.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/Align.java index fb2feae7..bc11c29b 100644 --- a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/Align.java +++ b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/Align.java @@ -57,8 +57,13 @@ public static int align(int value, int alignment) { * @return aligned value, it should be greater or equal than the passed value */ public static long align(long value, long alignment) { - assert isPowerOf2(alignment); - assert value >= 0; + if (!isPowerOf2(alignment)) { + throw new IllegalArgumentException( + String.format("alignment must be power of 2, but actualy: %d", alignment)); + } + if (value < 0) { + throw new IllegalArgumentException(String.format("value must not be negative, but actualy: %d", value)); + } // Compute alignment mask, it is the inverse of the mask for the bits that must // be 0 for alignment to be correct diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/BlockLayout.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/BlockLayout.java index 9d228e24..c52ed86c 100644 --- a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/BlockLayout.java +++ b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/BlockLayout.java @@ -34,12 +34,12 @@ public class BlockLayout { private final Progress progress; private final List info = new ArrayList<>(); - public BlockLayout(SeekableByteChannel channel, Progress progress) throws IOException { + public BlockLayout(SeekableByteChannel channel, Progress progress) { this.channel = channel; this.progress = progress; } - public BlockLayout(SeekableByteChannel channel) throws IOException { + public BlockLayout(SeekableByteChannel channel) { this.channel = channel; this.progress = Progress.NOOP; } diff --git 
a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/CSVParser.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/CSVParser.java index 839c226a..48af20af 100644 --- a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/CSVParser.java +++ b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/CSVParser.java @@ -56,7 +56,7 @@ enum Type { private BufferedReader reader; private Deque tokenBuffer = new ArrayDeque<>(); private boolean hasNextField = false; - private int row = -1; + private int rowCount = -1; private String name = ""; @@ -69,22 +69,22 @@ public void close() throws IOException { reader.close(); } - public List getNextRecord() throws IOException { - List record = new ArrayList<>(); + public List getNextRow() throws IOException { + List row = new ArrayList<>(); hasNextField = false; String field; while ((field = getField()) != null) { if (field.equals("\n")) { - row += 1; - return record; + rowCount += 1; + return row; } - record.add(field); + row.add(field); } return null; } - public int getRow() { - return row; + public int getRowCount() { + return rowCount; } private String getField() throws IOException { diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/ConnectionMatrix.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/ConnectionMatrix.java index ab557865..227819d0 100644 --- a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/ConnectionMatrix.java +++ b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/ConnectionMatrix.java @@ -162,7 +162,7 @@ public boolean nonEmpty() { * @throws IOException */ public Void compile(BlockOutput out) throws IOException { - return out.measured("Connection Matrix", (p) -> { + return out.measured("Connection Matrix", p -> { out.getChannel().write(compiled.duplicate()); return null; }); diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/DicBuilder.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/DicBuilder.java index cab76f08..72aa403b 
100644 --- a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/DicBuilder.java +++ b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/DicBuilder.java @@ -129,7 +129,7 @@ public T lexicon(String name, IOSupplier input, long size) throws I InputStream stream = new ProgressInputStream(is, size, progress); lexicon.read(name, stream, pos, numLeft, numRight); } - progress.endBlock(lexicon.getTotalEntries() - numEntryBefore, nanoTime()); + progress.endBlock((long) lexicon.getTotalEntries() - numEntryBefore, nanoTime()); return self(); } @@ -207,7 +207,7 @@ public void build(SeekableByteChannel channel) throws IOException { layout.block(Blocks.CONNECTION_MATRIX, connection::compile); } layout.block(Blocks.POS_TABLE, pos::compile); - lexicon.compile(pos, layout); + lexicon.compile(layout); description.setBlocks(layout.blocks()); description.setNumberOfEntries(lexicon.getIndexedEntries(), lexicon.getTotalEntries()); description.setRuntimeCosts(lexicon.hasRuntimeCosts()); diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/Index.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/Index.java index ee0a0a9f..28cc61e2 100644 --- a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/Index.java +++ b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/Index.java @@ -69,7 +69,7 @@ public int add(String key, int wordId) { * @throws IOException */ public void compile(BlockLayout layout, List notIndexed) throws IOException { - TrieData data = layout.block(Blocks.WORD_POINTERS, (o) -> writeWordTable(o, notIndexed)); + TrieData data = layout.block(Blocks.WORD_POINTERS, o -> writeWordTable(o, notIndexed)); layout.block(Blocks.TRIE_INDEX, data::writeTrie); } @@ -83,7 +83,7 @@ private TrieData writeWordTable(BlockOutput out, List n int nis = notIndexed.size(); int fullsize = size + nis; - out.measured("Word Id table", (p) -> { + out.measured("Word Id table", p -> { int i = 0; for (Map.Entry entry : this.elements.entrySet()) { keys[i] = 
entry.getKey(); @@ -142,7 +142,7 @@ public TrieData(byte[][] keys, int[] values) { * @throws IOException */ public Void writeTrie(BlockOutput block) throws IOException { - return block.measured("Trie Index", (p) -> { + return block.measured("Trie Index", p -> { DoubleArray trie = new DoubleArray(); trie.build(keys, values, p::progress); ByteBuffer buf = trie.byteArray().duplicate(); diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/POSTable.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/POSTable.java index 1867f674..17cfb496 100644 --- a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/POSTable.java +++ b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/POSTable.java @@ -29,7 +29,7 @@ * Dictionary parts: List of part-of-speeches. */ public class POSTable { - final static int MAX_POS_NUMBER = Short.MAX_VALUE; + static final int MAX_POS_NUMBER = Short.MAX_VALUE; private final List table = new ArrayList<>(); private final HashMap lookup = new HashMap<>(); @@ -126,7 +126,7 @@ public int readEntries(InputStream data) throws IOException { * @throws IOException */ public Void compile(BlockOutput out) throws IOException { - return out.measured("POS Table", (p) -> { + return out.measured("POS Table", p -> { BufferedChannel cbuf = new BufferedChannel(out.getChannel()); cbuf.byteBuffer(2).putShort((short) ownedLength()); for (int i = 0; i < ownedLength(); ++i) { diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/Progress.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/Progress.java index becc546a..ea567341 100644 --- a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/Progress.java +++ b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/Progress.java @@ -23,7 +23,7 @@ */ public class Progress { // minimum time delta for callback.progress call - private final static long MS_100 = 100_000_000L; // 100ms in nanos + private static final long MS_100 = 100_000_000L; // 100ms in nanos // resolution 
of progress step. private final int maxUpdates; private final Callback callback; @@ -154,13 +154,10 @@ public static class StderrProgress implements Callback { public void start(String name, Progress.Kind kind) { System.err.printf("%s\t", name); last = 0; - switch (kind) { - case BYTE: + if (kind == Kind.BYTE) { unit = "bytes"; - break; - case ENTRY: + } else if (kind == Kind.ENTRY) { unit = "entries"; - break; } } diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/RawLexicon.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/RawLexicon.java index 60475099..12244560 100644 --- a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/RawLexicon.java +++ b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/RawLexicon.java @@ -107,7 +107,6 @@ public void read(String name, Reader data, POSTable posTable, short numLeft, sho parser.setName(name); RawLexiconReader reader = new RawLexiconReader(parser, posTable, isUser); - long offset = this.offset; RawWordEntry entry; while ((entry = reader.nextEntry()) != null) { if (entry.leftId >= numLeft || entry.rightId >= numRight) { @@ -127,7 +126,6 @@ public void read(String name, Reader data, POSTable posTable, short numLeft, sho } this.runtimeCosts |= !DoubleArrayLexicon.isNormalCost(entry.cost); } - this.offset = offset; } /** @@ -153,11 +151,10 @@ public void checkOffset(long offset) { /** * Write lexicon to the provided block layout. * - * @param pos * @param layout * @throws IOException */ - public void compile(POSTable pos, BlockLayout layout) throws IOException { + public void compile(BlockLayout layout) throws IOException { index.compile(layout, notIndexed); // entry layout requires stringstorage to be compiled beforehand. 
layout.block(Blocks.STRINGS, this::writeStrings); @@ -165,7 +162,7 @@ public void compile(POSTable pos, BlockLayout layout) throws IOException { } private Void writeStrings(BlockOutput blockOutput) throws IOException { - return blockOutput.measured("Strings", (p) -> { + return blockOutput.measured("Strings", p -> { strings.compile(p); strings.writeCompact(blockOutput.getChannel()); return null; @@ -173,7 +170,7 @@ private Void writeStrings(BlockOutput blockOutput) throws IOException { } private Void writeEntries(BlockOutput blockOutput) throws IOException { - return blockOutput.measured("Word Entries", (p) -> { + return blockOutput.measured("Word Entries", p -> { List list = entries; Lookup2 lookup = isUser ? new Lookup2(preloadedEntries, list) : new Lookup2(list, new ArrayList<>()); BufferedChannel buf = new BufferedChannel(blockOutput.getChannel(), WordEntryLayout.MAX_LENGTH * 4); @@ -214,7 +211,8 @@ private int addPhantomEntries(RawWordEntry entry, List list, Looku copy.reading = copy.headword; copy.posId = entry.posId; RawWordEntry last = list.get(list.size() - 1); - copy.pointer = RawLexicon.pointer(WordInfoList.wordId2offset(last.pointer) + last.computeExpectedSize()); + copy.pointer = RawLexicon + .pointer((long) WordInfoList.wordId2offset(last.pointer) + last.computeExpectedSize()); list.add(copy); lookup.add(copy, isUser); nPhantomEntries += 1; diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/RawLexiconReader.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/RawLexiconReader.java index b43f99ce..d380408f 100644 --- a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/RawLexiconReader.java +++ b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/RawLexiconReader.java @@ -35,10 +35,10 @@ public class RawLexiconReader { * reordered with respect to the header. 
*/ public enum Column { - Surface(true), LeftId(true), RightId(true), Cost(true), Writing(false), Pos1(false), Pos2(false), Pos3( - false), Pos4(false), Pos5(false), Pos6(false), ReadingForm(true), NormalizedForm(true), DictionaryForm( - true), Mode(false), SplitA(true), SplitB(true), WordStructure( - true), SynonymGroups(false), SplitC(false), UserData(false), PosId(false); + SURFACE(true), LEFT_ID(true), RIGHT_ID(true), COST(true), WRITING(false), POS1(false), POS2(false), POS3( + false), POS4(false), POS5(false), POS6(false), READING_FORM(true), NORMALIZED_FORM( + true), DICTIONARY_FORM(true), MODE(false), SPLIT_A(true), SPLIT_B(true), WORD_STRUCTURE( + true), SYNONYM_GROUPS(false), SPLIT_C(false), USER_DATA(false), POS_ID(false); private final boolean required; @@ -47,7 +47,7 @@ public enum Column { } } - private List cachedRecord; + private List cachedRow; private int[] mapping; private final CSVParser parser; private final POSTable posTable; @@ -81,30 +81,34 @@ private boolean isLegacyColumnLayout() { /** resolve header line and set to mapping if it exists. 
*/ private void resolveColumnLayout() throws IOException { - List record = parser.getNextRecord(); + List row = parser.getNextRow(); - String leftId = record.get(Column.LeftId.ordinal()); + String leftId = row.get(Column.LEFT_ID.ordinal()); if (INTEGER_REGEX.matcher(leftId).matches()) { - this.cachedRecord = record; + this.cachedRow = row; return; } List remaining = new ArrayList<>(Arrays.asList(Column.values())); - int[] mapping = new int[remaining.size()]; + mapping = new int[remaining.size()]; Arrays.fill(mapping, -1); - outer: for (int fieldId = 0; fieldId < record.size(); ++fieldId) { - String field = record.get(fieldId).replaceAll("_", ""); + for (int fieldId = 0; fieldId < row.size(); ++fieldId) { + String field = row.get(fieldId).replace("_", ""); + boolean columnFound = false; for (int colId = 0; colId < remaining.size(); ++colId) { Column col = remaining.get(colId); - if (col.name().equalsIgnoreCase(field)) { + if (col.name().replace("_", "").equalsIgnoreCase(field)) { mapping[col.ordinal()] = fieldId; remaining.remove(colId); - continue outer; + columnFound = true; + break; } } - throw new CsvFieldException(parser.getName(), 0, field, - new IllegalArgumentException("Invalid column name")); + if (!columnFound) { + throw new CsvFieldException(parser.getName(), 0, field, + new IllegalArgumentException("Invalid column name")); + } } for (Column column : remaining) { @@ -115,9 +119,9 @@ private void resolveColumnLayout() throws IOException { } } - this.posIdExists = mapping[Column.PosId.ordinal()] >= 0; + this.posIdExists = mapping[Column.POS_ID.ordinal()] >= 0; long numPosColumnsFound = Arrays - .asList(Column.Pos1, Column.Pos2, Column.Pos3, Column.Pos4, Column.Pos5, Column.Pos6).stream() + .asList(Column.POS1, Column.POS2, Column.POS3, Column.POS4, Column.POS5, Column.POS6).stream() .filter(c -> mapping[c.ordinal()] >= 0).count(); if (numPosColumnsFound != 0 && numPosColumnsFound != POS.DEPTH) { throw new CsvFieldException(parser.getName(), 0, "POS", @@ 
-128,8 +132,6 @@ private void resolveColumnLayout() throws IOException { throw new CsvFieldException(parser.getName(), 0, "POS", new IllegalArgumentException("Both or either PosId column or Pos1~Pos6 columns are required.")); } - - this.mapping = mapping; } /** parse specified column as string */ @@ -140,7 +142,7 @@ private String get(List data, Column column, boolean unescape) { } if (index < 0 || index >= data.size()) { if (column.required) { - throw new CsvFieldException(parser.getName(), parser.getRow(), column.name(), + throw new CsvFieldException(parser.getName(), parser.getRowCount(), column.name(), new IllegalArgumentException(String.format("column [%s] was not present", column.name()))); } else { return ""; @@ -157,7 +159,7 @@ private String get(List data, Column column, boolean unescape) { private String getNonEmpty(List data, Column column, boolean unescape) { String value = get(data, column, unescape); if (value.isEmpty()) { - throw new CsvFieldException(parser.getName(), parser.getRow(), column.name(), + throw new CsvFieldException(parser.getName(), parser.getRowCount(), column.name(), new IllegalArgumentException(String.format("Column %s cannot be empty", column.name()))); } return value; @@ -169,7 +171,7 @@ private short getShort(List data, Column column) { try { return Short.parseShort(value); } catch (NumberFormatException e) { - throw new CsvFieldException(parser.getName(), parser.getRow(), column.name(), + throw new CsvFieldException(parser.getName(), parser.getRowCount(), column.name(), new IllegalArgumentException(String.format("failed to parse '%s' as a short value", value))); } } @@ -182,7 +184,7 @@ private Ints getInts(List data, Column column) { } String[] parts = value.split("/"); if (parts.length > Byte.MAX_VALUE) { - throw new CsvFieldException(parser.getName(), parser.getRow(), column.name(), + throw new CsvFieldException(parser.getName(), parser.getRowCount(), column.name(), new IllegalArgumentException("int list contained more than 
127 entries: " + value)); } Ints result = new Ints(parts.length); @@ -200,7 +202,7 @@ private List getWordRefs(List data, Column column, WordRef.Pars } String[] parts = value.split("/"); if (parts.length > Byte.MAX_VALUE) { - throw new CsvFieldException(parser.getName(), parser.getRow(), column.name(), + throw new CsvFieldException(parser.getName(), parser.getRowCount(), column.name(), new IllegalArgumentException("reference list contained more than 127 entries: " + value)); } List result = new ArrayList<>(parts.length); @@ -208,7 +210,7 @@ private List getWordRefs(List data, Column column, WordRef.Pars try { result.add(refParser.parse(part)); } catch (IllegalArgumentException e) { - throw new CsvFieldException(parser.getName(), parser.getRow(), column.name(), e); + throw new CsvFieldException(parser.getName(), parser.getRowCount(), column.name(), e); } } return result; @@ -221,7 +223,7 @@ private WordRef getWordRef(List data, Column column, WordRef.Parser refP try { ref = refParser.parse(value); } catch (IllegalArgumentException e) { - throw new CsvFieldException(parser.getName(), parser.getRow(), column.name(), e); + throw new CsvFieldException(parser.getName(), parser.getRowCount(), column.name(), e); } // if parsed ref seems to refering current entry, return self-reference (null), @@ -247,17 +249,17 @@ private short getPos(List data) { short posStrId = -1; if (this.posIdExists) { - posId = getShort(data, Column.PosId); + posId = getShort(data, Column.POS_ID); } if (this.posStrExists) { POS pos = new POS( // comment for line break - get(data, Column.Pos1, true), get(data, Column.Pos2, true), get(data, Column.Pos3, true), - get(data, Column.Pos4, true), get(data, Column.Pos5, true), get(data, Column.Pos6, true)); + get(data, Column.POS1, true), get(data, Column.POS2, true), get(data, Column.POS3, true), + get(data, Column.POS4, true), get(data, Column.POS5, true), get(data, Column.POS6, true)); posStrId = posTable.getId(pos); } if (this.posIdExists && 
this.posStrExists && posId != posStrId) { - throw new CsvFieldException(parser.getName(), parser.getRow(), "POS", new IllegalArgumentException( + throw new CsvFieldException(parser.getName(), parser.getRowCount(), "POS", new IllegalArgumentException( String.format("PosId (%d) and id from Pos1-6 (%d) does not match.", posId, posStrId))); } @@ -267,48 +269,48 @@ private short getPos(List data) { /** convert csv row to RawWordEntry */ private RawWordEntry convertEntry(List data) { RawWordEntry entry = new RawWordEntry(); - entry.headword = getNonEmpty(data, Column.Surface, true); + entry.headword = getNonEmpty(data, Column.SURFACE, true); - entry.leftId = getShort(data, Column.LeftId); - entry.rightId = getShort(data, Column.RightId); - entry.cost = getShort(data, Column.Cost); + entry.leftId = getShort(data, Column.LEFT_ID); + entry.rightId = getShort(data, Column.RIGHT_ID); + entry.cost = getShort(data, Column.COST); - entry.reading = get(data, Column.ReadingForm, true); + entry.reading = get(data, Column.READING_FORM, true); entry.posId = getPos(data); // headword, pos, reading must be parsed before these. 
- entry.normalizedForm = getWordRef(data, Column.NormalizedForm, normRefParser, entry); - entry.dictionaryForm = getWordRef(data, Column.DictionaryForm, dictRefParser, entry); + entry.normalizedForm = getWordRef(data, Column.NORMALIZED_FORM, normRefParser, entry); + entry.dictionaryForm = getWordRef(data, Column.DICTIONARY_FORM, dictRefParser, entry); - entry.mode = get(data, Column.Mode, false); - entry.aUnitSplit = getWordRefs(data, Column.SplitA, splitParser); - entry.bUnitSplit = getWordRefs(data, Column.SplitB, splitParser); - entry.cUnitSplit = getWordRefs(data, Column.SplitC, splitParser); - entry.wordStructure = getWordRefs(data, Column.WordStructure, splitParser); - entry.synonymGroups = getInts(data, Column.SynonymGroups); - entry.userData = get(data, Column.UserData, true); + entry.mode = get(data, Column.MODE, false); + entry.aUnitSplit = getWordRefs(data, Column.SPLIT_A, splitParser); + entry.bUnitSplit = getWordRefs(data, Column.SPLIT_B, splitParser); + entry.cUnitSplit = getWordRefs(data, Column.SPLIT_C, splitParser); + entry.wordStructure = getWordRefs(data, Column.WORD_STRUCTURE, splitParser); + entry.synonymGroups = getInts(data, Column.SYNONYM_GROUPS); + entry.userData = get(data, Column.USER_DATA, true); try { entry.validate(); } catch (IllegalArgumentException e) { - throw new CsvFieldException(parser.getName(), parser.getRow(), "", e); + throw new CsvFieldException(parser.getName(), parser.getRowCount(), "", e); } return entry; } /** @return next entry parsed */ public RawWordEntry nextEntry() throws IOException { - List record = cachedRecord; - if (record != null) { - cachedRecord = null; + List row = cachedRow; + if (row != null) { + cachedRow = null; } else { - record = parser.getNextRecord(); + row = parser.getNextRow(); } - if (record == null) { + if (row == null) { return null; } - RawWordEntry entry = convertEntry(record); - entry.sourceLine = parser.getRow(); + RawWordEntry entry = convertEntry(row); + entry.sourceLine = 
parser.getRowCount(); entry.sourceName = parser.getName(); return entry; } diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/StringStorage.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/StringStorage.java index 4d55187f..00a37064 100644 --- a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/StringStorage.java +++ b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/StringStorage.java @@ -109,10 +109,8 @@ private int computeOffsets(String str, int[] offsets) { int len = str.length(); for (int i = 0; i < len; ++i) { char ch = str.charAt(i); - if (Character.isLowSurrogate(ch)) { - if (i + 1 < len && Character.isHighSurrogate(str.charAt(i + 1))) { - i += 1; - } + if (Character.isLowSurrogate(ch) && i + 1 < len && Character.isHighSurrogate(str.charAt(i + 1))) { + i += 1; } offsets[count] = i; count += 1; @@ -128,7 +126,7 @@ public StringPtr resolve(String data) { } /** @return string hash map */ - public HashMap getStrings() { + public Map getStrings() { return strings; } diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/Unescape.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/Unescape.java index 1f157ac9..daf45e74 100644 --- a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/Unescape.java +++ b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/Unescape.java @@ -25,6 +25,9 @@ public class Unescape { private static final Pattern unicodeLiteral = Pattern.compile("\\\\u(?:[0-9a-fA-F]{4}|\\{[0-9a-fA-F]+})"); + private Unescape() { + } + /** * Resolve unicode escape sequences in the string *

diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/UnicodeBufferResizeable.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/UnicodeBufferResizeable.java index cac2b388..6b8d68f5 100644 --- a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/UnicodeBufferResizeable.java +++ b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/UnicodeBufferResizeable.java @@ -18,9 +18,7 @@ import java.io.IOException; import java.nio.ByteBuffer; -import java.nio.ByteOrder; import java.nio.CharBuffer; -import java.nio.channels.SeekableByteChannel; import java.nio.channels.WritableByteChannel; /** Resizable byte buffer to store string */ diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/WordLayout.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/WordLayout.java index afcf312b..5d87fcdb 100644 --- a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/WordLayout.java +++ b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/WordLayout.java @@ -22,6 +22,7 @@ import java.nio.channels.WritableByteChannel; import java.util.ArrayList; import java.util.Collections; +import java.util.Objects; import java.util.StringJoiner; /** @@ -136,25 +137,28 @@ private int allocate(int length, int alignment) { if (fs.length < length) { continue; } + int end = fs.start + fs.length; int start = allocateInBlock(length, alignment, fs.start, end); - if (start != -1) { - int remaining = end - start - length; - if (remaining > 0) { - fs.start = start + length; - fs.length = remaining; - freeDirty = true; - // we need to recompute maxLength only if modifying the last (maximum) element - // in free lists - if (i == numFree - 1) { - maxLength = computeNewMaxLength(i); - } - } else { - free.remove(i); - maxLength = computeNewMaxLength(numFree - 2); + if (start == -1) { + continue; + } + + int remaining = end - start - length; + if (remaining > 0) { + fs.start = start + length; + fs.length = remaining; + freeDirty = true; + // we need to 
recompute maxLength only if modifying the last (maximum) element + // in free lists + if (i == numFree - 1) { + maxLength = computeNewMaxLength(i); } - return start; + } else { + free.remove(i); + maxLength = computeNewMaxLength(numFree - 2); } + return start; } maxLength = Math.max(0, maxLength - 1); } @@ -238,6 +242,23 @@ public int compareTo(FreeSpace o) { return Integer.compare(start, o.start); } + @Override + public boolean equals(Object obj) { + if (this == obj) { + return true; + } + if (obj == null || !(obj instanceof FreeSpace)) { + return false; + } + FreeSpace other = (FreeSpace) obj; + return this.start == other.start && this.length == other.length; + } + + @Override + public int hashCode() { + return Objects.hash(start, length); + } + @Override public String toString() { return new StringJoiner(", ", "FreeSpace[", "]").add("start=" + start).add("length=" + length).toString(); diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/WordRef.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/WordRef.java index 045cb882..777ec0a2 100644 --- a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/WordRef.java +++ b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/WordRef.java @@ -22,6 +22,7 @@ import java.util.Arrays; import java.util.List; +import java.util.Objects; import java.util.regex.Pattern; /** @@ -81,6 +82,12 @@ public boolean equals(Object other) { LineNo o = (LineNo) other; return (line == o.line) && (isUser == o.isUser); } + + @Override + public int hashCode() { + return Objects.hash(line, isUser); + } + } /** @@ -117,6 +124,11 @@ public boolean equals(Object other) { Headword o = (Headword) other; return headword.equals(o.headword); } + + @Override + public int hashCode() { + return Objects.hash(headword); + } } /** @@ -173,6 +185,11 @@ public boolean equals(Object other) { Triple o = (Triple) other; return (headword.equals(o.headword)) && (posId == o.posId) && (reading.equals(o.reading)); } + + @Override + public 
int hashCode() { + return Objects.hash(headword, posId, reading); + } } private static final Pattern NUMERIC_RE = Pattern.compile("^U?\\d+$"); diff --git a/src/test/java/com/worksap/nlp/sudachi/dictionary/DictionaryPrinterTest.kt b/src/test/java/com/worksap/nlp/sudachi/dictionary/DictionaryPrinterTest.kt index 0f43d753..d07d3d61 100644 --- a/src/test/java/com/worksap/nlp/sudachi/dictionary/DictionaryPrinterTest.kt +++ b/src/test/java/com/worksap/nlp/sudachi/dictionary/DictionaryPrinterTest.kt @@ -54,7 +54,7 @@ class DictionaryPrinterTest { assertEquals(41, lines.size) // header + entries + trailing new line assertEquals( - "Surface,LeftId,RightId,Cost,Pos1,Pos2,Pos3,Pos4,Pos5,Pos6,ReadingForm,NormalizedForm,DictionaryForm,SplitA,SplitB,SplitC,WordStructure,SynonymGroups,UserData", + "SURFACE,LEFT_ID,RIGHT_ID,COST,POS1,POS2,POS3,POS4,POS5,POS6,READING_FORM,NORMALIZED_FORM,DICTIONARY_FORM,SPLIT_A,SPLIT_B,SPLIT_C,WORD_STRUCTURE,SYNONYM_GROUPS,USER_DATA", lines[0]) assertEquals("た,1,1,8729,助動詞,*,*,*,助動詞-タ,終止形-一般,タ,,,,,,,,", lines[1]) assertEquals("に,2,2,11406,助詞,接続助詞,*,*,*,*,ニ,,,,,,,,", lines[2]) @@ -70,7 +70,7 @@ class DictionaryPrinterTest { assertEquals(6, lines.size) // header + entries + trailing new line assertEquals( - "Surface,LeftId,RightId,Cost,Pos1,Pos2,Pos3,Pos4,Pos5,Pos6,ReadingForm,NormalizedForm,DictionaryForm,SplitA,SplitB,SplitC,WordStructure,SynonymGroups,UserData", + "SURFACE,LEFT_ID,RIGHT_ID,COST,POS1,POS2,POS3,POS4,POS5,POS6,READING_FORM,NORMALIZED_FORM,DICTIONARY_FORM,SPLIT_A,SPLIT_B,SPLIT_C,WORD_STRUCTURE,SYNONYM_GROUPS,USER_DATA", lines[0]) assertEquals( "東京府,6,6,2816,名詞,固有名詞,地名,一般,*,*,トウキョウフ,,,\"東京,名詞,固有名詞,地名,一般,*,*,トウキョウ/府,名詞,普通名詞,一般,*,*,*,フ\",,,\"東京,名詞,固有名詞,地名,一般,*,*,トウキョウ/府,名詞,普通名詞,一般,*,*,*,フ\",1/3,", diff --git a/src/test/java/com/worksap/nlp/sudachi/dictionary/build/CSVParserTest.java b/src/test/java/com/worksap/nlp/sudachi/dictionary/build/CSVParserTest.java index 84f0e647..d13cc679 100644 --- 
a/src/test/java/com/worksap/nlp/sudachi/dictionary/build/CSVParserTest.java +++ b/src/test/java/com/worksap/nlp/sudachi/dictionary/build/CSVParserTest.java @@ -31,91 +31,91 @@ public class CSVParserTest { @Test public void empty() throws IOException { try (CSVParser parser = new CSVParser(new StringReader(""))) { - assertNull(parser.getNextRecord()); + assertNull(parser.getNextRow()); } try (CSVParser parser = new CSVParser(new StringReader("\n"))) { - assertTrue(parser.getNextRecord().isEmpty()); - assertNull(parser.getNextRecord()); + assertTrue(parser.getNextRow().isEmpty()); + assertNull(parser.getNextRow()); } try (CSVParser parser = new CSVParser(new StringReader("\n\n"))) { - assertTrue(parser.getNextRecord().isEmpty()); - assertTrue(parser.getNextRecord().isEmpty()); - assertNull(parser.getNextRecord()); + assertTrue(parser.getNextRow().isEmpty()); + assertTrue(parser.getNextRow().isEmpty()); + assertNull(parser.getNextRow()); } } @Test public void unescapedField() throws IOException { try (CSVParser parser = new CSVParser(new StringReader("abc,def,ghi\nabc,def,ghi"))) { - assertThat(parser.getNextRecord(), contains("abc", "def", "ghi")); - assertThat(parser.getNextRecord(), contains("abc", "def", "ghi")); - assertNull(parser.getNextRecord()); + assertThat(parser.getNextRow(), contains("abc", "def", "ghi")); + assertThat(parser.getNextRow(), contains("abc", "def", "ghi")); + assertNull(parser.getNextRow()); } try (CSVParser parser = new CSVParser(new StringReader("abc,def,"))) { - assertThat(parser.getNextRecord(), contains("abc", "def", "")); - assertNull(parser.getNextRecord()); + assertThat(parser.getNextRow(), contains("abc", "def", "")); + assertNull(parser.getNextRow()); } try (CSVParser parser = new CSVParser(new StringReader("abc,def,\n"))) { - assertThat(parser.getNextRecord(), contains("abc", "def", "")); - assertNull(parser.getNextRecord()); + assertThat(parser.getNextRow(), contains("abc", "def", "")); + assertNull(parser.getNextRow()); } try 
(CSVParser parser = new CSVParser(new StringReader(",,ghi"))) { - assertThat(parser.getNextRecord(), contains("", "", "ghi")); - assertNull(parser.getNextRecord()); + assertThat(parser.getNextRow(), contains("", "", "ghi")); + assertNull(parser.getNextRow()); } } @Test public void escapedField() throws IOException { try (CSVParser parser = new CSVParser(new StringReader("abc,\"def\",ghi\nabc,def,ghi"))) { - assertThat(parser.getNextRecord(), contains("abc", "def", "ghi")); - assertThat(parser.getNextRecord(), contains("abc", "def", "ghi")); - assertNull(parser.getNextRecord()); + assertThat(parser.getNextRow(), contains("abc", "def", "ghi")); + assertThat(parser.getNextRow(), contains("abc", "def", "ghi")); + assertNull(parser.getNextRow()); } try (CSVParser parser = new CSVParser(new StringReader("abc,def,\"ghi\nabc\",def,ghi"))) { - assertThat(parser.getNextRecord(), contains("abc", "def", "ghi\nabc", "def", "ghi")); - assertNull(parser.getNextRecord()); + assertThat(parser.getNextRow(), contains("abc", "def", "ghi\nabc", "def", "ghi")); + assertNull(parser.getNextRow()); } try (CSVParser parser = new CSVParser(new StringReader("abc,\"def,ghi\""))) { - assertThat(parser.getNextRecord(), contains("abc", "def,ghi")); - assertNull(parser.getNextRecord()); + assertThat(parser.getNextRow(), contains("abc", "def,ghi")); + assertNull(parser.getNextRow()); } try (CSVParser parser = new CSVParser(new StringReader("abc,\"def\"\"ghi\""))) { - assertThat(parser.getNextRecord(), contains("abc", "def\"ghi")); - assertNull(parser.getNextRecord()); + assertThat(parser.getNextRow(), contains("abc", "def\"ghi")); + assertNull(parser.getNextRow()); } try (CSVParser parser = new CSVParser(new StringReader("abc,def,\"\""))) { - assertThat(parser.getNextRecord(), contains("abc", "def", "")); - assertNull(parser.getNextRecord()); + assertThat(parser.getNextRow(), contains("abc", "def", "")); + assertNull(parser.getNextRow()); } try (CSVParser parser = new CSVParser(new 
StringReader("abc,def,\"\"\n"))) { - assertThat(parser.getNextRecord(), contains("abc", "def", "")); - assertNull(parser.getNextRecord()); + assertThat(parser.getNextRow(), contains("abc", "def", "")); + assertNull(parser.getNextRow()); } try (CSVParser parser = new CSVParser(new StringReader("\"\",\"\",ghi"))) { - assertThat(parser.getNextRecord(), contains("", "", "ghi")); - assertNull(parser.getNextRecord()); + assertThat(parser.getNextRow(), contains("", "", "ghi")); + assertNull(parser.getNextRow()); } } @Test(expected = IllegalArgumentException.class) public void escapedFieldWithExtraText() throws IOException { try (CSVParser parser = new CSVParser(new StringReader("\"abc\"def"))) { - parser.getNextRecord(); + parser.getNextRow(); } } @Test(expected = IllegalArgumentException.class) public void unClosedEscapedField() throws IOException { try (CSVParser parser = new CSVParser(new StringReader("\"abc"))) { - parser.getNextRecord(); + parser.getNextRow(); } } @Test(expected = IllegalArgumentException.class) public void unscapedFieldWithDoubleQuote() throws IOException { try (CSVParser parser = new CSVParser(new StringReader("a\"bc"))) { - parser.getNextRecord(); + parser.getNextRow(); } } } \ No newline at end of file From 763ffa7c4c52e34fa4b954dfb4138be23126fe6d Mon Sep 17 00:00:00 2001 From: mh-northlander Date: Mon, 5 Aug 2024 09:56:22 +0900 Subject: [PATCH 59/94] allow pos file only once and for system, fix column existance check --- .../sudachi/dictionary/build/DicBuilder.java | 65 +++++++++---------- .../sudachi/dictionary/build/POSTable.java | 6 ++ .../dictionary/build/RawLexiconReader.java | 23 ++++--- 3 files changed, 53 insertions(+), 41 deletions(-) diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/DicBuilder.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/DicBuilder.java index 72aa403b..ea3a057b 100644 --- a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/DicBuilder.java +++ 
b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/DicBuilder.java @@ -72,39 +72,6 @@ public T progress(Progress progress) { return self(); } - /** - * Read POS list from the csv file. - */ - public T posTable(String name, IOSupplier input, long size) throws IOException { - progress.startBlock(name, nanoTime(), Progress.Kind.ENTRY); - int nRead; - try (InputStream is = input.get()) { - InputStream stream = new ProgressInputStream(is, size, progress); - nRead = pos.readEntries(stream); - } - progress.endBlock(nRead, nanoTime()); - return self(); - } - - /** - * Read POS list from the csv file. - */ - public T posTable(URL url) throws IOException { - String name = url.getPath(); - URLConnection conn = url.openConnection(); - long size = conn.getContentLengthLong(); - return posTable(name, conn::getInputStream, size); - } - - /** - * Read POS list from the csv file. - */ - public T posTable(Path path) throws IOException { - String name = path.getFileName().toString(); - long size = Files.size(path); - return posTable(name, () -> Files.newInputStream(path), size); - } - /** * Import words from the csv lexicon into the binary dictionary compiler. * @@ -252,6 +219,38 @@ public System signature(String signature) { description.setSignature(signature); return this; } + + /** Read POS list from the csv file. */ + public System posTable(String name, IOSupplier input, long size) throws IOException { + if (!pos.allowNewPos) { + throw new IllegalArgumentException("POS list already loaded (only single POS file is allowed)."); + } + pos.allowNewPos = false; + + progress.startBlock(name, nanoTime(), Progress.Kind.ENTRY); + int nRead; + try (InputStream is = input.get()) { + InputStream stream = new ProgressInputStream(is, size, progress); + nRead = pos.readEntries(stream); + } + progress.endBlock(nRead, nanoTime()); + return this; + } + + /** Read POS list from the csv file. 
*/ + public System posTable(URL url) throws IOException { + String name = url.getPath(); + URLConnection conn = url.openConnection(); + long size = conn.getContentLengthLong(); + return posTable(name, conn::getInputStream, size); + } + + /** Read POS list from the csv file. */ + public System posTable(Path path) throws IOException { + String name = path.getFileName().toString(); + long size = Files.size(path); + return posTable(name, () -> Files.newInputStream(path), size); + } } /** diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/POSTable.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/POSTable.java index 17cfb496..834f21b1 100644 --- a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/POSTable.java +++ b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/POSTable.java @@ -33,6 +33,7 @@ public class POSTable { private final List table = new ArrayList<>(); private final HashMap lookup = new HashMap<>(); + public boolean allowNewPos = true; // number of pos loaded from the system dictionary. 
private int builtin = 0; @@ -44,6 +45,11 @@ public class POSTable { */ short getId(POS s) { return lookup.computeIfAbsent(s, p -> { + if (!allowNewPos) { + throw new IllegalArgumentException( + String.format("POS %s is not present in the table and new POS is not allowed", s)); + } + int next = table.size(); if (next >= MAX_POS_NUMBER) { throw new IllegalArgumentException("maximum POS number exceeded by " + s); diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/RawLexiconReader.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/RawLexiconReader.java index d380408f..fd12697f 100644 --- a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/RawLexiconReader.java +++ b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/RawLexiconReader.java @@ -54,8 +54,6 @@ public enum Column { private final WordRef.Parser normRefParser; // for normalized form private final WordRef.Parser dictRefParser; // for dictionary form private final WordRef.Parser splitParser; // for splits - private boolean posIdExists = false; - private boolean posStrExists = true; public RawLexiconReader(CSVParser parser, POSTable pos, boolean user) throws IOException { this.parser = parser; @@ -86,6 +84,7 @@ private void resolveColumnLayout() throws IOException { String leftId = row.get(Column.LEFT_ID.ordinal()); if (INTEGER_REGEX.matcher(leftId).matches()) { this.cachedRow = row; + this.mapping = null; return; } @@ -119,7 +118,7 @@ private void resolveColumnLayout() throws IOException { } } - this.posIdExists = mapping[Column.POS_ID.ordinal()] >= 0; + boolean posIdExists = mapping[Column.POS_ID.ordinal()] >= 0; long numPosColumnsFound = Arrays .asList(Column.POS1, Column.POS2, Column.POS3, Column.POS4, Column.POS5, Column.POS6).stream() .filter(c -> mapping[c.ordinal()] >= 0).count(); @@ -127,7 +126,7 @@ private void resolveColumnLayout() throws IOException { throw new CsvFieldException(parser.getName(), 0, "POS", new IllegalArgumentException("Pos1 ~ Pos6 columns must 
appear as a set.")); } - this.posStrExists = numPosColumnsFound == POS.DEPTH; + boolean posStrExists = numPosColumnsFound == POS.DEPTH; if (!posIdExists && !posStrExists) { throw new CsvFieldException(parser.getName(), 0, "POS", new IllegalArgumentException("Both or either PosId column or Pos1~Pos6 columns are required.")); @@ -245,25 +244,33 @@ private WordRef getWordRef(List data, Column column, WordRef.Parser refP /** parse POS columns. */ private short getPos(List data) { + boolean idColumnExists = false; + boolean strColumnExists = true; + if (!isLegacyColumnLayout()) { + idColumnExists = mapping[Column.POS_ID.ordinal()] >= 0; + // existance of POS1-6 is checked in column layout resolution + strColumnExists = mapping[Column.POS1.ordinal()] >= 0; + } + short posId = -1; short posStrId = -1; - if (this.posIdExists) { + if (idColumnExists) { posId = getShort(data, Column.POS_ID); } - if (this.posStrExists) { + if (strColumnExists) { POS pos = new POS( // comment for line break get(data, Column.POS1, true), get(data, Column.POS2, true), get(data, Column.POS3, true), get(data, Column.POS4, true), get(data, Column.POS5, true), get(data, Column.POS6, true)); posStrId = posTable.getId(pos); } - if (this.posIdExists && this.posStrExists && posId != posStrId) { + if (idColumnExists && strColumnExists && posId != posStrId) { throw new CsvFieldException(parser.getName(), parser.getRowCount(), "POS", new IllegalArgumentException( String.format("PosId (%d) and id from Pos1-6 (%d) does not match.", posId, posStrId))); } - return this.posIdExists ? posId : posStrId; + return idColumnExists ? 
posId : posStrId; } /** convert csv row to RawWordEntry */ From d3033621c425d19eb6e1ff420c14bbaca7005bea Mon Sep 17 00:00:00 2001 From: mh-northlander Date: Mon, 5 Aug 2024 10:59:22 +0900 Subject: [PATCH 60/94] default signature uses comment hash instead of random --- .../nlp/sudachi/dictionary/Description.java | 16 +++++++++++----- .../sudachi/dictionary/DictionaryBuilder.java | 4 +--- .../nlp/sudachi/dictionary/build/DicBuilder.java | 10 +++++++--- 3 files changed, 19 insertions(+), 11 deletions(-) diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/Description.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/Description.java index d10494f3..c7603734 100644 --- a/src/main/java/com/worksap/nlp/sudachi/dictionary/Description.java +++ b/src/main/java/com/worksap/nlp/sudachi/dictionary/Description.java @@ -30,7 +30,6 @@ import java.util.ArrayList; import java.util.List; import java.util.Locale; -import java.util.Random; /** * Description of the dictionary blocks, in-memory representation. 
Basically, an @@ -39,13 +38,12 @@ public class Description { private Instant creationTime = Instant.now(); private String comment = ""; - private String signature = defaultSignature(creationTime); + private String signature = defaultSignature(creationTime, comment); private String reference = ""; private List blocks = new ArrayList<>(); private long flags; private int numTotalEntries; private int numIndexedEntries; - private Random random = new Random(); /** * Return a slice of the full dictionary with the provided name @@ -220,10 +218,10 @@ private static void checkLegacyDictionaryFormat(ByteBuffer raw) { } } - private String defaultSignature(Instant date) { + private String defaultSignature(Instant date, String comment) { DateTimeFormatter formatter = DateTimeFormatter.ofPattern("yyyyMMddHHmmss", Locale.US); return String.format("%s-%08x", formatter.format(LocalDateTime.ofInstant(date, ZoneId.systemDefault())), - random.nextLong()); + comment.hashCode()); } public Instant getCreationTime() { @@ -258,6 +256,14 @@ public void setSignature(String signature) { this.signature = signature; } + /** + * Overwrite signature by the default value with the current creationTime and + * comment. 
+ */ + public void setDefaultSignature() { + this.signature = defaultSignature(creationTime, comment); + } + public String getReference() { return reference; } diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/DictionaryBuilder.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/DictionaryBuilder.java index f2b3a066..4e5f6014 100644 --- a/src/main/java/com/worksap/nlp/sudachi/dictionary/DictionaryBuilder.java +++ b/src/main/java/com/worksap/nlp/sudachi/dictionary/DictionaryBuilder.java @@ -53,12 +53,10 @@ private static void build(Path matrixPath, String description, String posPath, S if (posPath != null) { builder = builder.posTable(Paths.get(posPath)); } - if (signature != null) { - builder = builder.signature(signature); - } for (String lexiconPath : lexiconPaths) { builder = builder.lexicon(Paths.get(lexiconPath)); } + builder = builder.signature(signature); try (SeekableByteChannel ch = Files.newByteChannel(outputPath, StandardOpenOption.WRITE, StandardOpenOption.CREATE, StandardOpenOption.TRUNCATE_EXISTING)) { diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/DicBuilder.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/DicBuilder.java index ea3a057b..7ba87013 100644 --- a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/DicBuilder.java +++ b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/DicBuilder.java @@ -202,8 +202,10 @@ private System readMatrix(String name, IOSupplier input, long size) } /** - * Set the system dictionary signature to the provided string. By default, it is - * current timestamp and a random 8 hexadecimal characters. + * Set the system dictionary signature to the provided string. + * + * If null is provided, set the default value that consists of current timestamp + * and a 8 hexadecimal hashcode calculated from the comment. * * @param signature * provided dictionary signature. Can not be empty. 
@@ -211,8 +213,10 @@ private System readMatrix(String name, IOSupplier input, long size) */ public System signature(String signature) { if (signature == null) { - throw new IllegalArgumentException("signature can not be null"); + description.setDefaultSignature(); + return this; } + if (signature.isEmpty()) { throw new IllegalArgumentException("signature can not be empty"); } From c217631f5f51795a301d3ba4bc429674bfc594db Mon Sep 17 00:00:00 2001 From: mh-northlander Date: Mon, 5 Aug 2024 15:09:11 +0900 Subject: [PATCH 61/94] reduce memory usage during StringStorage compile --- .../dictionary/build/RawWordEntry.java | 3 ++ .../dictionary/build/StringStorage.java | 7 ++-- .../dictionary/build/RawLexiconReaderTest.kt | 32 +++++++++---------- 3 files changed, 23 insertions(+), 19 deletions(-) diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/RawWordEntry.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/RawWordEntry.java index 5691aecf..2d98efc7 100644 --- a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/RawWordEntry.java +++ b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/RawWordEntry.java @@ -106,6 +106,9 @@ private void checkString(String value, String name) { public void validate() { checkString(headword, "headword"); checkString(reading, "reading"); + if (normalizedForm instanceof WordRef.Headword) { + checkString(((WordRef.Headword) normalizedForm).getHeadword(), "normalized form"); + } } /** diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/StringStorage.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/StringStorage.java index 00a37064..589aa08b 100644 --- a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/StringStorage.java +++ b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/StringStorage.java @@ -88,10 +88,11 @@ private Item process(String str) { int end = offsets[j]; String sub = str.substring(start, end); // Create a possible substring only if - // 1. 
It does not exist yet - // 2. Can form a valid pointer to it (string pointer requires aligned offset + // 1. It will be used later + // 2. It does not exist yet + // 3. Can form a valid pointer to it (string pointer requires aligned offset // based on str length) - if (!candidates.containsKey(sub) && ptr.isSubseqValid(start, end)) { + if (strings.containsKey(sub) && !candidates.containsKey(sub) && ptr.isSubseqValid(start, end)) { Item item = new Item(str, start, end); item.root = full; candidates.put(sub, item); diff --git a/src/test/java/com/worksap/nlp/sudachi/dictionary/build/RawLexiconReaderTest.kt b/src/test/java/com/worksap/nlp/sudachi/dictionary/build/RawLexiconReaderTest.kt index cee382cf..df4758a0 100644 --- a/src/test/java/com/worksap/nlp/sudachi/dictionary/build/RawLexiconReaderTest.kt +++ b/src/test/java/com/worksap/nlp/sudachi/dictionary/build/RawLexiconReaderTest.kt @@ -19,6 +19,7 @@ package com.worksap.nlp.sudachi.dictionary.build import com.worksap.nlp.sudachi.dictionary.StringPtr import com.worksap.nlp.sudachi.resStream import java.io.StringReader +import kotlin.test.Ignore import kotlin.test.Test import kotlin.test.assertEquals import kotlin.test.assertFails @@ -95,30 +96,28 @@ class RawLexiconReaderTest { skipVals.removeAt(i) val text = skipCols.joinToString(",") + "\n" + skipVals.joinToString(",") - assertFails { - val reader = RawLexiconReader(csvtext(text), POSTable(), false) - } + assertFails { RawLexiconReader(csvtext(text), POSTable(), false) } } } @Test fun failTooLongValue() { - val oversizeWord = "a".repeat(StringPtr.MAX_LENGTH + 1); - { + val oversizeWord = "a".repeat(StringPtr.MAX_LENGTH + 1) + run { val text = """Surface,LeftId,RightId,Cost,pos1,pos2,pos3,pos4,pos5,pos6,reading_form,normalized_form,DictionaryForm,splita,splitb,splitC,wordstructure ${oversizeWord},6,8,5320,名詞,固有名詞,地名,一般,*,*,トウキョウト,,,1,,,""" val reader = RawLexiconReader(csvtext(text), POSTable(), false) assertFails { reader.nextEntry() } } - { + run { val text = 
"""Surface,LeftId,RightId,Cost,pos1,pos2,pos3,pos4,pos5,pos6,reading_form,normalized_form,DictionaryForm,splita,splitb,splitC,wordstructure 東京都,6,8,5320,名詞,固有名詞,地名,一般,*,*,${oversizeWord},,,1,,,""" val reader = RawLexiconReader(csvtext(text), POSTable(), false) assertFails { reader.nextEntry() } } - { + run { val text = """Surface,LeftId,RightId,Cost,pos1,pos2,pos3,pos4,pos5,pos6,reading_form,normalized_form,DictionaryForm,splita,splitb,splitC,wordstructure 東京都,6,8,5320,名詞,固有名詞,地名,一般,*,*,トウキョウト,${oversizeWord},,1,,,""" @@ -137,29 +136,30 @@ ${oversizeWord},6,8,5320,名詞,固有名詞,地名,一般,*,*,トウキョウ } @Test + @Ignore // Currently single split list is allowed. fun failSingleSplit() { - { + run { val text = """Surface,LeftId,RightId,Cost,pos1,pos2,pos3,pos4,pos5,pos6,reading_form,normalized_form,DictionaryForm,splita,splitb,splitC,wordstructure 東京都,6,8,5320,名詞,固有名詞,地名,一般,*,*,トウキョウト,,,1,,,""" val reader = RawLexiconReader(csvtext(text), POSTable(), false) assertFails { reader.nextEntry() } } - { + run { val text = """Surface,LeftId,RightId,Cost,pos1,pos2,pos3,pos4,pos5,pos6,reading_form,normalized_form,DictionaryForm,splita,splitb,splitC,wordstructure 東京都,6,8,5320,名詞,固有名詞,地名,一般,*,*,トウキョウト,,,,1,,""" val reader = RawLexiconReader(csvtext(text), POSTable(), false) assertFails { reader.nextEntry() } } - { + run { val text = """Surface,LeftId,RightId,Cost,pos1,pos2,pos3,pos4,pos5,pos6,reading_form,normalized_form,DictionaryForm,splita,splitb,splitC,wordstructure 東京都,6,8,5320,名詞,固有名詞,地名,一般,*,*,トウキョウト,,,,,1,""" val reader = RawLexiconReader(csvtext(text), POSTable(), false) assertFails { reader.nextEntry() } } - { + run { val text = """Surface,LeftId,RightId,Cost,pos1,pos2,pos3,pos4,pos5,pos6,reading_form,normalized_form,DictionaryForm,splita,splitb,splitC,wordstructure 東京都,6,8,5320,名詞,固有名詞,地名,一般,*,*,トウキョウト,,,,,,1""" @@ -171,30 +171,30 @@ ${oversizeWord},6,8,5320,名詞,固有名詞,地名,一般,*,*,トウキョウ @Test fun failTooManySplit() { val oversizeSplit: String = - generateSequence { "1" 
}.take(Byte.MAX_VALUE.toInt() + 1).joinToString("/"); + generateSequence { "1" }.take(Byte.MAX_VALUE.toInt() + 1).joinToString("/") - { + run { var text = """Surface,LeftId,RightId,Cost,pos1,pos2,pos3,pos4,pos5,pos6,reading_form,normalized_form,DictionaryForm,splita,splitb,splitC,wordstructure 東京都,6,8,5320,名詞,固有名詞,地名,一般,*,*,トウキョウト,,,${oversizeSplit},,,""" var reader = RawLexiconReader(csvtext(text), POSTable(), false) assertFails { reader.nextEntry() } } - { + run { val text = """Surface,LeftId,RightId,Cost,pos1,pos2,pos3,pos4,pos5,pos6,reading_form,normalized_form,DictionaryForm,splita,splitb,splitC,wordstructure 東京都,6,8,5320,名詞,固有名詞,地名,一般,*,*,トウキョウト,,,,${oversizeSplit},,""" val reader = RawLexiconReader(csvtext(text), POSTable(), false) assertFails { reader.nextEntry() } } - { + run { val text = """Surface,LeftId,RightId,Cost,pos1,pos2,pos3,pos4,pos5,pos6,reading_form,normalized_form,DictionaryForm,splita,splitb,splitC,wordstructure 東京都,6,8,5320,名詞,固有名詞,地名,一般,*,*,トウキョウト,,,,,${oversizeSplit},""" val reader = RawLexiconReader(csvtext(text), POSTable(), false) assertFails { reader.nextEntry() } } - { + run { val text = """Surface,LeftId,RightId,Cost,pos1,pos2,pos3,pos4,pos5,pos6,reading_form,normalized_form,DictionaryForm,splita,splitb,splitC,wordstructure 東京都,6,8,5320,名詞,固有名詞,地名,一般,*,*,トウキョウト,,,,,,${oversizeSplit}""" From 37b2d87f954f882fa4560b5a38fdb5bf72459a20 Mon Sep 17 00:00:00 2001 From: mh-northlander Date: Mon, 5 Aug 2024 15:47:10 +0900 Subject: [PATCH 62/94] rm duplicate test resources --- src/test/dict/lex.csv | 39 --------------- src/test/dict/matrix.def | 101 --------------------------------------- src/test/dict/user.csv | 4 -- src/test/dict/user2.csv | 2 - 4 files changed, 146 deletions(-) delete mode 100644 src/test/dict/lex.csv delete mode 100644 src/test/dict/matrix.def delete mode 100644 src/test/dict/user.csv delete mode 100644 src/test/dict/user2.csv diff --git a/src/test/dict/lex.csv b/src/test/dict/lex.csv deleted file mode 100644 index 
6b8fb50d..00000000 --- a/src/test/dict/lex.csv +++ /dev/null @@ -1,39 +0,0 @@ -た,1,1,8729,た,助動詞,*,*,*,助動詞-タ,終止形-一般,タ,た,*,A,*,*,*,* -に,2,2,11406,に,助詞,接続助詞,*,*,*,*,ニ,に,*,A,*,*,*,* -に,3,3,4481,に,助詞,格助詞,*,*,*,*,ニ,に,*,A,*,*,*,* -京都,6,6,5293,京都,名詞,固有名詞,地名,一般,*,*,キョウト,京都,*,A,*,*,*,1/5 -東,7,7,4675,東,名詞,普通名詞,一般,*,*,*,ヒガシ,東,*,A,*,*,*,* -東京,6,6,2816,東京,名詞,固有名詞,地名,一般,*,*,トウキョウ,東京,*,A,*,*,*,* -東京都,6,8,5320,東京都,名詞,固有名詞,地名,一般,*,*,トウキョウト,東京都,*,B,5/9,*,5/9,* -行く,4,4,5105,行く,動詞,非自立可能,*,*,五段-カ行,終止形-一般,イク,行く,*,A,*,*,*,* -行っ,5,5,5122,行っ,動詞,非自立可能,*,*,五段-カ行,連用形-促音便,イッ,行く,7,A,*,*,*,* -都,8,8,2914,都,名詞,普通名詞,一般,*,*,*,ト,都,*,A,*,*,*,* -アイ,7,7,4675,アイ,名詞,普通名詞,一般,*,*,*,アイ,アイ,*,A,*,*,*,* -アイウ,7,7,4675,アイウ,名詞,普通名詞,一般,*,*,*,アイウ,アイウ,*,A,*,*,*,* -アイアイウ,6,6,32766,アイウ,名詞,固有名詞,地名,一般,*,*,アイアイウ,アイアイウ,*,A,*,*,*,* -0,9,9,2478,0,名詞,数詞,*,*,*,*,ゼロ,0,*,A,*,*,*,* -1,9,9,2478,1,名詞,数詞,*,*,*,*,イチ,1,*,A,*,*,*,* -2,9,9,2478,2,名詞,数詞,*,*,*,*,ニ,2,*,A,*,*,*,* -3,9,9,2478,3,名詞,数詞,*,*,*,*,サン,3,*,A,*,*,*,* -4,9,9,2478,4,名詞,数詞,*,*,*,*,ヨン,4,*,A,*,*,*,* -5,9,9,2478,5,名詞,数詞,*,*,*,*,ゴ,5,*,A,*,*,*,* -6,9,9,2478,6,名詞,数詞,*,*,*,*,ロク,6,*,A,*,*,*,* -7,9,9,2478,7,名詞,数詞,*,*,*,*,ナナ,7,*,A,*,*,*,* -8,9,9,2478,8,名詞,数詞,*,*,*,*,ハチ,8,*,A,*,*,*,* -9,9,9,2478,9,名詞,数詞,*,*,*,*,キュウ,9,*,A,*,*,*,* -〇,9,9,2478,〇,名詞,数詞,*,*,*,*,ゼロ,〇,*,A,*,*,*,* -一,9,9,2478,一,名詞,数詞,*,*,*,*,イチ,一,*,A,*,*,*,* -二,9,9,2478,二,名詞,数詞,*,*,*,*,ニ,二,*,A,*,*,*,* -三,9,9,2478,三,名詞,数詞,*,*,*,*,サン,三,*,A,*,*,*,* -四,9,9,2478,四,名詞,数詞,*,*,*,*,ヨン,四,*,A,*,*,*,* -五,9,9,2478,五,名詞,数詞,*,*,*,*,ゴ,五,*,A,*,*,*,* -六,9,9,2478,六,名詞,数詞,*,*,*,*,ロク,六,*,A,*,*,*,* -七,9,9,2478,七,名詞,数詞,*,*,*,*,ナナ,七,*,A,*,*,*,* -八,9,9,2478,八,名詞,数詞,*,*,*,*,ハチ,八,*,A,*,*,*,* -九,9,9,2478,九,名詞,数詞,*,*,*,*,キュウ,九,*,A,*,*,*,* -六三四,6,6,0,六三四,名詞,固有名詞,地名,一般,*,*,ムサシ,六三四,*,A,*,*,*,* -いく,4,4,5105,いく,動詞,非自立可能,*,*,五段-カ行,終止形-一般,イク,行く,*,A,*,*,*,* -いっ,5,5,5122,いっ,動詞,非自立可能,*,*,五段-カ行,連用形-促音便,イッ,行く,34,A,*,*,*,* 
-012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789,9,9,2478,012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789,名詞,数詞,*,*,*,*,ゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウ,012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789,*,A,*,*,*,* -特a,8,8,2914,特a,名詞,普通名詞,一般,*,*,*,トクエー,特a,*,A,*,*,*,* -な。な,8,8,2914,な。な,名詞,普通名詞,一般,*,*,*,ナナ,な。な,*,A,*,*,*,* \ No newline at end of file diff --git a/src/test/dict/matrix.def b/src/test/dict/matrix.def deleted file mode 100644 index de3529aa..00000000 --- a/src/test/dict/matrix.def +++ /dev/null @@ -1,101 +0,0 @@ -10 10 -0 0 0 -0 1 863 -0 2 2124 -0 3 1032 -0 4 591 -0 5 -162 -0 6 -79 -0 7 887 -0 8 447 -0 9 -535 -1 0 -3689 -1 1 -3361 -1 2 -7643 -1 3 -3267 -1 4 809 -1 5 -1098 -1 6 4606 -1 
7 4269 -1 8 4567 -1 9 1635 -2 0 -1959 -2 1 2457 -2 2 811 -2 3 840 -2 4 903 -2 5 -958 -2 6 517 -2 7 2037 -2 8 1392 -2 9 -193 -3 0 -2288 -3 1 1741 -3 2 487 -3 3 792 -3 4 -1474 -3 5 -3429 -3 6 126 -3 7 437 -3 8 605 -3 9 -547 -4 0 -2809 -4 1 -3584 -4 2 -6743 -4 3 -2869 -4 4 -2805 -4 5 -407 -4 6 3422 -4 7 5642 -4 8 6382 -4 9 2165 -5 0 -509 -5 1 -3665 -5 2 -3882 -5 3 -572 -5 4 -1036 -5 5 -54 -5 6 2570 -5 7 3319 -5 8 4059 -5 9 882 -6 0 101 -6 1 2933 -6 2 2198 -6 3 -2004 -6 4 4392 -6 5 4017 -6 6 569 -6 7 475 -6 8 -390 -6 9 852 -7 0 -852 -7 1 2079 -7 2 1180 -7 3 -3084 -7 4 2010 -7 5 1570 -7 6 746 -7 7 2341 -7 8 2051 -7 9 1393 -8 0 -522 -8 1 3354 -8 2 2037 -8 3 -2542 -8 4 3071 -8 5 2631 -8 6 -352 -8 7 2847 -8 8 1134 -8 9 1256 -9 0 -975 -9 1 2498 -9 2 1690 -9 3 -1523 -9 4 3023 -9 5 3139 -9 6 2562 -9 7 3962 -9 8 418 -9 9 -2490 diff --git a/src/test/dict/user.csv b/src/test/dict/user.csv deleted file mode 100644 index a81d53c5..00000000 --- a/src/test/dict/user.csv +++ /dev/null @@ -1,4 +0,0 @@ -ぴらる,8,8,-32768,ぴらる,名詞,普通名詞,一般,*,*,*,ピラル,ぴらる,*,A,*,*,*,* -府,8,8,2914,府,名詞,普通名詞,一般,*,*,*,フ,府,*,A,*,*,*,* -東京府,6,6,2816,東京府,名詞,固有名詞,地名,一般,*,*,トウキョウフ,東京府,*,B,5/U1,*,5/U1,1/3 -すだち,6,6,2816,すだち,被子植物門,双子葉植物綱,ムクロジ目,ミカン科,ミカン属,スダチ,スダチ,すだち,*,A,*,*,*,* diff --git a/src/test/dict/user2.csv b/src/test/dict/user2.csv deleted file mode 100644 index f675ddd9..00000000 --- a/src/test/dict/user2.csv +++ /dev/null @@ -1,2 +0,0 @@ -ぴさる,8,8,-32768,ぴさる,名詞,普通名詞,一般,*,*,*,ピサル,ぴさる,*,A,*,*,*,* -かぼす,6,6,2816,かぼす,被子植物門,双子葉植物綱,ムクロジ目,ミカン科,ミカン属,カボス,カボス,かぼす,*,A,*,*,*,* From e9e704bd9a99b88737839a56a8168ff30746b7c9 Mon Sep 17 00:00:00 2001 From: mh-northlander Date: Mon, 5 Aug 2024 17:52:54 +0900 Subject: [PATCH 63/94] rename classes --- .../dictionary/{Blocks.java => Block.java} | 4 +- .../nlp/sudachi/dictionary/Description.java | 245 ++++++++++-------- .../dictionary/DictionaryHeaderPrinter.java | 2 +- .../dictionary/DoubleArrayLexicon.java | 12 +- .../nlp/sudachi/dictionary/GrammarImpl.java | 4 +- 
.../sudachi/dictionary/build/BlockLayout.java | 34 +-- .../dictionary/build/CompiledWordEntry.java | 12 +- .../sudachi/dictionary/build/DicBuilder.java | 10 +- .../build/{Lookup2.java => EntryLookup.java} | 4 +- .../nlp/sudachi/dictionary/build/Index.java | 10 +- .../sudachi/dictionary/build/RawLexicon.java | 11 +- .../dictionary/build/RawWordEntry.java | 2 +- .../{WordLayout.java => StringLayout.java} | 33 ++- .../dictionary/build/StringStorage.java | 2 +- .../build/UnicodeBufferResizeable.java | 51 ---- .../dictionary/build/WordEntryLayout.java | 4 +- .../nlp/sudachi/dictionary/build/WordRef.java | 16 +- .../nlp/sudachi/dictionary/DescriptionTest.kt | 2 +- .../sudachi/dictionary/build/GrammarTest.kt | 10 +- ...{WordLayoutTest.kt => StringLayoutTest.kt} | 10 +- 20 files changed, 225 insertions(+), 253 deletions(-) rename src/main/java/com/worksap/nlp/sudachi/dictionary/{Blocks.java => Block.java} (95%) rename src/main/java/com/worksap/nlp/sudachi/dictionary/build/{Lookup2.java => EntryLookup.java} (96%) rename src/main/java/com/worksap/nlp/sudachi/dictionary/build/{WordLayout.java => StringLayout.java} (90%) delete mode 100644 src/main/java/com/worksap/nlp/sudachi/dictionary/build/UnicodeBufferResizeable.java rename src/test/java/com/worksap/nlp/sudachi/dictionary/build/{WordLayoutTest.kt => StringLayoutTest.kt} (95%) diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/Blocks.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/Block.java similarity index 95% rename from src/main/java/com/worksap/nlp/sudachi/dictionary/Blocks.java rename to src/main/java/com/worksap/nlp/sudachi/dictionary/Block.java index 1f1fba04..9f832169 100644 --- a/src/main/java/com/worksap/nlp/sudachi/dictionary/Blocks.java +++ b/src/main/java/com/worksap/nlp/sudachi/dictionary/Block.java @@ -16,8 +16,8 @@ package com.worksap.nlp.sudachi.dictionary; -public class Blocks { - private Blocks() { +public class Block { + private Block() { } public static final String WORD_POINTERS = 
"WordPointers"; diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/Description.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/Description.java index c7603734..eba97b01 100644 --- a/src/main/java/com/worksap/nlp/sudachi/dictionary/Description.java +++ b/src/main/java/com/worksap/nlp/sudachi/dictionary/Description.java @@ -30,102 +30,37 @@ import java.util.ArrayList; import java.util.List; import java.util.Locale; +import java.util.StringJoiner; /** * Description of the dictionary blocks, in-memory representation. Basically, an * extended version of the dictionary header. */ public class Description { + private static final byte[] MAGIC_BYTES = "SudachiBinaryDic".getBytes(StandardCharsets.UTF_8); + private Instant creationTime = Instant.now(); private String comment = ""; private String signature = defaultSignature(creationTime, comment); private String reference = ""; - private List blocks = new ArrayList<>(); + private List blocks = new ArrayList<>(); private long flags; private int numTotalEntries; private int numIndexedEntries; - /** - * Return a slice of the full dictionary with the provided name - * - * @param full - * ByteBuffer which represents the whole dictionary loaded into - * memory - * @param part - * name of the required part - * @return slice of the ByteBuffer - * @throws IllegalArgumentException - * if the part with the provided name was not found - */ - public ByteBuffer slice(ByteBuffer full, String part) { - ByteBuffer slice = sliceOrNull(full, part); - if (slice == null) { - throw new IllegalArgumentException("Dictionary did not contain part with name=" + part); - } - return slice; - } - - /** - * Return a slice of the full dictionary with the provided name - * - * @param full - * ByteBuffer which represents the whole dictionary loaded into - * memory - * @param part - * name of the required part - * @return slice of the ByteBuffer or null if not found - */ - public ByteBuffer sliceOrNull(ByteBuffer full, String part) { - for 
(Block b : blocks) { - if (b.name.equals(part)) { - int start = (int) b.start; - int end = (int) (b.start + b.size); - int position = full.position(); - int limit = full.limit(); - full.position(start); - full.limit(end); - ByteBuffer slice = full.slice(); - full.position(position); - full.limit(limit); - slice.order(ByteOrder.LITTLE_ENDIAN); - return slice; - } - } - return null; - } - - public boolean isSystemDictionary() { - return reference.isEmpty(); - } - - public boolean isUserDictionary() { - return !reference.isEmpty(); - } - - public static class Block { - private final String name; - private final long start; - private final long size; - - public Block(String name, long start, long size) { - this.name = name; - this.start = start; - this.size = size; - } - - public String getName() { - return name; - } - - public long getStart() { - return start; - } - - public long getSize() { - return size; + /** Load Description from bytes. */ + public static Description load(ByteBuffer raw) { + checkLegacyDictionaryFormat(raw); + checkMagic(raw); + long version = raw.getLong(); + if (version == 1) { + return loadV1(raw); + } else { + throw new IllegalArgumentException(String.format("invalid version %d, corrupted dictionary", version)); } } + /** Load Description from the first block of the channel. 
*/ public static Description load(SeekableByteChannel channel) throws IOException { ByteBuffer buf = ByteBuffer.allocate(4096); buf.order(ByteOrder.LITTLE_ENDIAN); @@ -136,17 +71,28 @@ public static Description load(SeekableByteChannel channel) throws IOException { return load(buf); } - public static Description load(ByteBuffer raw) { - checkLegacyDictionaryFormat(raw); - checkMagic(raw); - long version = raw.getLong(); - if (version == 1) { - return loadV1(raw); - } else { - throw new IllegalArgumentException(String.format("invalid version %d, corrupted dictionary", version)); + private static void checkMagic(ByteBuffer raw) { + assert MAGIC_BYTES.length == 16; + byte[] expected = new byte[MAGIC_BYTES.length]; + raw.get(expected); + for (int i = 0; i < expected.length; i++) { + if (MAGIC_BYTES[i] != expected[i]) { + throw new IllegalArgumentException("invalid magic string, dictionary is corrupted"); + } + } + } + + private static void checkLegacyDictionaryFormat(ByteBuffer raw) { + long version = raw.getLong(0); + if (LegacyDictionaryVersion.isSystemDictionary(version)) { + throw new IllegalArgumentException("passed dictionary is a legacy system dictionary, please rebuild it"); + } + if (LegacyDictionaryVersion.isUserDictionary(version)) { + throw new IllegalArgumentException("passed dictionary is a legacy user dictionary, please rebuild it"); } } + /** Load V1 format description. 
*/ private static Description loadV1(ByteBuffer raw) { Description desc = new Description(); BufReader reader = new BufReader(raw); @@ -159,13 +105,14 @@ private static Description loadV1(ByteBuffer raw) { desc.numTotalEntries = reader.readVarint32(); int length = reader.readVarint32(); for (int i = 0; i < length; ++i) { - Block b = new Block(reader.readUtf8String(), reader.readVarint64(), reader.readVarint64()); + BlockInfo b = new BlockInfo(reader.readUtf8String(), reader.readVarint64(), reader.readVarint64()); desc.blocks.add(b); } return desc; } + /** Save this Description (V1 format). */ public void save(SeekableByteChannel channel) throws IOException { ByteBuffer buff = ByteBuffer.allocate(4096); buff.order(ByteOrder.LITTLE_ENDIAN); @@ -181,10 +128,10 @@ public void save(SeekableByteChannel channel) throws IOException { writer.putVarint32(numTotalEntries); int length = blocks.size(); writer.putVarint32(length); - for (Block b : blocks) { - writer.putUtf8String(b.name); - writer.putVarint64(b.start); - writer.putVarint64(b.size); + for (BlockInfo b : blocks) { + writer.putUtf8String(b.getName()); + writer.putVarint64(b.getStart()); + writer.putVarint64(b.getSize()); } // write to the first block @@ -195,40 +142,102 @@ public void save(SeekableByteChannel channel) throws IOException { channel.position(pos); } - private static final byte[] MAGIC_BYTES = "SudachiBinaryDic".getBytes(StandardCharsets.UTF_8); + public static class BlockInfo { + private final String name; + private final long start; + private final long size; - private static void checkMagic(ByteBuffer raw) { - assert MAGIC_BYTES.length == 16; - byte[] expected = new byte[MAGIC_BYTES.length]; - raw.get(expected); - for (int i = 0; i < expected.length; i++) { - if (MAGIC_BYTES[i] != expected[i]) { - throw new IllegalArgumentException("invalid magic string, dictionary is corrupted"); - } + public BlockInfo(String name, long start, long size) { + this.name = name; + this.start = start; + this.size = 
size; } - } - private static void checkLegacyDictionaryFormat(ByteBuffer raw) { - long version = raw.getLong(0); - if (LegacyDictionaryVersion.isSystemDictionary(version)) { - throw new IllegalArgumentException("passed dictionary is a legacy system dictionary, please rebuild it"); + public String getName() { + return name; } - if (LegacyDictionaryVersion.isUserDictionary(version)) { - throw new IllegalArgumentException("passed dictionary is a legacy user dictionary, please rebuild it"); + + public long getStart() { + return start; + } + + public long getSize() { + return size; + } + + public long getEnd() { + return start + size; + } + + @Override + public String toString() { + return new StringJoiner(", ", BlockInfo.class.getSimpleName() + "[", "]").add("name='" + name + "'") + .add("start=" + start).add("end=" + getEnd()).toString(); } } - private String defaultSignature(Instant date, String comment) { - DateTimeFormatter formatter = DateTimeFormatter.ofPattern("yyyyMMddHHmmss", Locale.US); - return String.format("%s-%08x", formatter.format(LocalDateTime.ofInstant(date, ZoneId.systemDefault())), - comment.hashCode()); + public boolean isSystemDictionary() { + return reference.isEmpty(); + } + + public boolean isUserDictionary() { + return !reference.isEmpty(); + } + + /** + * Return a slice of the full dictionary with the provided name + * + * @param full + * ByteBuffer which represents the whole dictionary loaded into + * memory + * @param part + * name of the required part + * @return slice of the ByteBuffer + * @throws IllegalArgumentException + * if the part with the provided name was not found + */ + public ByteBuffer slice(ByteBuffer full, String part) { + ByteBuffer slice = sliceOrNull(full, part); + if (slice == null) { + throw new IllegalArgumentException("Dictionary did not contain part with name=" + part); + } + return slice; + } + + /** + * Return a slice of the full dictionary with the provided name + * + * @param full + * ByteBuffer which represents 
the whole dictionary loaded into + * memory + * @param part + * name of the required part + * @return slice of the ByteBuffer or null if not found + */ + public ByteBuffer sliceOrNull(ByteBuffer full, String part) { + for (BlockInfo b : blocks) { + if (b.getName().equals(part)) { + int start = (int) b.getStart(); + int end = (int) (b.getEnd()); + int position = full.position(); + int limit = full.limit(); + full.position(start); + full.limit(end); + ByteBuffer slice = full.slice(); + full.position(position); + full.limit(limit); + slice.order(ByteOrder.LITTLE_ENDIAN); + return slice; + } + } + return null; } public Instant getCreationTime() { return creationTime; } - public void setCompilationTime(Instant creationTime) { + public void setCreationTime(Instant creationTime) { this.creationTime = creationTime; } @@ -248,6 +257,12 @@ public void setComment(String comment) { this.comment = comment; } + private String defaultSignature(Instant date, String comment) { + DateTimeFormatter formatter = DateTimeFormatter.ofPattern("yyyyMMddHHmmss", Locale.US); + return String.format("%s-%08x", formatter.format(LocalDateTime.ofInstant(date, ZoneId.systemDefault())), + comment.hashCode()); + } + public String getSignature() { return signature; } @@ -272,11 +287,11 @@ public void setReference(String reference) { this.reference = reference; } - public List getBlocks() { + public List getBlocks() { return blocks; } - public void setBlocks(List blocks) { + public void setBlocks(List blocks) { this.blocks = blocks; } diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/DictionaryHeaderPrinter.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/DictionaryHeaderPrinter.java index fc531bd4..d4ae61f1 100644 --- a/src/main/java/com/worksap/nlp/sudachi/dictionary/DictionaryHeaderPrinter.java +++ b/src/main/java/com/worksap/nlp/sudachi/dictionary/DictionaryHeaderPrinter.java @@ -57,7 +57,7 @@ static void printDescription(String filename, PrintStream output) throws IOExcep 
output.printf("Reference: %s%n", desc.getReference()); output.printf("Entries total: %d%n", desc.getNumTotalEntries()); output.printf("Entries indexed: %d%n", desc.getNumIndexedEntries()); - for (Description.Block b : desc.getBlocks()) { + for (Description.BlockInfo b : desc.getBlocks()) { long start = b.getStart(); output.printf("Block %s: %d - %d%n", b.getName(), start, start + b.getSize()); } diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/DoubleArrayLexicon.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/DoubleArrayLexicon.java index 8c52cf5e..423d9a2e 100644 --- a/src/main/java/com/worksap/nlp/sudachi/dictionary/DoubleArrayLexicon.java +++ b/src/main/java/com/worksap/nlp/sudachi/dictionary/DoubleArrayLexicon.java @@ -44,21 +44,21 @@ public DoubleArrayLexicon(Description description, WordIdTable wordIdTable, Word } public static DoubleArrayLexicon load(ByteBuffer bytes, Description header) { - ByteBuffer trieBuf = header.slice(bytes, Blocks.TRIE_INDEX); + ByteBuffer trieBuf = header.slice(bytes, Block.TRIE_INDEX); DoubleArray da = new DoubleArray(); IntBuffer array = trieBuf.asIntBuffer(); da.setArray(array, array.limit()); WordParameters parms; if (header.isRuntimeCosts()) { - parms = WordParameters.readWrite(header.slice(bytes, Blocks.ENTRIES)); + parms = WordParameters.readWrite(header.slice(bytes, Block.ENTRIES)); } else { - parms = WordParameters.readOnly(header.slice(bytes, Blocks.ENTRIES)); + parms = WordParameters.readOnly(header.slice(bytes, Block.ENTRIES)); } - WordIdTable idTable = new WordIdTable(header.slice(bytes, Blocks.WORD_POINTERS)); - WordInfoList infos = new WordInfoList(header.slice(bytes, Blocks.ENTRIES)); - CompactedStrings strings = new CompactedStrings(header.slice(bytes, Blocks.STRINGS).asCharBuffer()); + WordIdTable idTable = new WordIdTable(header.slice(bytes, Block.WORD_POINTERS)); + WordInfoList infos = new WordInfoList(header.slice(bytes, Block.ENTRIES)); + CompactedStrings strings = new 
CompactedStrings(header.slice(bytes, Block.STRINGS).asCharBuffer()); return new DoubleArrayLexicon(header, idTable, parms, infos, da, strings); } diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/GrammarImpl.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/GrammarImpl.java index 58fb04f5..1279eb73 100644 --- a/src/main/java/com/worksap/nlp/sudachi/dictionary/GrammarImpl.java +++ b/src/main/java/com/worksap/nlp/sudachi/dictionary/GrammarImpl.java @@ -78,12 +78,12 @@ public GrammarImpl(List posList, Connection matrix) { } public static GrammarImpl load(ByteBuffer binaryDic, Description header) { - ByteBuffer connmatBytes = header.sliceOrNull(binaryDic, Blocks.CONNECTION_MATRIX); + ByteBuffer connmatBytes = header.sliceOrNull(binaryDic, Block.CONNECTION_MATRIX); Connection matrix = null; if (connmatBytes != null) { matrix = Connection.fromByteBufferV1(connmatBytes); } - List posList = loadPosList(header.slice(binaryDic, Blocks.POS_TABLE)); + List posList = loadPosList(header.slice(binaryDic, Block.POS_TABLE)); return new GrammarImpl(posList, matrix); } diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/BlockLayout.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/BlockLayout.java index c52ed86c..aa0c7406 100644 --- a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/BlockLayout.java +++ b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/BlockLayout.java @@ -16,13 +16,12 @@ package com.worksap.nlp.sudachi.dictionary.build; -import com.worksap.nlp.sudachi.dictionary.Description; +import com.worksap.nlp.sudachi.dictionary.Description.BlockInfo; import java.io.IOException; import java.nio.channels.SeekableByteChannel; import java.util.ArrayList; import java.util.List; -import java.util.StringJoiner; /** * Output channel wrapper to write dictionary parts in block layout. 
Also @@ -91,39 +90,16 @@ public T block(String name, BlockHandler handler) throws IOException { long start = alignPosition(); T result = handler.apply(new BlockOutput(chan, progress)); long end = chan.position(); - info.add(new BlockInfo(name, start, end)); + info.add(new BlockInfo(name, start, end - start)); return result; } /** * Returns the summary of block written. * - * @return block information in the Description.Block format. + * @return block information list. */ - public List blocks() { - List result = new ArrayList<>(); - for (BlockInfo b : info) { - Description.Block published = new Description.Block(b.name, b.start, b.end - b.start); - result.add(published); - } - return result; - } - - private static class BlockInfo { - String name; - long start; - long end; - - public BlockInfo(String name, long start, long end) { - this.name = name; - this.start = start; - this.end = end; - } - - @Override - public String toString() { - return new StringJoiner(", ", BlockInfo.class.getSimpleName() + "[", "]").add("name='" + name + "'") - .add("start=" + start).add("end=" + end).toString(); - } + public List blocks() { + return info; } } diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/CompiledWordEntry.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/CompiledWordEntry.java index dafb62a9..1a06954c 100644 --- a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/CompiledWordEntry.java +++ b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/CompiledWordEntry.java @@ -27,10 +27,10 @@ * Used to resolve wordref that references entry in the system dictionary * (during user dictinary build). 
*/ -public class CompiledWordEntry implements Lookup2.Entry { +public class CompiledWordEntry implements EntryLookup.Entry { private final Lexicon lexicon; private final int wordId; - private WordInfo wiCache = null; + private WordInfo wordInfo = null; public CompiledWordEntry(Lexicon lexicon, int wordId) { this.lexicon = lexicon; @@ -38,11 +38,11 @@ public CompiledWordEntry(Lexicon lexicon, int wordId) { } private WordInfo wordInfo() { - if (wiCache != null) { - return wiCache; + if (wordInfo != null) { + return wordInfo; } - wiCache = lexicon.getWordInfo(wordId); - return wiCache; + wordInfo = lexicon.getWordInfo(wordId); + return wordInfo; } @Override diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/DicBuilder.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/DicBuilder.java index 7ba87013..de0b6de4 100644 --- a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/DicBuilder.java +++ b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/DicBuilder.java @@ -17,7 +17,7 @@ package com.worksap.nlp.sudachi.dictionary.build; import com.worksap.nlp.sudachi.dictionary.BinaryDictionary; -import com.worksap.nlp.sudachi.dictionary.Blocks; +import com.worksap.nlp.sudachi.dictionary.Block; import com.worksap.nlp.sudachi.dictionary.Description; import java.io.IOException; @@ -154,8 +154,8 @@ public T comment(String comment) { * time to set * @return current object */ - public T compilationTime(Instant instant) { - description.setCompilationTime(Objects.requireNonNull(instant)); + public T creationTime(Instant instant) { + description.setCreationTime(Objects.requireNonNull(instant)); return self(); } @@ -171,9 +171,9 @@ public void build(SeekableByteChannel channel) throws IOException { BlockLayout layout = new BlockLayout(channel, progress); layout.keepBlocks(1); // keep space for the Description. 
if (connection.nonEmpty()) { - layout.block(Blocks.CONNECTION_MATRIX, connection::compile); + layout.block(Block.CONNECTION_MATRIX, connection::compile); } - layout.block(Blocks.POS_TABLE, pos::compile); + layout.block(Block.POS_TABLE, pos::compile); lexicon.compile(layout); description.setBlocks(layout.blocks()); description.setNumberOfEntries(lexicon.getIndexedEntries(), lexicon.getTotalEntries()); diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/Lookup2.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/EntryLookup.java similarity index 96% rename from src/main/java/com/worksap/nlp/sudachi/dictionary/build/Lookup2.java rename to src/main/java/com/worksap/nlp/sudachi/dictionary/build/EntryLookup.java index d8a51f00..76b2e174 100644 --- a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/Lookup2.java +++ b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/EntryLookup.java @@ -24,7 +24,7 @@ /** * Utility to lookup entries from the list. */ -public class Lookup2 { +public class EntryLookup { public interface Entry { /** @return wordid of the entry. 
*/ int pointer(); @@ -68,7 +68,7 @@ public String headword() { // mapping to entries that have same surfaces private final Map> bySurface; - public Lookup2(List systemEntries, List userEntries) { + public EntryLookup(List systemEntries, List userEntries) { this.systemEntries = systemEntries; this.userEntries = userEntries; diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/Index.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/Index.java index 28cc61e2..590e2272 100644 --- a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/Index.java +++ b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/Index.java @@ -17,7 +17,7 @@ package com.worksap.nlp.sudachi.dictionary.build; import com.worksap.nlp.dartsclone.DoubleArray; -import com.worksap.nlp.sudachi.dictionary.Blocks; +import com.worksap.nlp.sudachi.dictionary.Block; import com.worksap.nlp.sudachi.dictionary.Ints; import java.io.IOException; @@ -69,11 +69,11 @@ public int add(String key, int wordId) { * @throws IOException */ public void compile(BlockLayout layout, List notIndexed) throws IOException { - TrieData data = layout.block(Blocks.WORD_POINTERS, o -> writeWordTable(o, notIndexed)); - layout.block(Blocks.TRIE_INDEX, data::writeTrie); + TrieData data = layout.block(Block.WORD_POINTERS, o -> writeWordTable(o, notIndexed)); + layout.block(Block.TRIE_INDEX, data::writeTrie); } - private TrieData writeWordTable(BlockOutput out, List notIndexed) throws IOException { + private TrieData writeWordTable(BlockOutput out, List notIndexed) throws IOException { int size = this.elements.size(); byte[][] keys = new byte[size][]; int[] values = new int[size]; @@ -107,7 +107,7 @@ private TrieData writeWordTable(BlockOutput out, List n BufWriter buf = buffer.writer((nis + 1) * 5); buf.putVarint32(nis); int prevId = 0; - for (Lookup2.Entry e : notIndexed) { + for (EntryLookup.Entry e : notIndexed) { int wid = e.pointer(); buf.putVarint32(wid - prevId); prevId = wid; diff --git 
a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/RawLexicon.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/RawLexicon.java index 12244560..fe7185e6 100644 --- a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/RawLexicon.java +++ b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/RawLexicon.java @@ -16,7 +16,7 @@ package com.worksap.nlp.sudachi.dictionary.build; -import com.worksap.nlp.sudachi.dictionary.Blocks; +import com.worksap.nlp.sudachi.dictionary.Block; import com.worksap.nlp.sudachi.dictionary.DoubleArrayLexicon; import com.worksap.nlp.sudachi.dictionary.Ints; import com.worksap.nlp.sudachi.dictionary.Lexicon; @@ -157,8 +157,8 @@ public void checkOffset(long offset) { public void compile(BlockLayout layout) throws IOException { index.compile(layout, notIndexed); // entry layout requires stringstorage to be compiled beforehand. - layout.block(Blocks.STRINGS, this::writeStrings); - layout.block(Blocks.ENTRIES, this::writeEntries); + layout.block(Block.STRINGS, this::writeStrings); + layout.block(Block.ENTRIES, this::writeEntries); } private Void writeStrings(BlockOutput blockOutput) throws IOException { @@ -172,7 +172,8 @@ private Void writeStrings(BlockOutput blockOutput) throws IOException { private Void writeEntries(BlockOutput blockOutput) throws IOException { return blockOutput.measured("Word Entries", p -> { List list = entries; - Lookup2 lookup = isUser ? new Lookup2(preloadedEntries, list) : new Lookup2(list, new ArrayList<>()); + EntryLookup lookup = isUser ? 
new EntryLookup(preloadedEntries, list) + : new EntryLookup(list, new ArrayList<>()); BufferedChannel buf = new BufferedChannel(blockOutput.getChannel(), WordEntryLayout.MAX_LENGTH * 4); buf.position(INITIAL_OFFSET); WordEntryLayout layout = new WordEntryLayout(lookup, strings, buf, isUser); @@ -200,7 +201,7 @@ private Void writeEntries(BlockOutput blockOutput) throws IOException { * @param lookup * @return 1 if phantom entry added, 0 otherwise */ - private int addPhantomEntries(RawWordEntry entry, List list, Lookup2 lookup) { + private int addPhantomEntries(RawWordEntry entry, List list, EntryLookup lookup) { if (entry.normalizedForm instanceof WordRef.Headword) { WordRef.Headword ref = (WordRef.Headword) entry.normalizedForm; if (lookup.byHeadword(ref.getHeadword()) != null) { diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/RawWordEntry.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/RawWordEntry.java index 2d98efc7..58a28041 100644 --- a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/RawWordEntry.java +++ b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/RawWordEntry.java @@ -28,7 +28,7 @@ * Raw word info entry parsed from the lexicon csv. 
*/ @SuppressWarnings("jol") -public class RawWordEntry implements Lookup2.Entry { +public class RawWordEntry implements EntryLookup.Entry { int pointer; // wordid, compressed offset of this entry in the lexicon.WordEntries String headword; String reading; diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/WordLayout.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/StringLayout.java similarity index 90% rename from src/main/java/com/worksap/nlp/sudachi/dictionary/build/WordLayout.java rename to src/main/java/com/worksap/nlp/sudachi/dictionary/build/StringLayout.java index 5d87fcdb..e289af0a 100644 --- a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/WordLayout.java +++ b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/StringLayout.java @@ -19,6 +19,8 @@ import com.worksap.nlp.sudachi.dictionary.StringPtr; import java.io.IOException; +import java.nio.ByteBuffer; +import java.nio.CharBuffer; import java.nio.channels.WritableByteChannel; import java.util.ArrayList; import java.util.Collections; @@ -45,7 +47,7 @@ * needed and guarding against relatively expensive checking free lists with * additional conditions. 
*/ -public class WordLayout { +public class StringLayout { private final UnicodeBufferResizeable buffer = new UnicodeBufferResizeable(); private final ArrayList free = new ArrayList<>(); private boolean freeDirty = false; @@ -272,4 +274,33 @@ int wastedBytes() { int numSlots() { return free.size(); } + + /** Resizable byte buffer to store string */ + private class UnicodeBufferResizeable { + private ResizableBuffer buffer; + + public UnicodeBufferResizeable(int size) { + this.buffer = new ResizableBuffer(size); + } + + public UnicodeBufferResizeable() { + this(64 * 1024); + } + + /** put specified (char) range of the string to the buffer from offset */ + public void put(int offset, String data, int start, int end) { + CharBuffer chars = prepare(offset, end - start); + chars.put(data, start, end); + } + + private CharBuffer prepare(int offset, int numChars) { + ByteBuffer buf = buffer.prepare(offset * 2, numChars * 2); + return buf.asCharBuffer(); + } + + /** write specified (byte) range of the buffer to the channel */ + public void write(WritableByteChannel channel, int start, int end) throws IOException { + buffer.write(channel, start, end); + } + } } diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/StringStorage.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/StringStorage.java index 589aa08b..8d98390a 100644 --- a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/StringStorage.java +++ b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/StringStorage.java @@ -30,7 +30,7 @@ public class StringStorage implements StringIndex { private final HashMap strings = new HashMap<>(); private final HashMap candidates = new HashMap<>(); // compacted strings layout - private final WordLayout layout = new WordLayout(); + private final StringLayout layout = new StringLayout(); /** * Add string to the storage. 
diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/UnicodeBufferResizeable.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/UnicodeBufferResizeable.java deleted file mode 100644 index 6b8d68f5..00000000 --- a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/UnicodeBufferResizeable.java +++ /dev/null @@ -1,51 +0,0 @@ -/* - * Copyright (c) 2022 Works Applications Co., Ltd. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package com.worksap.nlp.sudachi.dictionary.build; - -import java.io.IOException; -import java.nio.ByteBuffer; -import java.nio.CharBuffer; -import java.nio.channels.WritableByteChannel; - -/** Resizable byte buffer to store string */ -public class UnicodeBufferResizeable { - private ResizableBuffer buffer; - - public UnicodeBufferResizeable(int size) { - this.buffer = new ResizableBuffer(size); - } - - public UnicodeBufferResizeable() { - this(64 * 1024); - } - - /** put specified (char) range of the string to the buffer from offset */ - public void put(int offset, String data, int start, int end) { - CharBuffer chars = prepare(offset, end - start); - chars.put(data, start, end); - } - - private CharBuffer prepare(int offset, int numChars) { - ByteBuffer buf = buffer.prepare(offset * 2, numChars * 2); - return buf.asCharBuffer(); - } - - /** write specified (byte) range of the buffer to the channel */ - public void write(WritableByteChannel channel, int start, int end) throws IOException { - 
buffer.write(channel, start, end); - } -} diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/WordEntryLayout.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/WordEntryLayout.java index d73b9f8d..74c6a70e 100644 --- a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/WordEntryLayout.java +++ b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/WordEntryLayout.java @@ -29,7 +29,7 @@ */ public class WordEntryLayout { private final StringIndex index; - private final Lookup2 lookup; + private final EntryLookup lookup; private final BufferedChannel buffer; private final boolean isUser; @@ -43,7 +43,7 @@ public class WordEntryLayout { + Byte.MAX_VALUE * Integer.BYTES * 5 // splits and synonyms + (Short.MAX_VALUE + 1) * Character.BYTES; // user data - public WordEntryLayout(Lookup2 resolver, StringIndex index, BufferedChannel buffer, boolean isUser) { + public WordEntryLayout(EntryLookup resolver, StringIndex index, BufferedChannel buffer, boolean isUser) { this.lookup = resolver; this.index = index; this.buffer = buffer; diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/WordRef.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/WordRef.java index 777ec0a2..0fa9583d 100644 --- a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/WordRef.java +++ b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/WordRef.java @@ -30,7 +30,7 @@ */ public abstract class WordRef { /** resolve word ref into pointer (word id) using resolver. */ - public abstract int resolve(Lookup2 resolver); + public abstract int resolve(EntryLookup resolver); /** * Encode the target entry as wordref. @@ -43,7 +43,7 @@ public abstract class WordRef { * to encode * @return encoded wordref */ - public int intoWordRef(Lookup2.EntryWithFlag entry) { + public int intoWordRef(EntryLookup.EntryWithFlag entry) { return WordId.make(entry.isUser ? 
1 : 0, entry.pointer()); } @@ -64,7 +64,7 @@ public int getLine() { } @Override - public int resolve(Lookup2 resolver) { + public int resolve(EntryLookup resolver) { return intoWordRef(resolver.byIndex(line, isUser)); } @@ -105,8 +105,8 @@ public String getHeadword() { } @Override - public int resolve(Lookup2 resolver) { - List entries = resolver.byHeadword(headword); + public int resolve(EntryLookup resolver) { + List entries = resolver.byHeadword(headword); return intoWordRef(entries.get(0)); } @@ -158,12 +158,12 @@ public String getReading() { } @Override - public int resolve(Lookup2 resolver) { - List entries = resolver.byHeadword(headword); + public int resolve(EntryLookup resolver) { + List entries = resolver.byHeadword(headword); if (entries == null) { throw new IllegalArgumentException("matching entry not found for the " + this.toString()); } - for (Lookup2.EntryWithFlag entry : entries) { + for (EntryLookup.EntryWithFlag entry : entries) { if (entry.matches(posId, reading)) { return intoWordRef(entry); } diff --git a/src/test/java/com/worksap/nlp/sudachi/dictionary/DescriptionTest.kt b/src/test/java/com/worksap/nlp/sudachi/dictionary/DescriptionTest.kt index c315aaad..5df90c1a 100644 --- a/src/test/java/com/worksap/nlp/sudachi/dictionary/DescriptionTest.kt +++ b/src/test/java/com/worksap/nlp/sudachi/dictionary/DescriptionTest.kt @@ -25,7 +25,7 @@ class DescriptionTest { @Test fun serialization() { val d = Description() - d.blocks = listOf(Description.Block("test", 5, 15), Description.Block("test2", 30, 25)) + d.blocks = listOf(Description.BlockInfo("test", 5, 15), Description.BlockInfo("test2", 30, 25)) d.reference = "testref" d.comment = "コメント" val chan = InMemoryChannel(4096) diff --git a/src/test/java/com/worksap/nlp/sudachi/dictionary/build/GrammarTest.kt b/src/test/java/com/worksap/nlp/sudachi/dictionary/build/GrammarTest.kt index b999c287..7a630887 100644 --- a/src/test/java/com/worksap/nlp/sudachi/dictionary/build/GrammarTest.kt +++ 
b/src/test/java/com/worksap/nlp/sudachi/dictionary/build/GrammarTest.kt @@ -16,7 +16,7 @@ package com.worksap.nlp.sudachi.dictionary.build -import com.worksap.nlp.sudachi.dictionary.Blocks +import com.worksap.nlp.sudachi.dictionary.Block import com.worksap.nlp.sudachi.dictionary.Description import com.worksap.nlp.sudachi.dictionary.GrammarImpl import com.worksap.nlp.sudachi.dictionary.POS @@ -33,8 +33,8 @@ class GrammarTest { assertEquals(0, pos.getId(POS("a", "b", "c", "d", "e", "f"))) val outbuf = MemChannel() val layout = BlockLayout(outbuf) - layout.block(Blocks.POS_TABLE, pos::compile) - layout.block(Blocks.CONNECTION_MATRIX, cm::compile) + layout.block(Block.POS_TABLE, pos::compile) + layout.block(Block.CONNECTION_MATRIX, cm::compile) val description = Description() description.setBlocks(layout.blocks()) val grammar = GrammarImpl.load(outbuf.buffer(), description) @@ -81,8 +81,8 @@ class GrammarTest { val outbuf = MemChannel() val layout = BlockLayout(outbuf) - layout.block(Blocks.POS_TABLE, posTable::compile) - layout.block(Blocks.CONNECTION_MATRIX, cm::compile) + layout.block(Block.POS_TABLE, posTable::compile) + layout.block(Block.CONNECTION_MATRIX, cm::compile) val description = Description() description.setBlocks(layout.blocks()) val grammar = GrammarImpl.load(outbuf.buffer(), description) diff --git a/src/test/java/com/worksap/nlp/sudachi/dictionary/build/WordLayoutTest.kt b/src/test/java/com/worksap/nlp/sudachi/dictionary/build/StringLayoutTest.kt similarity index 95% rename from src/test/java/com/worksap/nlp/sudachi/dictionary/build/WordLayoutTest.kt rename to src/test/java/com/worksap/nlp/sudachi/dictionary/build/StringLayoutTest.kt index f95a6be4..86d8dbde 100644 --- a/src/test/java/com/worksap/nlp/sudachi/dictionary/build/WordLayoutTest.kt +++ b/src/test/java/com/worksap/nlp/sudachi/dictionary/build/StringLayoutTest.kt @@ -23,7 +23,7 @@ import kotlin.test.assertEquals import kotlin.test.assertNotNull import kotlin.test.assertTrue -class 
WordLayoutTest { +class StringLayoutTest { companion object { fun CharBuffer.read(ptr: StringPtr): String { return substring(ptr.offset, ptr.offset + ptr.length) @@ -32,7 +32,7 @@ class WordLayoutTest { @Test fun alignmentBasedPlacement() { - val layout = WordLayout() + val layout = StringLayout() val p1 = layout.add("0".repeat(25)) val p2 = layout.add("1".repeat(23)) val p3 = layout.add("2".repeat(15)) @@ -55,7 +55,7 @@ class WordLayoutTest { @Test fun alignmentPlacedPlacementLarge() { - val layout = WordLayout() + val layout = StringLayout() val ptrs = ArrayList() for (i in 0..499) { val char = 500 - i @@ -75,7 +75,7 @@ class WordLayoutTest { @Test fun alignmentPlacedPlacementHoles() { - val layout = WordLayout() + val layout = StringLayout() val ptrs = ArrayList() for (i in 0..3) { val count = 200 - 5 * i @@ -108,7 +108,7 @@ class WordLayoutTest { @Test fun coverage() { - val layout = WordLayout() + val layout = StringLayout() assertEquals(0, layout.wastedBytes()) assertEquals(0, layout.numSlots()) assertNotNull(layout.toString()) From 0c474f65145507a7ddb84735e9bfacc327d07f62 Mon Sep 17 00:00:00 2001 From: mh-northlander Date: Mon, 5 Aug 2024 18:01:13 +0900 Subject: [PATCH 64/94] merge csvfieldexception into inputfileexception --- .../dictionary/build/ConnectionMatrix.java | 6 ++--- .../dictionary/build/CsvFieldException.java | 23 ---------------- .../dictionary/build/InputFileException.java | 12 +++++++-- .../sudachi/dictionary/build/POSTable.java | 8 +++--- .../dictionary/build/RawLexiconReader.java | 26 +++++++++---------- 5 files changed, 30 insertions(+), 45 deletions(-) delete mode 100644 src/main/java/com/worksap/nlp/sudachi/dictionary/build/CsvFieldException.java diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/ConnectionMatrix.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/ConnectionMatrix.java index 227819d0..795fe708 100644 --- a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/ConnectionMatrix.java +++ 
b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/ConnectionMatrix.java @@ -109,8 +109,8 @@ public long readEntries(InputStream data) throws IOException { } String[] cols = WHITESPACE.split(line); if (cols.length < 3) { - throw new InputFileException(reader.getLineNumber(), line, - new IllegalArgumentException("not enough entries")); + throw new InputFileException(reader.getLineNumber(), + new IllegalArgumentException(String.format("not enough entries: %s", line))); } try { @@ -119,7 +119,7 @@ public long readEntries(InputStream data) throws IOException { short cost = Short.parseShort(cols[2]); conn.setCost(left, right, cost); } catch (NumberFormatException e) { - throw new InputFileException(reader.getLineNumber(), "", e); + throw new InputFileException(reader.getLineNumber(), e); } numLines += 1; diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/CsvFieldException.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/CsvFieldException.java deleted file mode 100644 index 5434ec94..00000000 --- a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/CsvFieldException.java +++ /dev/null @@ -1,23 +0,0 @@ -/* - * Copyright (c) 2022 Works Applications Co., Ltd. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package com.worksap.nlp.sudachi.dictionary.build; - -public class CsvFieldException extends IllegalArgumentException { - public CsvFieldException(String file, int line, String column, Exception cause) { - super(String.format("[%s line %d, %s]", file, line, column), cause); - } -} diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/InputFileException.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/InputFileException.java index 24537813..cb15432b 100644 --- a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/InputFileException.java +++ b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/InputFileException.java @@ -17,7 +17,15 @@ package com.worksap.nlp.sudachi.dictionary.build; public class InputFileException extends IllegalArgumentException { - public InputFileException(int line, String s, Exception cause) { - super(String.format("line:%d %s", line, s), cause); + public InputFileException(int line, Exception cause) { + super(String.format("[line:%d]", line), cause); + } + + public InputFileException(String file, int line, Exception cause) { + super(String.format("[%s line:%d]", file, line), cause); + } + + public InputFileException(String file, int line, String column, Exception cause) { + super(String.format("[%s line:%d, %s]", file, line, column), cause); } } diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/POSTable.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/POSTable.java index 834f21b1..f093982c 100644 --- a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/POSTable.java +++ b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/POSTable.java @@ -110,14 +110,14 @@ public int readEntries(InputStream data) throws IOException { String[] cols = line.split(","); if (cols.length != 6) { - throw new InputFileException(numLines, line, - new IllegalArgumentException("each POS must have 6 columns.")); + throw new InputFileException(numLines, + new 
IllegalArgumentException(String.format("each POS must have 6 columns: %s", line))); } int posid = getId(new POS(cols)); if (posid != baseSize + numLines) { - throw new InputFileException(numLines, line, - new IllegalArgumentException(String.format("POS already exists (%s).", posid))); + throw new InputFileException(numLines, + new IllegalArgumentException(String.format("POS already exists (%s): %s", posid, line))); } numLines += 1; } diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/RawLexiconReader.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/RawLexiconReader.java index fd12697f..525f4da0 100644 --- a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/RawLexiconReader.java +++ b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/RawLexiconReader.java @@ -105,7 +105,7 @@ private void resolveColumnLayout() throws IOException { } } if (!columnFound) { - throw new CsvFieldException(parser.getName(), 0, field, + throw new InputFileException(parser.getName(), 0, field, new IllegalArgumentException("Invalid column name")); } } @@ -114,7 +114,7 @@ private void resolveColumnLayout() throws IOException { if (column.required) { StringJoiner joiner = new StringJoiner(", ", "required columns [", "] were not present in the header"); remaining.stream().filter(c -> c.required).forEach(c -> joiner.add(c.name())); - throw new CsvFieldException(parser.getName(), 0, "", new IllegalArgumentException(joiner.toString())); + throw new InputFileException(parser.getName(), 0, "", new IllegalArgumentException(joiner.toString())); } } @@ -123,12 +123,12 @@ private void resolveColumnLayout() throws IOException { .asList(Column.POS1, Column.POS2, Column.POS3, Column.POS4, Column.POS5, Column.POS6).stream() .filter(c -> mapping[c.ordinal()] >= 0).count(); if (numPosColumnsFound != 0 && numPosColumnsFound != POS.DEPTH) { - throw new CsvFieldException(parser.getName(), 0, "POS", + throw new InputFileException(parser.getName(), 0, "POS", new 
IllegalArgumentException("Pos1 ~ Pos6 columns must appear as a set.")); } boolean posStrExists = numPosColumnsFound == POS.DEPTH; if (!posIdExists && !posStrExists) { - throw new CsvFieldException(parser.getName(), 0, "POS", + throw new InputFileException(parser.getName(), 0, "POS", new IllegalArgumentException("Both or either PosId column or Pos1~Pos6 columns are required.")); } } @@ -141,7 +141,7 @@ private String get(List data, Column column, boolean unescape) { } if (index < 0 || index >= data.size()) { if (column.required) { - throw new CsvFieldException(parser.getName(), parser.getRowCount(), column.name(), + throw new InputFileException(parser.getName(), parser.getRowCount(), column.name(), new IllegalArgumentException(String.format("column [%s] was not present", column.name()))); } else { return ""; @@ -158,7 +158,7 @@ private String get(List data, Column column, boolean unescape) { private String getNonEmpty(List data, Column column, boolean unescape) { String value = get(data, column, unescape); if (value.isEmpty()) { - throw new CsvFieldException(parser.getName(), parser.getRowCount(), column.name(), + throw new InputFileException(parser.getName(), parser.getRowCount(), column.name(), new IllegalArgumentException(String.format("Column %s cannot be empty", column.name()))); } return value; @@ -170,7 +170,7 @@ private short getShort(List data, Column column) { try { return Short.parseShort(value); } catch (NumberFormatException e) { - throw new CsvFieldException(parser.getName(), parser.getRowCount(), column.name(), + throw new InputFileException(parser.getName(), parser.getRowCount(), column.name(), new IllegalArgumentException(String.format("failed to parse '%s' as a short value", value))); } } @@ -183,7 +183,7 @@ private Ints getInts(List data, Column column) { } String[] parts = value.split("/"); if (parts.length > Byte.MAX_VALUE) { - throw new CsvFieldException(parser.getName(), parser.getRowCount(), column.name(), + throw new 
InputFileException(parser.getName(), parser.getRowCount(), column.name(), new IllegalArgumentException("int list contained more than 127 entries: " + value)); } Ints result = new Ints(parts.length); @@ -201,7 +201,7 @@ private List getWordRefs(List data, Column column, WordRef.Pars } String[] parts = value.split("/"); if (parts.length > Byte.MAX_VALUE) { - throw new CsvFieldException(parser.getName(), parser.getRowCount(), column.name(), + throw new InputFileException(parser.getName(), parser.getRowCount(), column.name(), new IllegalArgumentException("reference list contained more than 127 entries: " + value)); } List result = new ArrayList<>(parts.length); @@ -209,7 +209,7 @@ private List getWordRefs(List data, Column column, WordRef.Pars try { result.add(refParser.parse(part)); } catch (IllegalArgumentException e) { - throw new CsvFieldException(parser.getName(), parser.getRowCount(), column.name(), e); + throw new InputFileException(parser.getName(), parser.getRowCount(), column.name(), e); } } return result; @@ -222,7 +222,7 @@ private WordRef getWordRef(List data, Column column, WordRef.Parser refP try { ref = refParser.parse(value); } catch (IllegalArgumentException e) { - throw new CsvFieldException(parser.getName(), parser.getRowCount(), column.name(), e); + throw new InputFileException(parser.getName(), parser.getRowCount(), column.name(), e); } // if parsed ref seems to refering current entry, return self-reference (null), @@ -266,7 +266,7 @@ private short getPos(List data) { posStrId = posTable.getId(pos); } if (idColumnExists && strColumnExists && posId != posStrId) { - throw new CsvFieldException(parser.getName(), parser.getRowCount(), "POS", new IllegalArgumentException( + throw new InputFileException(parser.getName(), parser.getRowCount(), "POS", new IllegalArgumentException( String.format("PosId (%d) and id from Pos1-6 (%d) does not match.", posId, posStrId))); } @@ -300,7 +300,7 @@ private RawWordEntry convertEntry(List data) { try { 
entry.validate(); } catch (IllegalArgumentException e) { - throw new CsvFieldException(parser.getName(), parser.getRowCount(), "", e); + throw new InputFileException(parser.getName(), parser.getRowCount(), "", e); } return entry; } From a22c243a3d0c09903f8c42120c4dbbc25e84c92f Mon Sep 17 00:00:00 2001 From: mh-northlander Date: Tue, 6 Aug 2024 09:00:56 +0900 Subject: [PATCH 65/94] fix sample code indent --- .../nlp/sudachi/DefaultInputTextPlugin.java | 2 +- .../nlp/sudachi/EditConnectionCostPlugin.java | 2 +- .../nlp/sudachi/IgnoreYomiganaPlugin.java | 2 +- .../nlp/sudachi/InhibitConnectionPlugin.java | 2 +- .../worksap/nlp/sudachi/InputTextPlugin.java | 2 +- .../nlp/sudachi/JoinKatakanaOovPlugin.java | 2 +- .../nlp/sudachi/JoinNumericPlugin.java | 2 +- .../nlp/sudachi/MeCabOovProviderPlugin.java | 2 +- .../nlp/sudachi/MorphemeFormatterPlugin.java | 2 +- .../nlp/sudachi/OovProviderPlugin.java | 2 +- .../nlp/sudachi/PathRewritePlugin.java | 2 +- .../ProlongedSoundMarkInputTextPlugin.java | 2 +- .../worksap/nlp/sudachi/RegexOovProvider.java | 4 +-- .../com/worksap/nlp/sudachi/Settings.java | 34 +++++++++---------- .../nlp/sudachi/SimpleMorphemeFormatter.java | 2 +- .../nlp/sudachi/SimpleOovProviderPlugin.java | 2 +- .../sudachi/WordSegmentationFormatter.java | 2 +- 17 files changed, 34 insertions(+), 34 deletions(-) diff --git a/src/main/java/com/worksap/nlp/sudachi/DefaultInputTextPlugin.java b/src/main/java/com/worksap/nlp/sudachi/DefaultInputTextPlugin.java index 7c49703c..fc5eb860 100644 --- a/src/main/java/com/worksap/nlp/sudachi/DefaultInputTextPlugin.java +++ b/src/main/java/com/worksap/nlp/sudachi/DefaultInputTextPlugin.java @@ -51,7 +51,7 @@ * { * "class" : "com.worksap.nlp.sudachi.DefaultInputTextPlugin", * "rewriteDef" : "rewrite.def" - * } + * } * } * * diff --git a/src/main/java/com/worksap/nlp/sudachi/EditConnectionCostPlugin.java b/src/main/java/com/worksap/nlp/sudachi/EditConnectionCostPlugin.java index ad4b00a0..61dac1cc 100644 --- 
a/src/main/java/com/worksap/nlp/sudachi/EditConnectionCostPlugin.java +++ b/src/main/java/com/worksap/nlp/sudachi/EditConnectionCostPlugin.java @@ -35,7 +35,7 @@ * { * "class" : "com.worksap.nlp.sudachi.SampleEditConnectionPlugin", * "example" : "example setting" - * } + * } * } * */ diff --git a/src/main/java/com/worksap/nlp/sudachi/IgnoreYomiganaPlugin.java b/src/main/java/com/worksap/nlp/sudachi/IgnoreYomiganaPlugin.java index 40d3c6af..5a63e718 100644 --- a/src/main/java/com/worksap/nlp/sudachi/IgnoreYomiganaPlugin.java +++ b/src/main/java/com/worksap/nlp/sudachi/IgnoreYomiganaPlugin.java @@ -41,7 +41,7 @@ * "leftBrackets": ["(", "("], * "rightBrackets": [")", ")"], * "maxYomiganaLength": 4 - * } + * } * } * * diff --git a/src/main/java/com/worksap/nlp/sudachi/InhibitConnectionPlugin.java b/src/main/java/com/worksap/nlp/sudachi/InhibitConnectionPlugin.java index 23c4495d..54c4b2eb 100644 --- a/src/main/java/com/worksap/nlp/sudachi/InhibitConnectionPlugin.java +++ b/src/main/java/com/worksap/nlp/sudachi/InhibitConnectionPlugin.java @@ -35,7 +35,7 @@ * { * "class" : "com.worksap.nlp.sudachi.InhibitConnectionPlugin", * "inhibitedPair" : [ [ 0, 233 ], [435, 332] ] - * } + * } * } * * diff --git a/src/main/java/com/worksap/nlp/sudachi/InputTextPlugin.java b/src/main/java/com/worksap/nlp/sudachi/InputTextPlugin.java index 609b72a4..d2d8dc4e 100644 --- a/src/main/java/com/worksap/nlp/sudachi/InputTextPlugin.java +++ b/src/main/java/com/worksap/nlp/sudachi/InputTextPlugin.java @@ -34,7 +34,7 @@ * { * "class" : "com.worksap.nlp.sudachi.InputTextPlugin", * "example" : "example setting" - * } + * } * } * */ diff --git a/src/main/java/com/worksap/nlp/sudachi/JoinKatakanaOovPlugin.java b/src/main/java/com/worksap/nlp/sudachi/JoinKatakanaOovPlugin.java index cef00ef1..00706896 100644 --- a/src/main/java/com/worksap/nlp/sudachi/JoinKatakanaOovPlugin.java +++ b/src/main/java/com/worksap/nlp/sudachi/JoinKatakanaOovPlugin.java @@ -39,7 +39,7 @@ * "class" : 
"com.worksap.nlp.sudachi.JoinKatakanaOovPlugin", * "oovPOS" : [ "POS1", "POS2", ... ], * "minLength" : 3 - * } + * } * } * */ diff --git a/src/main/java/com/worksap/nlp/sudachi/JoinNumericPlugin.java b/src/main/java/com/worksap/nlp/sudachi/JoinNumericPlugin.java index 40a24ac2..b0d02610 100644 --- a/src/main/java/com/worksap/nlp/sudachi/JoinNumericPlugin.java +++ b/src/main/java/com/worksap/nlp/sudachi/JoinNumericPlugin.java @@ -36,7 +36,7 @@ * { * "class" : "com.worksap.nlp.sudachi.JoinNumericPlugin", * "enableNormalize" : true, - * } + * } * } * * diff --git a/src/main/java/com/worksap/nlp/sudachi/MeCabOovProviderPlugin.java b/src/main/java/com/worksap/nlp/sudachi/MeCabOovProviderPlugin.java index 59647981..fcb4c1e0 100644 --- a/src/main/java/com/worksap/nlp/sudachi/MeCabOovProviderPlugin.java +++ b/src/main/java/com/worksap/nlp/sudachi/MeCabOovProviderPlugin.java @@ -41,7 +41,7 @@ * "class" : "com.worksap.nlp.sudachi.MeCabOovProviderPlugin", * "charDef" : "char.def", * "unkDef" : "unk.def" - * } + * } * } * * diff --git a/src/main/java/com/worksap/nlp/sudachi/MorphemeFormatterPlugin.java b/src/main/java/com/worksap/nlp/sudachi/MorphemeFormatterPlugin.java index a881183c..c823edfc 100644 --- a/src/main/java/com/worksap/nlp/sudachi/MorphemeFormatterPlugin.java +++ b/src/main/java/com/worksap/nlp/sudachi/MorphemeFormatterPlugin.java @@ -32,7 +32,7 @@ * "class" : "com.worksap.nlp.sudachi.MorphemeFormatterPlugin", * "delimiter" : "\n", * "eos" : "\nEOS\n", - * } + * } * } * * diff --git a/src/main/java/com/worksap/nlp/sudachi/OovProviderPlugin.java b/src/main/java/com/worksap/nlp/sudachi/OovProviderPlugin.java index a74dcf70..717ff94f 100644 --- a/src/main/java/com/worksap/nlp/sudachi/OovProviderPlugin.java +++ b/src/main/java/com/worksap/nlp/sudachi/OovProviderPlugin.java @@ -40,7 +40,7 @@ * { * "class" : "com.worksap.nlp.sudachi.OovProviderPlugin", * "example" : "example setting" - * } + * } * } * */ diff --git 
a/src/main/java/com/worksap/nlp/sudachi/PathRewritePlugin.java b/src/main/java/com/worksap/nlp/sudachi/PathRewritePlugin.java index ae09a34f..e761babb 100644 --- a/src/main/java/com/worksap/nlp/sudachi/PathRewritePlugin.java +++ b/src/main/java/com/worksap/nlp/sudachi/PathRewritePlugin.java @@ -38,7 +38,7 @@ * { * "class" : "com.worksap.nlp.sudachi.PathRewritePlugin", * "example" : "example setting" - * } + * } * } * */ diff --git a/src/main/java/com/worksap/nlp/sudachi/ProlongedSoundMarkInputTextPlugin.java b/src/main/java/com/worksap/nlp/sudachi/ProlongedSoundMarkInputTextPlugin.java index ed4b0b6e..5cab6c53 100644 --- a/src/main/java/com/worksap/nlp/sudachi/ProlongedSoundMarkInputTextPlugin.java +++ b/src/main/java/com/worksap/nlp/sudachi/ProlongedSoundMarkInputTextPlugin.java @@ -44,7 +44,7 @@ * "class" : "com.worksap.nlp.sudachi.ProlongedSoundMarkInputTextPlugin", "prolongedSoundMarks": ["ー", "〜", "〰"], "replacementSymbol": "ー" - * } + * } * } * * diff --git a/src/main/java/com/worksap/nlp/sudachi/RegexOovProvider.java b/src/main/java/com/worksap/nlp/sudachi/RegexOovProvider.java index 612a5972..cec83eba 100644 --- a/src/main/java/com/worksap/nlp/sudachi/RegexOovProvider.java +++ b/src/main/java/com/worksap/nlp/sudachi/RegexOovProvider.java @@ -34,7 +34,7 @@ * *

  * {@code
- *  {
+ *   {
  *      "class": "com.worksap.nlp.sudachi.RegexOovProvider",
  *      "regex": "[0-9a-z-]+",
  *      "oovPOS": [ "補助記号", "一般", "*", "*", "*", "*" ],
@@ -43,7 +43,7 @@
  *      "cost": 5000,
  *      "maxLength": 32,
  *      "boundaries": "relaxed"
- * }
+ *   }
  * }
  * 
* diff --git a/src/main/java/com/worksap/nlp/sudachi/Settings.java b/src/main/java/com/worksap/nlp/sudachi/Settings.java index 94c6b014..a645c128 100644 --- a/src/main/java/com/worksap/nlp/sudachi/Settings.java +++ b/src/main/java/com/worksap/nlp/sudachi/Settings.java @@ -52,23 +52,23 @@ * "systemDict" : "system.dic", * "characterDefinitionFile" : "char.def", * "inputTextPlugin" : [ - * { "class" : "com.worksap.nlp.sudachi.DefaultInputTextPlugin" } - * ], - * "oovProviderPlugin" : [ - * { - * "class" : "com.worksap.nlp.sudachi.MeCabOovProviderPlugin", - * "charDef" : "char.def", - * "unkDef" : "unk.def" - * }, - * { - * "class" : "com.worksap.nlp.sudachi.SimpleOovProviderPlugin", - * "oovPOSStrings" : [ "補助記号", "一般", "*", "*", "*", "*" ], - * "leftId" : 5968, - * "rightId" : 5968, - * "cost" : 3857 - * } - * ] - * } + * { "class" : "com.worksap.nlp.sudachi.DefaultInputTextPlugin" } + * ], + * "oovProviderPlugin" : [ + * { + * "class" : "com.worksap.nlp.sudachi.MeCabOovProviderPlugin", + * "charDef" : "char.def", + * "unkDef" : "unk.def" + * }, + * { + * "class" : "com.worksap.nlp.sudachi.SimpleOovProviderPlugin", + * "oovPOSStrings" : [ "補助記号", "一般", "*", "*", "*", "*" ], + * "leftId" : 5968, + * "rightId" : 5968, + * "cost" : 3857 + * } + * ] + * } * } * *

diff --git a/src/main/java/com/worksap/nlp/sudachi/SimpleMorphemeFormatter.java b/src/main/java/com/worksap/nlp/sudachi/SimpleMorphemeFormatter.java index 99bcbbf7..e9ac9958 100644 --- a/src/main/java/com/worksap/nlp/sudachi/SimpleMorphemeFormatter.java +++ b/src/main/java/com/worksap/nlp/sudachi/SimpleMorphemeFormatter.java @@ -32,7 +32,7 @@ * "delimiter" : "\n", * "eos" : "\nEOS\n", * "columnDelimiter" : "\t" - * } + * } * } * * diff --git a/src/main/java/com/worksap/nlp/sudachi/SimpleOovProviderPlugin.java b/src/main/java/com/worksap/nlp/sudachi/SimpleOovProviderPlugin.java index 425c4474..ad535724 100644 --- a/src/main/java/com/worksap/nlp/sudachi/SimpleOovProviderPlugin.java +++ b/src/main/java/com/worksap/nlp/sudachi/SimpleOovProviderPlugin.java @@ -37,7 +37,7 @@ * "leftId" : 5968, * "rigthId" : 5968, * "cost" : 3857 - * } + * } * } * * diff --git a/src/main/java/com/worksap/nlp/sudachi/WordSegmentationFormatter.java b/src/main/java/com/worksap/nlp/sudachi/WordSegmentationFormatter.java index 74c4dc48..78545e8d 100644 --- a/src/main/java/com/worksap/nlp/sudachi/WordSegmentationFormatter.java +++ b/src/main/java/com/worksap/nlp/sudachi/WordSegmentationFormatter.java @@ -32,7 +32,7 @@ * "class" : "com.worksap.nlp.sudachi.SurfaceFormatter", * "delimiter" : " ", * "eos" : "\n", - * } + * } * } * * From 35b7cd43f0c20d9113fa9988252ee33ff1e6a953 Mon Sep 17 00:00:00 2001 From: mh-northlander Date: Tue, 6 Aug 2024 09:42:27 +0900 Subject: [PATCH 66/94] remove InMemorychannel --- .../nlp/sudachi/dictionary/DescriptionTest.kt | 4 +- .../dictionary/build/InMemoryChannel.java | 104 ------------------ .../sudachi/dictionary/build/MemChannel.kt | 19 ++-- .../dictionary/build/StringLayoutTest.kt | 6 +- 4 files changed, 17 insertions(+), 116 deletions(-) delete mode 100644 src/test/java/com/worksap/nlp/sudachi/dictionary/build/InMemoryChannel.java diff --git a/src/test/java/com/worksap/nlp/sudachi/dictionary/DescriptionTest.kt 
b/src/test/java/com/worksap/nlp/sudachi/dictionary/DescriptionTest.kt index 5df90c1a..33b1d1a7 100644 --- a/src/test/java/com/worksap/nlp/sudachi/dictionary/DescriptionTest.kt +++ b/src/test/java/com/worksap/nlp/sudachi/dictionary/DescriptionTest.kt @@ -17,7 +17,7 @@ package com.worksap.nlp.sudachi.dictionary import com.worksap.nlp.sudachi.TestDictionary -import com.worksap.nlp.sudachi.dictionary.build.InMemoryChannel +import com.worksap.nlp.sudachi.dictionary.build.MemChannel import kotlin.test.Test import kotlin.test.assertEquals @@ -28,7 +28,7 @@ class DescriptionTest { d.blocks = listOf(Description.BlockInfo("test", 5, 15), Description.BlockInfo("test2", 30, 25)) d.reference = "testref" d.comment = "コメント" - val chan = InMemoryChannel(4096) + val chan = MemChannel(4096) d.save(chan) chan.position(0) val d2 = Description.load(chan) diff --git a/src/test/java/com/worksap/nlp/sudachi/dictionary/build/InMemoryChannel.java b/src/test/java/com/worksap/nlp/sudachi/dictionary/build/InMemoryChannel.java deleted file mode 100644 index 1426e74b..00000000 --- a/src/test/java/com/worksap/nlp/sudachi/dictionary/build/InMemoryChannel.java +++ /dev/null @@ -1,104 +0,0 @@ -/* - * Copyright (c) 2022 Works Applications Co., Ltd. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package com.worksap.nlp.sudachi.dictionary.build; - -import java.io.IOException; -import java.nio.ByteBuffer; -import java.nio.ByteOrder; -import java.nio.channels.SeekableByteChannel; - -public final class InMemoryChannel implements SeekableByteChannel { - private ByteBuffer buffer; - - public InMemoryChannel() { - this(1024 * 1024); - } - - public InMemoryChannel(int size) { - buffer = ByteBuffer.allocate(size); - buffer.order(ByteOrder.LITTLE_ENDIAN); - } - - public void reserve(int needed) { - if (buffer.remaining() < needed) { - ByteBuffer old = buffer; - buffer = ByteBuffer.allocate(buffer.capacity() * 2); - buffer.order(ByteOrder.LITTLE_ENDIAN); - old.flip(); - buffer.put(old); - } - } - - @Override - public int read(ByteBuffer dst) throws IOException { - ByteBuffer src = buffer; - int position = src.position(); - dst.put(src); - int newPosition = src.position(); - return newPosition - position; - } - - @Override - public int write(ByteBuffer src) throws IOException { - reserve(src.remaining()); - int pos = buffer.position(); - buffer.put(src); - return buffer.position() - pos; - } - - @Override - public long position() throws IOException { - return buffer.position(); - } - - @Override - public SeekableByteChannel position(long newPosition) throws IOException { - assert newPosition < Integer.MAX_VALUE; - buffer.position((int) newPosition); - return this; - } - - @Override - public long size() throws IOException { - return buffer.limit(); - } - - @Override - public SeekableByteChannel truncate(long size) { - assert size < Integer.MAX_VALUE; - buffer.limit((int) size); - return this; - } - - @Override - public boolean isOpen() { - return true; - } - - @Override - public void close() throws IOException { - // always open - } - - public ByteBuffer buffer() { - ByteBuffer copy = buffer.duplicate(); - copy.position(0); - copy.limit(buffer.position()); - copy.order(ByteOrder.LITTLE_ENDIAN); - return copy; - } -} diff --git 
a/src/test/java/com/worksap/nlp/sudachi/dictionary/build/MemChannel.kt b/src/test/java/com/worksap/nlp/sudachi/dictionary/build/MemChannel.kt index 2ea911f0..e9195bb7 100644 --- a/src/test/java/com/worksap/nlp/sudachi/dictionary/build/MemChannel.kt +++ b/src/test/java/com/worksap/nlp/sudachi/dictionary/build/MemChannel.kt @@ -23,28 +23,33 @@ import java.nio.file.Files import java.nio.file.Path import java.nio.file.StandardOpenOption -class MemChannel : SeekableByteChannel { - private var buffer: ByteBuffer = ByteBuffer.allocate(1024 * 1024) +class MemChannel(bufSize: Int = 1024 * 1024) : SeekableByteChannel { + private var buffer: ByteBuffer = ByteBuffer.allocate(bufSize) private var size = 0L init { buffer.order(ByteOrder.LITTLE_ENDIAN) } + // always open override fun close() {} override fun isOpen(): Boolean { return true } - override fun read(p0: ByteBuffer?): Int { - throw UnsupportedOperationException() + override fun read(dst: ByteBuffer?): Int { + val src = buffer + val position = src.position() + dst!!.put(src) + val newPosition = src.position() + return newPosition - position } - override fun write(p0: ByteBuffer?): Int { - val remaining = p0!!.remaining() + override fun write(src: ByteBuffer?): Int { + val remaining = src!!.remaining() reserve(remaining) - buffer.put(p0) + buffer.put(src) val pos = buffer.position().toLong() if (pos > size) { size = pos diff --git a/src/test/java/com/worksap/nlp/sudachi/dictionary/build/StringLayoutTest.kt b/src/test/java/com/worksap/nlp/sudachi/dictionary/build/StringLayoutTest.kt index 86d8dbde..e1a31159 100644 --- a/src/test/java/com/worksap/nlp/sudachi/dictionary/build/StringLayoutTest.kt +++ b/src/test/java/com/worksap/nlp/sudachi/dictionary/build/StringLayoutTest.kt @@ -39,7 +39,7 @@ class StringLayoutTest { val p4 = layout.add("3".repeat(4)) val p5 = layout.add("4".repeat(1)) val p6 = layout.add("5".repeat(2)) - val chan = InMemoryChannel() + val chan = MemChannel() layout.write(chan) val chars = 
chan.buffer().asCharBuffer() assertEquals("0".repeat(25), chars.read(p1)) @@ -62,7 +62,7 @@ class StringLayoutTest { val str = char.toChar().toString().repeat(char) ptrs.add(layout.add(str)) } - val chan = InMemoryChannel() + val chan = MemChannel() layout.write(chan) val chars = chan.buffer().asCharBuffer() for (i in 0..499) { @@ -87,7 +87,7 @@ class StringLayoutTest { val str = (20 + i).toChar().toString().repeat(count) ptrs.add(layout.add(str)) } - val chan = InMemoryChannel() + val chan = MemChannel() layout.write(chan) val chars = chan.buffer().asCharBuffer() for (i in 0..3) { From 2ebe2d7b0681a8d801d76ae7e988a9279eb9f773 Mon Sep 17 00:00:00 2001 From: mh-northlander Date: Tue, 6 Aug 2024 13:29:16 +0900 Subject: [PATCH 67/94] fix lexicon pos column behaviour --- .../sudachi/dictionary/build/POSTable.java | 10 +++++-- .../dictionary/build/RawLexiconReader.java | 28 +++++++++++++++---- 2 files changed, 30 insertions(+), 8 deletions(-) diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/POSTable.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/POSTable.java index f093982c..153d6632 100644 --- a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/POSTable.java +++ b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/POSTable.java @@ -64,6 +64,11 @@ List getList() { return table; } + /** @return number of all POSs in the table. */ + int size() { + return table.size(); + } + /** * @return number of non-builtin POSs. */ @@ -90,9 +95,9 @@ public int preloadFrom(Grammar grammar) { } /** - * Load pos table from the text format. + * Load pos table from the text. Assume 6-column csv without header. * - * Assume 6-column csv without header. + * After load, set allowNewPos false and inhibit adding new POS. * * @param data * @return number read. 
@@ -121,6 +126,7 @@ public int readEntries(InputStream data) throws IOException { } numLines += 1; } + allowNewPos = false; return numLines; } diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/RawLexiconReader.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/RawLexiconReader.java index 525f4da0..5ed4c016 100644 --- a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/RawLexiconReader.java +++ b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/RawLexiconReader.java @@ -255,22 +255,38 @@ private short getPos(List data) { short posId = -1; short posStrId = -1; - if (idColumnExists) { + if (idColumnExists && (!strColumnExists || !get(data, Column.POS_ID, false).isEmpty())) { + // if both id/parts exist, allow empty (-1) posId = getShort(data, Column.POS_ID); + + if (posId >= posTable.size()) { + throw new InputFileException(parser.getName(), parser.getRowCount(), "POS", + new IllegalArgumentException( + String.format("POS for id %d is not present in the table.", posId))); + } } - if (strColumnExists) { + if (strColumnExists && (!idColumnExists || !get(data, Column.POS1, false).isEmpty())) { + // if both id/parts exist, allow empty (-1) POS pos = new POS( // comment for line break get(data, Column.POS1, true), get(data, Column.POS2, true), get(data, Column.POS3, true), get(data, Column.POS4, true), get(data, Column.POS5, true), get(data, Column.POS6, true)); posStrId = posTable.getId(pos); } - if (idColumnExists && strColumnExists && posId != posStrId) { - throw new InputFileException(parser.getName(), parser.getRowCount(), "POS", new IllegalArgumentException( - String.format("PosId (%d) and id from Pos1-6 (%d) does not match.", posId, posStrId))); + + if (idColumnExists && strColumnExists) { + if (posId < 0 && posStrId < 0) { + throw new InputFileException(parser.getName(), parser.getRowCount(), "POS", + new IllegalArgumentException("Both PosId and Pos1-6 are empty.")); + } + if (posId >= 0 && posStrId >= 0 && posId != posStrId) 
{ + throw new InputFileException(parser.getName(), parser.getRowCount(), "POS", + new IllegalArgumentException( + String.format("PosId (%d) and id from Pos1-6 (%d) does not match.", posId, posStrId))); + } } - return idColumnExists ? posId : posStrId; + return posId >= 0 ? posId : posStrId; } /** convert csv row to RawWordEntry */ From a413a98022f7d854d23b24ebb5b5cde08ac1282c Mon Sep 17 00:00:00 2001 From: mh-northlander Date: Tue, 6 Aug 2024 18:26:34 +0900 Subject: [PATCH 68/94] add tests --- .../com/worksap/nlp/sudachi/StringUtilTest.kt | 13 ++ .../dictionary/DictionaryBuilderTest.java | 112 ----------- .../dictionary/DictionaryBuilderTest.kt | 185 ++++++++++++++++++ .../DictionaryGrammarPrinterTest.kt | 47 +++++ .../worksap/nlp/sudachi/dictionary/POSTest.kt | 35 ++++ .../nlp/sudachi/dictionary/StringPtrTest.kt | 7 + .../sudachi/dictionary/build/GrammarTest.kt | 46 +---- .../sudachi/dictionary/build/MemChannel.kt | 14 +- .../sudachi/dictionary/build/POSTableTest.kt | 64 ++++++ .../dictionary/build/RawLexiconReaderTest.kt | 85 +++++++- .../sudachi/dictionary/build/headers-all.csv | 3 +- .../dictionary/build/headers-minimum.csv | 3 + src/test/resources/dict/pos.csv | 8 + 13 files changed, 463 insertions(+), 159 deletions(-) delete mode 100644 src/test/java/com/worksap/nlp/sudachi/dictionary/DictionaryBuilderTest.java create mode 100644 src/test/java/com/worksap/nlp/sudachi/dictionary/DictionaryBuilderTest.kt create mode 100644 src/test/java/com/worksap/nlp/sudachi/dictionary/DictionaryGrammarPrinterTest.kt create mode 100644 src/test/java/com/worksap/nlp/sudachi/dictionary/POSTest.kt create mode 100644 src/test/java/com/worksap/nlp/sudachi/dictionary/build/POSTableTest.kt create mode 100644 src/test/resources/com/worksap/nlp/sudachi/dictionary/build/headers-minimum.csv create mode 100644 src/test/resources/dict/pos.csv diff --git a/src/test/java/com/worksap/nlp/sudachi/StringUtilTest.kt b/src/test/java/com/worksap/nlp/sudachi/StringUtilTest.kt index 
8c05fa72..6951b2e9 100644 --- a/src/test/java/com/worksap/nlp/sudachi/StringUtilTest.kt +++ b/src/test/java/com/worksap/nlp/sudachi/StringUtilTest.kt @@ -16,6 +16,8 @@ package com.worksap.nlp.sudachi +import com.worksap.nlp.sudachi.dictionary.build.BufWriter +import java.nio.ByteBuffer import kotlin.random.Random import kotlin.test.Test import kotlin.test.assertContentEquals @@ -73,4 +75,15 @@ class StringUtilTest { assertFailsWith { StringUtil.countUtf8Bytes("test", 0, 6) } assertFailsWith { StringUtil.countUtf8Bytes("test", 6, 0) } } + + @Test + fun readLengthPrefixed() { + val bb = ByteBuffer.allocate(32) + val w = BufWriter(bb) + + val text = "test" + w.putShortString(text) + bb.flip() + assertEquals(text, StringUtil.readLengthPrefixed(bb)) + } } diff --git a/src/test/java/com/worksap/nlp/sudachi/dictionary/DictionaryBuilderTest.java b/src/test/java/com/worksap/nlp/sudachi/dictionary/DictionaryBuilderTest.java deleted file mode 100644 index be52c7e3..00000000 --- a/src/test/java/com/worksap/nlp/sudachi/dictionary/DictionaryBuilderTest.java +++ /dev/null @@ -1,112 +0,0 @@ -/* - * Copyright (c) 2021 Works Applications Co., Ltd. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package com.worksap.nlp.sudachi.dictionary; - -import static org.hamcrest.CoreMatchers.is; -import static org.hamcrest.Matchers.contains; -import static org.hamcrest.MatcherAssert.assertThat; -import static org.junit.Assert.assertFalse; -import static org.junit.Assert.assertTrue; - -import java.io.File; -import java.io.FileWriter; -import java.io.IOException; -import java.nio.charset.StandardCharsets; -import java.util.Iterator; - -import org.junit.Rule; -import org.junit.Test; -import org.junit.rules.TemporaryFolder; - -import com.worksap.nlp.sudachi.WordId; - -public class DictionaryBuilderTest { - - @Rule - public TemporaryFolder temporaryFolder = new TemporaryFolder(); - - @Test - public void commandLine() throws IOException { - File outputFile = temporaryFolder.newFile(); - File matrixFile = temporaryFolder.newFile(); - File inputFile = temporaryFolder.newFile(); - - try (FileWriter writer = new FileWriter(matrixFile)) { - writer.write("1 1\n0 0 200\n"); - } - - try (FileWriter writer = new FileWriter(inputFile)) { - writer.write("東京都,0,0,0,東京都,名詞,固有名詞,地名,一般,*,*,ヒガシキョウト,東京都,*,B,\"東,名詞,普通名詞,一般,*,*,*,ヒガシ/2\",*,1/2,1/2\n"); - writer.write("東,-1,-1,0,東,名詞,普通名詞,一般,*,*,*,ヒガシ,ひがし,*,A,*,*,*,*\n"); - writer.write("京都,0,0,0,京都,名詞,固有名詞,地名,一般,*,*,キョウト,京都,*,A,*,*,*,*\n"); - } - int[] wordIds = { 4, 11, 15, 19 }; // 3 + phantom entry (ひがし) - - DictionaryBuilder.main(new String[] { "-o", outputFile.getPath(), "-m", matrixFile.getPath(), "-d", "test", - inputFile.getPath() }); - - try (BinaryDictionary dictionary = new BinaryDictionary(outputFile.getPath())) { - Description header = dictionary.getDictionaryHeader(); - assertTrue(header.isSystemDictionary()); - assertThat(header.getComment(), is("test")); - - Grammar grammar = dictionary.getGrammar(); - assertThat(grammar.getPartOfSpeechSize(), is(2)); - assertThat(grammar.getPartOfSpeechString((short) 0), contains("名詞", "固有名詞", "地名", "一般", "*", "*")); - assertThat(grammar.getPartOfSpeechString((short) 1), 
contains("名詞", "普通名詞", "一般", "*", "*", "*")); - assertThat(grammar.getConnectCost((short) 0, (short) 0), is((short) 200)); - - Lexicon lexicon = dictionary.getLexicon(); - assertThat(lexicon.size(), is(3)); - - // first entry - int wordId = wordIds[0]; - long params = lexicon.parameters(wordId); - assertThat(WordParameters.leftId(params), is((short) 0)); - assertThat(WordParameters.cost(params), is((short) 0)); - WordInfo info = lexicon.getWordInfo(wordId); - assertThat(lexicon.string(0, info.getSurface()), is("東京都")); - assertThat(info.getNormalizedForm(), is(WordId.make(0, wordId))); - assertThat(info.getDictionaryForm(), is(WordId.make(0, wordId))); - assertThat(lexicon.string(0, info.getReadingForm()), is("ヒガシキョウト")); - assertThat(info.getPOSId(), is((short) 0)); - assertThat(info.getAunitSplit(), is(new int[] { wordIds[1], wordIds[2] })); - assertThat(info.getBunitSplit().length, is(0)); - assertThat(info.getSynonymGroupIds(), is(new int[] { 1, 2 })); - Iterator i = lexicon.lookup("東京都".getBytes(StandardCharsets.UTF_8), 0); - assertTrue(i.hasNext()); - assertThat(i.next(), is(new int[] { wordId, "東京都".getBytes(StandardCharsets.UTF_8).length })); - assertFalse(i.hasNext()); - - // second entry - wordId = wordIds[1]; - params = lexicon.parameters(wordId); - assertThat(WordParameters.leftId(params), is((short) -1)); - assertThat(WordParameters.cost(params), is((short) 0)); - info = lexicon.getWordInfo(wordId); - assertThat(lexicon.string(0, info.getSurface()), is("東")); - assertThat(info.getNormalizedForm(), is(WordId.make(0, wordIds[3]))); - assertThat(info.getDictionaryForm(), is(WordId.make(0, wordId))); - assertThat(lexicon.string(0, info.getReadingForm()), is("ヒガシ")); - assertThat(info.getPOSId(), is((short) 1)); - assertThat(info.getAunitSplit().length, is(0)); - assertThat(info.getBunitSplit().length, is(0)); - i = lexicon.lookup("東".getBytes(StandardCharsets.UTF_8), 0); - assertFalse(i.hasNext()); - } - } -} \ No newline at end of file diff --git 
a/src/test/java/com/worksap/nlp/sudachi/dictionary/DictionaryBuilderTest.kt b/src/test/java/com/worksap/nlp/sudachi/dictionary/DictionaryBuilderTest.kt new file mode 100644 index 00000000..35e81eee --- /dev/null +++ b/src/test/java/com/worksap/nlp/sudachi/dictionary/DictionaryBuilderTest.kt @@ -0,0 +1,185 @@ +/* + * Copyright (c) 2022 Works Applications Co., Ltd. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.worksap.nlp.sudachi.dictionary + +import com.worksap.nlp.sudachi.WordId +import java.nio.file.Path +import kotlin.io.path.createTempDirectory +import kotlin.test.BeforeTest +import kotlin.test.Test +import kotlin.test.assertEquals +import kotlin.test.assertFalse +import kotlin.test.assertTrue + +class DictionaryBuilderTest { + lateinit var tempDir: Path + + @BeforeTest + fun setup() { + tempDir = createTempDirectory() + // TestDictionary.systemDictData.writeData(tempDir.resolve("system.dic")) + // TestDictionary.userDict1Data.writeData(tempDir.resolve("user.dic")) + // Utils.copyResource(tempDir, "/unk.def") + } + + @Test + fun buildSystemDictCommandline() { + // build and load + val outputFile = tempDir.resolve("test.dic") + val matrixFile = tempDir.resolve("matrix.def") + val inputFile = tempDir.resolve("lex.csv") + + matrixFile.toFile().writeText("1 1\n0 0 200\n") + inputFile + .toFile() + .writeText( + """東京都,0,0,100,東京都,名詞,固有名詞,地名,一般,*,*,ヒガシキョウト,東京都,*,B,"東,名詞,普通名詞,一般,*,*,*,ヒガシ/2",*,1/2,1/2 
+東,-1,-1,200,東,名詞,普通名詞,一般,*,*,*,ヒガシ,ひがし,*,A,*,*,*,* +京都,0,0,300,京都,名詞,固有名詞,地名,一般,*,*,キョウト,京都,*,A,*,*,*,*""") + val wordIds = listOf(4, 11, 15, 19) // 3 + phantom entry (ひがし) + + DictionaryBuilder.main( + arrayOf( + "-o", + outputFile.toString(), + "-m", + matrixFile.toString(), + "-d", + "test", + inputFile.toString())) + + val dictionary = BinaryDictionary(outputFile.toString()) + + // header + val header = dictionary.getDictionaryHeader() + assertTrue(header.isSystemDictionary()) + assertEquals("test", header.getComment()) + + // grammar + val grammar = dictionary.getGrammar() + assertEquals(2, grammar.getPartOfSpeechSize()) + assertEquals(POS("名詞", "固有名詞", "地名", "一般", "*", "*"), grammar.getPartOfSpeechString(0)) + assertEquals(POS("名詞", "普通名詞", "一般", "*", "*", "*"), grammar.getPartOfSpeechString(1)) + assertEquals(200, grammar.getConnectCost(0, 0)) + + // lexicon + val lexicon = dictionary.getLexicon() + assertEquals(3, lexicon.size()) + + // first entry + var wordId = wordIds[0] + var params = lexicon.parameters(wordId) + assertEquals(0, WordParameters.leftId(params)) + assertEquals(100, WordParameters.cost(params)) + var wi = lexicon.getWordInfo(wordId) + assertEquals("東京都", lexicon.string(0, wi.getSurface())) + assertEquals("ヒガシキョウト", lexicon.string(0, wi.getReadingForm())) + assertEquals(WordId.make(0, wordId), wi.getNormalizedForm()) + assertEquals(WordId.make(0, wordId), wi.getDictionaryForm()) + assertEquals(0, wi.getPOSId()) + assertEquals(listOf(wordIds[1], wordIds[2]), wi.getAunitSplit().toList()) + assertEquals(0, wi.getBunitSplit().size) + assertEquals(listOf(1, 2), wi.getSynonymGroupIds().toList()) + var bs = "東京都".toByteArray() + var itr = lexicon.lookup(bs, 0) + assertTrue(itr.hasNext()) + assertEquals(listOf(wordId, bs.size), itr.next().toList()) + assertFalse(itr.hasNext()) + + // second entry + wordId = wordIds[1] + params = lexicon.parameters(wordId) + assertEquals(-1, WordParameters.leftId(params)) + assertEquals(200, 
WordParameters.cost(params)) + wi = lexicon.getWordInfo(wordId) + assertEquals("東", lexicon.string(0, wi.getSurface())) + assertEquals("ヒガシ", lexicon.string(0, wi.getReadingForm())) + assertEquals(WordId.make(0, wordIds[3]), wi.getNormalizedForm()) + assertEquals(WordId.make(0, wordId), wi.getDictionaryForm()) + assertEquals(1, wi.getPOSId()) + assertEquals(0, wi.getAunitSplit().size) + assertEquals(0, wi.getBunitSplit().size) + assertEquals(0, wi.getSynonymGroupIds().size) + itr = lexicon.lookup("東".toByteArray(), 0) + assertFalse(itr.hasNext()) + } + + @Test + fun buildSystemDictCommandlineWithPos() { + // build and load + val outputFile = tempDir.resolve("test.dic") + val matrixFile = tempDir.resolve("matrix.def") + val posFile = tempDir.resolve("pos.csv") + val inputFile = tempDir.resolve("lex.csv") + + matrixFile.toFile().writeText("1 1\n0 0 200\n") + posFile.toFile().writeText("名詞,普通名詞,一般,*,*,*\n名詞,固有名詞,地名,一般,*,*\n") + inputFile + .toFile() + .writeText( + """Surface,leftId,rightId,cost,writing,posId,readingform,normalizedform,dictionaryform,mode,splitA,splitB,wordstructure,synonymgroups +東京都,0,0,100,東京都,1,ヒガシキョウト,東京都,,B,"東,名詞,普通名詞,一般,*,*,*,ヒガシ/2",,1/2,1/2 +東,-1,-1,200,東,0,ヒガシ,ひがし,,A,,,, +京都,0,0,300,京都,1,キョウト,京都,,A,,,,""") + val wordIds = listOf(4, 11, 15, 19) // 3 + phantom entry (ひがし) + + DictionaryBuilder.main( + arrayOf( + "-o", + outputFile.toString(), + "-m", + matrixFile.toString(), + "-p", + posFile.toString(), + "-d", + "test", + inputFile.toString())) + + val dictionary = BinaryDictionary(outputFile.toString()) + + // grammar + val grammar = dictionary.getGrammar() + assertEquals(2, grammar.getPartOfSpeechSize()) + assertEquals(POS("名詞", "普通名詞", "一般", "*", "*", "*"), grammar.getPartOfSpeechString(0)) + assertEquals(POS("名詞", "固有名詞", "地名", "一般", "*", "*"), grammar.getPartOfSpeechString(1)) + assertEquals(200, grammar.getConnectCost(0, 0)) + + // lexicon + val lexicon = dictionary.getLexicon() + assertEquals(3, lexicon.size()) + + // first entry + var 
wordId = wordIds[0] + var params = lexicon.parameters(wordId) + assertEquals(0, WordParameters.leftId(params)) + assertEquals(100, WordParameters.cost(params)) + var wi = lexicon.getWordInfo(wordId) + assertEquals("東京都", lexicon.string(0, wi.getSurface())) + assertEquals("ヒガシキョウト", lexicon.string(0, wi.getReadingForm())) + assertEquals(WordId.make(0, wordId), wi.getNormalizedForm()) + assertEquals(WordId.make(0, wordId), wi.getDictionaryForm()) + assertEquals(1, wi.getPOSId()) + assertEquals(listOf(wordIds[1], wordIds[2]), wi.getAunitSplit().toList()) + assertEquals(0, wi.getBunitSplit().size) + assertEquals(listOf(1, 2), wi.getSynonymGroupIds().toList()) + var bs = "東京都".toByteArray() + var itr = lexicon.lookup(bs, 0) + assertTrue(itr.hasNext()) + assertEquals(listOf(wordId, bs.size), itr.next().toList()) + assertFalse(itr.hasNext()) + } +} diff --git a/src/test/java/com/worksap/nlp/sudachi/dictionary/DictionaryGrammarPrinterTest.kt b/src/test/java/com/worksap/nlp/sudachi/dictionary/DictionaryGrammarPrinterTest.kt new file mode 100644 index 00000000..77021a00 --- /dev/null +++ b/src/test/java/com/worksap/nlp/sudachi/dictionary/DictionaryGrammarPrinterTest.kt @@ -0,0 +1,47 @@ +/* + * Copyright (c) 2024 Works Applications Co., Ltd. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package com.worksap.nlp.sudachi.dictionary + +import com.worksap.nlp.sudachi.TestDictionary +import java.io.ByteArrayOutputStream +import java.io.PrintStream +import kotlin.test.Test +import kotlin.test.assertEquals + +class DictionaryGrammarPrinterTest { + @Test + fun printSystemPOSs() { + val grammar = TestDictionary.systemDict.getGrammar() + val output = ByteArrayOutputStream() + val ps = PrintStream(output) + DictionaryGrammarPrinter.printPos(grammar, ps) + val lines = output.toString().split(System.lineSeparator()) + + assertEquals(8 + 1, lines.size) // system 8 + last newline + } + + @Test + fun printUserPOSs() { + val grammar = TestDictionary.userDict1.getGrammar() + val output = ByteArrayOutputStream() + val ps = PrintStream(output) + DictionaryGrammarPrinter.printPos(grammar, ps) + val lines = output.toString().split(System.lineSeparator()) + + assertEquals(1 + 1, lines.size) // user 1 + last newline + } +} diff --git a/src/test/java/com/worksap/nlp/sudachi/dictionary/POSTest.kt b/src/test/java/com/worksap/nlp/sudachi/dictionary/POSTest.kt new file mode 100644 index 00000000..2d9a5866 --- /dev/null +++ b/src/test/java/com/worksap/nlp/sudachi/dictionary/POSTest.kt @@ -0,0 +1,35 @@ +/* + * Copyright (c) 2022 Works Applications Co., Ltd. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package com.worksap.nlp.sudachi.dictionary + +import kotlin.test.assertFails +import org.junit.Test + +class POSTest { + @Test + fun invalidPos() { + assertFails { POS() } + assertFails { POS("1") } + assertFails { POS("1", "2") } + assertFails { POS("1", "2", "3") } + assertFails { POS("1", "2", "3", "4") } + assertFails { POS("1", "2", "3", "4", "5") } + assertFails { POS("1", "2", "3", "4", "5", null) } + assertFails { POS("1", "2", "3", "4", "5", "6", "7") } + assertFails { POS("1", "2", "3", "4", "5", "6".repeat(POS.MAX_COMPONENT_LENGTH + 1)) } + } +} diff --git a/src/test/java/com/worksap/nlp/sudachi/dictionary/StringPtrTest.kt b/src/test/java/com/worksap/nlp/sudachi/dictionary/StringPtrTest.kt index bda91a39..17a0113c 100644 --- a/src/test/java/com/worksap/nlp/sudachi/dictionary/StringPtrTest.kt +++ b/src/test/java/com/worksap/nlp/sudachi/dictionary/StringPtrTest.kt @@ -17,6 +17,7 @@ package com.worksap.nlp.sudachi.dictionary import kotlin.test.assertEquals +import kotlin.test.assertFails import kotlin.test.assertFalse import kotlin.test.assertTrue import org.junit.Test @@ -103,4 +104,10 @@ class StringPtrTest { assertFalse { StringPtr.isValid(2, 23) } assertTrue { StringPtr.isValid(4, 23) } } + + @Test + fun checkedFails() { + assertFails { StringPtr.checked(StringPtr.MAX_LENGTH + 1, 0) } + assertFails { StringPtr.checked(19 + 16, 1) } + } } diff --git a/src/test/java/com/worksap/nlp/sudachi/dictionary/build/GrammarTest.kt b/src/test/java/com/worksap/nlp/sudachi/dictionary/build/GrammarTest.kt index 7a630887..bcb80280 100644 --- a/src/test/java/com/worksap/nlp/sudachi/dictionary/build/GrammarTest.kt +++ b/src/test/java/com/worksap/nlp/sudachi/dictionary/build/GrammarTest.kt @@ -21,77 +21,49 @@ import com.worksap.nlp.sudachi.dictionary.Description import com.worksap.nlp.sudachi.dictionary.GrammarImpl import com.worksap.nlp.sudachi.dictionary.POS import kotlin.test.assertEquals -import kotlin.test.assertFails import org.junit.Test +// Grammar 
(ConnectionMatrix + POSTable) build test class GrammarTest { @Test fun singlePos() { val cm = ConnectionMatrix() Res("test.matrix") { cm.readEntries(it) } + val pos = POSTable() assertEquals(0, pos.getId(POS("a", "b", "c", "d", "e", "f"))) + val outbuf = MemChannel() val layout = BlockLayout(outbuf) layout.block(Block.POS_TABLE, pos::compile) layout.block(Block.CONNECTION_MATRIX, cm::compile) val description = Description() description.setBlocks(layout.blocks()) + val grammar = GrammarImpl.load(outbuf.buffer(), description) - // val out = ModelOutput(outbuf) - // pos.writeTo(out) - // cm.writeTo(out) - // val gram = GrammarImpl(outbuf.buffer(), 0) assertEquals(grammar.getPartOfSpeechString(0), POS("a", "b", "c", "d", "e", "f")) } - @Test - fun failPosData() { - val posTable = POSTable() - repeat(Short.MAX_VALUE.toInt()) { - val pos = POS("a", "b", "c", "d", "e", it.toString()) - assertEquals(posTable.getId(pos), it.toShort()) - } - assertFails { posTable.getId(POS("a", "a", "a", "a", "a", "a")) } - } - - @Test - fun invalidPos() { - assertFails { POS() } - assertFails { POS("1") } - assertFails { POS("1", "2") } - assertFails { POS("1", "2", "3") } - assertFails { POS("1", "2", "3", "4") } - assertFails { POS("1", "2", "3", "4", "5") } - assertFails { POS("1", "2", "3", "4", "5", null) } - assertFails { POS("1", "2", "3", "4", "5", "6", "7") } - assertFails { POS("1", "2", "3", "4", "5", "6".repeat(POS.MAX_COMPONENT_LENGTH + 1)) } - } - @Test fun worksWithEnormousPos() { + val cm = ConnectionMatrix() + Res("test.matrix") { cm.readEntries(it) } + val posTable = POSTable() val e = "あ".repeat(127) repeat(1024) { val pos = POS(e, e, e, e, e, it.toString()) assertEquals(posTable.getId(pos), it.toShort()) } - val cm = ConnectionMatrix() - Res("test.matrix") { cm.readEntries(it) } - val outbuf = MemChannel() + val outbuf = MemChannel() val layout = BlockLayout(outbuf) layout.block(Block.POS_TABLE, posTable::compile) layout.block(Block.CONNECTION_MATRIX, cm::compile) val 
description = Description() description.setBlocks(layout.blocks()) - val grammar = GrammarImpl.load(outbuf.buffer(), description) - - // val out = ModelOutput(outbuf) - // posTable.writeTo(out) - // cm.writeTo(out) - // val gram = GrammarImpl(outbuf.buffer(), 0) + val grammar = GrammarImpl.load(outbuf.buffer(), description) assertEquals(grammar.partOfSpeechSize, 1024) repeat(1024) { val pos = POS(e, e, e, e, e, it.toString()) diff --git a/src/test/java/com/worksap/nlp/sudachi/dictionary/build/MemChannel.kt b/src/test/java/com/worksap/nlp/sudachi/dictionary/build/MemChannel.kt index e9195bb7..f609ab7e 100644 --- a/src/test/java/com/worksap/nlp/sudachi/dictionary/build/MemChannel.kt +++ b/src/test/java/com/worksap/nlp/sudachi/dictionary/build/MemChannel.kt @@ -22,10 +22,10 @@ import java.nio.channels.SeekableByteChannel import java.nio.file.Files import java.nio.file.Path import java.nio.file.StandardOpenOption +import kotlin.math.max class MemChannel(bufSize: Int = 1024 * 1024) : SeekableByteChannel { private var buffer: ByteBuffer = ByteBuffer.allocate(bufSize) - private var size = 0L init { buffer.order(ByteOrder.LITTLE_ENDIAN) @@ -51,18 +51,15 @@ class MemChannel(bufSize: Int = 1024 * 1024) : SeekableByteChannel { reserve(remaining) buffer.put(src) val pos = buffer.position().toLong() - if (pos > size) { - size = pos - } return remaining } private fun reserve(additional: Int) { - val remaining = buffer.remaining() + val remaining = buffer.capacity() - buffer.position() if (additional <= remaining) { return } - val newSize = buffer.capacity() * 2 + val newSize = max(buffer.capacity() * 2, additional + buffer.position()) val newBuf = ByteBuffer.allocate(newSize) newBuf.order(ByteOrder.LITTLE_ENDIAN) buffer.flip() @@ -80,11 +77,12 @@ class MemChannel(bufSize: Int = 1024 * 1024) : SeekableByteChannel { } override fun size(): Long { - return this.size + return this.buffer.limit().toLong() } override fun truncate(p0: Long): SeekableByteChannel { - throw 
UnsupportedOperationException() + this.buffer.limit(p0.toInt()) + return this } fun buffer(): ByteBuffer { diff --git a/src/test/java/com/worksap/nlp/sudachi/dictionary/build/POSTableTest.kt b/src/test/java/com/worksap/nlp/sudachi/dictionary/build/POSTableTest.kt new file mode 100644 index 00000000..7c974b62 --- /dev/null +++ b/src/test/java/com/worksap/nlp/sudachi/dictionary/build/POSTableTest.kt @@ -0,0 +1,64 @@ +/* + * Copyright (c) 2022 Works Applications Co., Ltd. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package com.worksap.nlp.sudachi.dictionary.build + +import com.worksap.nlp.sudachi.dictionary.POS +import com.worksap.nlp.sudachi.resStream +import kotlin.test.assertEquals +import kotlin.test.assertFails +import org.junit.Test + +class POSTableTest { + @Test + fun loadFromCSV() { + val posTable = POSTable() + val nRead = posTable.readEntries(resStream("/dict/pos.csv")) + + assertEquals(8, nRead) + assertEquals(nRead, posTable.ownedLength()) + + val pos = POS("名詞", "固有名詞", "地名", "一般", "*", "*") + assertEquals(5, posTable.getId(pos)) + } + + @Test + fun inhibitReadingDuplicatePos() { + val dupPoss = """名詞,普通名詞,一般,*,*,*\n名詞,普通名詞,一般,*,*,*""" + + val posTable = POSTable() + assertFails { posTable.readEntries(dupPoss.byteInputStream()) } + } + + @Test + fun inhibitNewPos() { + val posTable = POSTable() + posTable.allowNewPos = false + + val newPos = POS("a", "a", "a", "a", "a", "a") + assertFails { posTable.getId(newPos) } + } + + @Test + fun failTooManyPoss() { + val posTable = POSTable() + repeat(Short.MAX_VALUE.toInt()) { + val pos = POS("a", "b", "c", "d", "e", it.toString()) + assertEquals(posTable.getId(pos), it.toShort()) + } + assertFails { posTable.getId(POS("a", "a", "a", "a", "a", "a")) } + } +} diff --git a/src/test/java/com/worksap/nlp/sudachi/dictionary/build/RawLexiconReaderTest.kt b/src/test/java/com/worksap/nlp/sudachi/dictionary/build/RawLexiconReaderTest.kt index df4758a0..91122254 100644 --- a/src/test/java/com/worksap/nlp/sudachi/dictionary/build/RawLexiconReaderTest.kt +++ b/src/test/java/com/worksap/nlp/sudachi/dictionary/build/RawLexiconReaderTest.kt @@ -16,6 +16,7 @@ package com.worksap.nlp.sudachi.dictionary.build +import com.worksap.nlp.sudachi.dictionary.POS import com.worksap.nlp.sudachi.dictionary.StringPtr import com.worksap.nlp.sudachi.resStream import java.io.StringReader @@ -65,7 +66,19 @@ class RawLexiconReaderTest { assertNull(reader.nextEntry()) } - @Test fun headerCsvMinimumFields() {} + @Test + fun headerCsvMinimumFields() 
{ + val reader = RawLexiconReader(csvfile("headers-minimum.csv"), POSTable(), false) + assertNotNull(reader.nextEntry()).let { e -> + assertEquals("東京都", e.headword) + assertEquals("トウキョウト", e.reading) + assertEquals(listOf(WordRef.LineNo(5, false), WordRef.LineNo(9, false)), e.aUnitSplit) + assertEquals(listOf(WordRef.LineNo(5, false), WordRef.LineNo(10, false)), e.bUnitSplit) + assertEquals(listOf(WordRef.LineNo(5, false), WordRef.LineNo(11, false)), e.wordStructure) + } + assertNotNull(reader.nextEntry()) + assertNull(reader.nextEntry()) + } @Test fun headerCsvAllFields() { @@ -79,11 +92,13 @@ class RawLexiconReaderTest { assertEquals(listOf(WordRef.LineNo(6, false), WordRef.LineNo(7, false)), e.wordStructure) assertEquals("10", e.userData) } + assertNotNull(reader.nextEntry()) assertNull(reader.nextEntry()) } @Test fun failMissingRequiredEntry() { + // pos1-6 are not required (because of posId), but must be used as a set val columns = "Surface,LeftId,RightId,Cost,pos1,pos2,pos3,pos4,pos5,pos6,reading_form,normalized_form,DictionaryForm,splita,splitb,wordstructure".split( ",") @@ -100,6 +115,74 @@ class RawLexiconReaderTest { } } + @Test + fun posIdColumn() { + val text = + """Surface,LeftId,RightId,Cost,pos_id,reading_form,normalized_form,DictionaryForm,splita,splitb,wordstructure +東京都,6,8,5320,0,トウキョウト,,,,,""" + val posTable = POSTable() + posTable.getId(POS("a", "a", "a", "a", "a", "0")) + + val reader = RawLexiconReader(csvtext(text), posTable, false) + assertNotNull(reader.nextEntry()).let { e -> assertEquals(0, e.posId) } + assertNull(reader.nextEntry()) + } + + @Test + fun failNewPosId() { + val text = + """Surface,LeftId,RightId,Cost,pos_id,reading_form,normalized_form,DictionaryForm,splita,splitb,wordstructure +東京都,6,8,5320,1,トウキョウト,,,,,""" + val posTable = POSTable() + posTable.getId(POS("a", "a", "a", "a", "a", "0")) + + assertFails { + val reader = RawLexiconReader(csvtext(text), posTable, false) + reader.nextEntry() + } + } + + @Test + fun 
posIdAndParts() { + val text = + """Surface,LeftId,RightId,Cost,pos1,pos2,pos3,pos4,pos5,pos6,pos_id,reading_form,normalized_form,DictionaryForm,splita,splitb,wordstructure +東京都,6,8,5320,名詞,固有名詞,地名,一般,*,*,0,トウキョウト,,,,,""" + val posTable = POSTable() + posTable.getId(POS("名詞", "固有名詞", "地名", "一般", "*", "*")) + + val reader = RawLexiconReader(csvtext(text), posTable, false) + assertNotNull(reader.nextEntry()).let { e -> assertEquals(0, e.posId) } + assertNull(reader.nextEntry()) + } + + @Test + fun failPosIdAndPartsNotMatch() { + val text = + """Surface,LeftId,RightId,Cost,pos1,pos2,pos3,pos4,pos5,pos6,pos_id,reading_form,normalized_form,DictionaryForm,splita,splitb,wordstructure +東京都,6,8,5320,名詞,固有名詞,地名,一般,*,*,0,トウキョウト,,,,,""" + val posTable = POSTable() + posTable.getId(POS("a", "a", "a", "a", "a", "0")) + + assertFails { + val reader = RawLexiconReader(csvtext(text), posTable, false) + reader.nextEntry() + } + } + + @Test + fun failPosColumnMissing() { + val text = + """Surface,LeftId,RightId,Cost,reading_form,normalized_form,DictionaryForm,splita,splitb,wordstructure +東京都,6,8,5320,トウキョウト,,,,,""" + val posTable = POSTable() + posTable.getId(POS("a", "a", "a", "a", "a", "0")) + + assertFails { + val reader = RawLexiconReader(csvtext(text), posTable, false) + reader.nextEntry() + } + } + @Test fun failTooLongValue() { val oversizeWord = "a".repeat(StringPtr.MAX_LENGTH + 1) diff --git a/src/test/resources/com/worksap/nlp/sudachi/dictionary/build/headers-all.csv b/src/test/resources/com/worksap/nlp/sudachi/dictionary/build/headers-all.csv index 6e48f820..d8c80a71 100644 --- a/src/test/resources/com/worksap/nlp/sudachi/dictionary/build/headers-all.csv +++ b/src/test/resources/com/worksap/nlp/sudachi/dictionary/build/headers-all.csv @@ -1,2 +1,3 @@ Surface,LeftId,RightId,Cost,pos1,pos2,pos3,pos4,pos5,pos6,reading_form,normalized_form,DictionaryForm,mode,splita,splitb,splitc,wordstructure,synonymgroups,userdata 
-東京都,6,8,5320,名詞,固有名詞,地名,一般,*,*,トウキョウト,,,B,5/9,5/10,5/11,6/7,8/9,10 \ No newline at end of file +東京都,6,8,5320,名詞,固有名詞,地名,一般,*,*,トウキョウト,,,B,5/9,5/10,5/11,6/7,8/9,10 +行く,4,4,5105,動詞,非自立可能,*,*,五段-カ行,終止形-一般,イク,,,A,,,,,, diff --git a/src/test/resources/com/worksap/nlp/sudachi/dictionary/build/headers-minimum.csv b/src/test/resources/com/worksap/nlp/sudachi/dictionary/build/headers-minimum.csv new file mode 100644 index 00000000..f2fc1b17 --- /dev/null +++ b/src/test/resources/com/worksap/nlp/sudachi/dictionary/build/headers-minimum.csv @@ -0,0 +1,3 @@ +Surface,LeftId,RightId,Cost,pos1,pos2,pos3,pos4,pos5,pos6,reading_form,normalized_form,DictionaryForm,splita,splitb,wordStructure +東京都,6,8,5320,名詞,固有名詞,地名,一般,*,*,トウキョウト,,,5/9,5/10,5/11 +行く,4,4,5105,動詞,非自立可能,*,*,五段-カ行,終止形-一般,イク,,,,, diff --git a/src/test/resources/dict/pos.csv b/src/test/resources/dict/pos.csv new file mode 100644 index 00000000..79312c17 --- /dev/null +++ b/src/test/resources/dict/pos.csv @@ -0,0 +1,8 @@ +助動詞,*,*,*,助動詞-タ,終止形-一般 +助詞,接続助詞,*,*,*,* +助詞,格助詞,*,*,*,* +動詞,非自立可能,*,*,五段-カ行,終止形-一般 +動詞,非自立可能,*,*,五段-カ行,連用形-促音便 +名詞,固有名詞,地名,一般,*,* +名詞,数詞,*,*,*,* +名詞,普通名詞,一般,*,*,* From 3f99a44e691b55b0211be88129bcfe63a287a160 Mon Sep 17 00:00:00 2001 From: mh-northlander Date: Tue, 6 Aug 2024 18:28:03 +0900 Subject: [PATCH 69/94] fix for tests --- .../java/com/worksap/nlp/sudachi/WordId.java | 39 +++---------- .../sudachi/dictionary/BinaryDictionary.java | 1 - .../dictionary/DictionaryGrammarPrinter.java | 56 ++----------------- .../nlp/sudachi/dictionary/WordInfo.java | 4 +- .../sudachi/dictionary/build/BufWriter.java | 6 +- .../sudachi/dictionary/build/DicBuilder.java | 2 +- .../dictionary/build/InputFileException.java | 5 +- .../sudachi/dictionary/build/POSTable.java | 3 - .../nlp/sudachi/dictionary/build/WordRef.java | 9 +-- 9 files changed, 28 insertions(+), 97 deletions(-) diff --git a/src/main/java/com/worksap/nlp/sudachi/WordId.java b/src/main/java/com/worksap/nlp/sudachi/WordId.java index a59832b3..d54f9ea6 
100644 --- a/src/main/java/com/worksap/nlp/sudachi/WordId.java +++ b/src/main/java/com/worksap/nlp/sudachi/WordId.java @@ -120,45 +120,22 @@ public static int applyMask(int wordId, int dicIdMask) { return (wordId & MAX_WORD_ID) | dicIdMask; } - /** Override dictionary part of the word id with given dic id. */ - public static int overrideDic(int wordId, int dicId) { - return applyMask(wordId, dicIdMask(dicId)); - } - /** - * Resolve dic id to refer. + * Resolve dic id that the wordRef points to. * - * @param wordRef - * word ref taken from word entry. - * @param actualDicId - * dic id of the dict which the word entry comes from. - * @return dic id which the wordid referring to. - */ - public static int refDic(int wordRef, int actualDicId) { - // 1 if wordref refers to the entry inside same dict, 0 otherwise (i.e. refers - // to system dict entry) - boolean isReferringUser = dic(wordRef) == 1; - if (isReferringUser) { - return actualDicId; - } - return 0; // system dict id - } - - /** - * Fill flag part of word ref with actual dic id. + * Dict part of WordRef only contains a flag whether if it points to system or + * user dict. * * @param wordRef * word ref taken from word entry. * @param actualDicId * dic id of the dict which the word entry comes from. - * @return dic id which the wordid referring to. + * @return dic id that the wordref refers to. */ - public static int resolveRef(int wordRef, int actualDicId) { - boolean isReferringUser = dic(wordRef) == 1; - if (isReferringUser) { - return overrideDic(wordRef, actualDicId); - } - return wordRef; // dict part is 0 and thus no need to change. + public static int refDic(int wordRef, int actualDicId) { + // dic(wordRef) == 1 if wordref refers to the entry inside same dict, 0 + // otherwise (i.e. refers to system dict entry) + return dic(wordRef) * actualDicId; } /** @return if given word id represents OOV. 
*/ diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/BinaryDictionary.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/BinaryDictionary.java index b8500a17..690c7fe0 100644 --- a/src/main/java/com/worksap/nlp/sudachi/dictionary/BinaryDictionary.java +++ b/src/main/java/com/worksap/nlp/sudachi/dictionary/BinaryDictionary.java @@ -26,7 +26,6 @@ import com.worksap.nlp.sudachi.MMap; public class BinaryDictionary implements Closeable, DictionaryAccess { - private final ByteBuffer bytes; private final Description header; private final GrammarImpl grammar; diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/DictionaryGrammarPrinter.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/DictionaryGrammarPrinter.java index c875f997..5c17ef42 100644 --- a/src/main/java/com/worksap/nlp/sudachi/dictionary/DictionaryGrammarPrinter.java +++ b/src/main/java/com/worksap/nlp/sudachi/dictionary/DictionaryGrammarPrinter.java @@ -19,14 +19,6 @@ import java.io.Console; import java.io.IOException; import java.io.PrintStream; -import java.nio.file.Path; -import java.nio.file.Paths; - -import com.worksap.nlp.sudachi.PathAnchor; -import com.worksap.nlp.sudachi.Config; -import com.worksap.nlp.sudachi.DictionaryFactory; -import com.worksap.nlp.sudachi.Dictionary; -import com.worksap.nlp.sudachi.Settings; /** * A dictionary grammar printing tool. 
@@ -37,12 +29,7 @@ private DictionaryGrammarPrinter() { static void printUsage() { Console console = System.console(); - console.printf("usage: DictionaryGrammarPrinter [-r file] \n"); - console.printf("\t-r file\tread settings from file (overrides -s)\n"); - console.printf("\t-s string\tadditional settings (overrides -r)\n"); - console.printf("\t-p directory\troot directory of resources\n"); - console.printf("\t--systemDict file\tpath to a system dictionary (overrides everything)\n"); - console.printf("\t-u file\tpath to an additional user dictionary (appended to -s)\n"); + console.printf("usage: DictionaryGrammarPrinter files... \n"); } static void printPos(GrammarImpl grammar, PrintStream output) { @@ -56,7 +43,6 @@ static void printPos(GrammarImpl grammar, PrintStream output) { /** * Prints the contents of dictionary grammar. * - * Specify the target dictionary in the same way to SudachiCommandline. * Currently it can only print POS table. * * @param args @@ -65,48 +51,16 @@ static void printPos(GrammarImpl grammar, PrintStream output) { * if IO fails */ public static void main(String[] args) throws IOException { - PathAnchor anchor = PathAnchor.classpath().andThen(PathAnchor.none()); - Settings current = Settings.resolvedBy(anchor) - .read(DictionaryGrammarPrinter.class.getClassLoader().getResource("sudachi.json")); - Config additional = Config.empty(); - - int i; - for (i = 0; i < args.length; i++) { + for (int i = 0; i < args.length; i++) { if (args[i].equals("-h")) { printUsage(); return; - } else if (args[i].equals("-r") && i + 1 < args.length) { - Path configPath = Paths.get(args[++i]); - Path parent = configPath.getParent(); - if (parent == null) { // parent directory of file.txt unfortunately is null :( - parent = Paths.get(""); - } - PathAnchor curAnchor = PathAnchor.filesystem(parent).andThen(PathAnchor.classpath()); - additional = Config.fromFile(configPath, curAnchor).withFallback(additional); - } else if (args[i].equals("-p") && i + 1 < 
args.length) { - String resourcesDirectory = args[++i]; - anchor = PathAnchor.filesystem(Paths.get(resourcesDirectory)).andThen(PathAnchor.classpath()); - // first resolve wrt new directory - current = Settings.resolvedBy(anchor).withFallback(current); - } else if (args[i].equals("-s") && i + 1 < args.length) { - Config other = Config.fromJsonString(args[++i], anchor); - additional = other.withFallback(additional); - } else if (args[i].equals("-u")) { - Path resolved = anchor.resolve(args[++i]); - additional = additional.addUserDictionary(resolved); - } else if (args[i].equals("--systemDict")) { - Path resolved = anchor.resolve(args[++i]); - additional = additional.systemDictionary(resolved); - } else { - break; } - } - - Config config = additional.withFallback(Config.fromSettings(current)); - try (Dictionary dict = new DictionaryFactory().create(config)) { - GrammarImpl grammar = ((DictionaryAccess) dict).getGrammar(); + BinaryDictionary dict = new BinaryDictionary(args[i]); + GrammarImpl grammar = dict.getGrammar(); printPos(grammar, System.out); + dict.close(); } } } diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/WordInfo.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/WordInfo.java index 8f6e94f4..8f5e5930 100644 --- a/src/main/java/com/worksap/nlp/sudachi/dictionary/WordInfo.java +++ b/src/main/java/com/worksap/nlp/sudachi/dictionary/WordInfo.java @@ -128,7 +128,7 @@ public void setPOSId(short posId) { /** * Returns the entry ref of the normalized form of the morpheme. The information * of the dictionary form can be gotten with - * {@link com.worksap.nlp.sudachi.WordId#resolveRef} and + * {@link com.worksap.nlp.sudachi.WordId#refDic} and * {@link Lexicon#getWordInfo}. * * @return the word ref of the normalized form of the morpheme @@ -140,7 +140,7 @@ public int getNormalizedForm() { /** * Returns the entry ref of the dictionary form of the morpheme. 
The information * of the dictionary form can be gotten with - * {@link com.worksap.nlp.sudachi.WordId#resolveRef} and + * {@link com.worksap.nlp.sudachi.WordId#refDic} and * {@link Lexicon#getWordInfo}. * * @return the word ref of the dictionary form of the morpheme diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/BufWriter.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/BufWriter.java index 644a61e4..6443b2fc 100644 --- a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/BufWriter.java +++ b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/BufWriter.java @@ -44,8 +44,8 @@ public BufWriter putInt(int val) { return this; } - public BufWriter putLong(long x) { - buffer.putLong(x); + public BufWriter putLong(long val) { + buffer.putLong(val); return this; } @@ -105,7 +105,7 @@ public BufWriter putInts(Ints value, int length) { } /** - * Encode string which has length is shorter than Short.MAX_VALUE + * Encode string which has length shorter than Short.MAX_VALUE * * @param s * string to put in the buffer. Must be shorter than Short.MAX_VALUE. 
diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/DicBuilder.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/DicBuilder.java index de0b6de4..235a4503 100644 --- a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/DicBuilder.java +++ b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/DicBuilder.java @@ -229,13 +229,13 @@ public System posTable(String name, IOSupplier input, long size) th if (!pos.allowNewPos) { throw new IllegalArgumentException("POS list already loaded (only single POS file is allowed)."); } - pos.allowNewPos = false; progress.startBlock(name, nanoTime(), Progress.Kind.ENTRY); int nRead; try (InputStream is = input.get()) { InputStream stream = new ProgressInputStream(is, size, progress); nRead = pos.readEntries(stream); + pos.allowNewPos = false; } progress.endBlock(nRead, nanoTime()); return this; diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/InputFileException.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/InputFileException.java index cb15432b..a4e6be10 100644 --- a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/InputFileException.java +++ b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/InputFileException.java @@ -17,15 +17,18 @@ package com.worksap.nlp.sudachi.dictionary.build; public class InputFileException extends IllegalArgumentException { + /** Exception with line number */ public InputFileException(int line, Exception cause) { super(String.format("[line:%d]", line), cause); } + /** Exception with file name and line number */ public InputFileException(String file, int line, Exception cause) { super(String.format("[%s line:%d]", file, line), cause); } + /** Exception with file name, line number and csv column name */ public InputFileException(String file, int line, String column, Exception cause) { - super(String.format("[%s line:%d, %s]", file, line, column), cause); + super(String.format("[%s line:%d, column: %s]", file, line, column), 
cause); } } diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/POSTable.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/POSTable.java index 153d6632..1a6e3312 100644 --- a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/POSTable.java +++ b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/POSTable.java @@ -97,8 +97,6 @@ public int preloadFrom(Grammar grammar) { /** * Load pos table from the text. Assume 6-column csv without header. * - * After load, set allowNewPos false and inhibit adding new POS. - * * @param data * @return number read. */ @@ -126,7 +124,6 @@ public int readEntries(InputStream data) throws IOException { } numLines += 1; } - allowNewPos = false; return numLines; } diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/WordRef.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/WordRef.java index 0fa9583d..9eff2797 100644 --- a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/WordRef.java +++ b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/WordRef.java @@ -192,8 +192,6 @@ public int hashCode() { } } - private static final Pattern NUMERIC_RE = Pattern.compile("^U?\\d+$"); - /** Alias of WordRef.Parser constructor. */ public static Parser parser(POSTable posTable, boolean allowLineNo, boolean allowHeadword, boolean allowNullAsterisk) { @@ -202,6 +200,9 @@ public static Parser parser(POSTable posTable, boolean allowLineNo, boolean allo /** Parser to parse wordref from a string in the lexicon field. 
*/ public static class Parser { + public static final char WORDREF_DELIMITER = ','; + private static final Pattern NUMERIC_RE = Pattern.compile("^U?\\d+$"); + private final POSTable posTable; private final boolean allowLineNo; private final boolean allowHeadword; @@ -227,8 +228,8 @@ public WordRef parse(String text) { return new LineNo(lineNum, isUser); } - if (StringUtil.count(text, ',') == 7) { - String[] cols = text.split(",", 8); + if (StringUtil.count(text, WORDREF_DELIMITER) == 7) { + String[] cols = text.split(String.valueOf(WORDREF_DELIMITER), 8); String headword = Unescape.unescape(cols[0]); String[] posElems = Arrays.copyOfRange(cols, 1, 7); for (int i = 0; i < POS.DEPTH; ++i) { From 19d0178744109dcb9792ace4cf4274fb114b862d Mon Sep 17 00:00:00 2001 From: mh-northlander Date: Wed, 7 Aug 2024 14:22:47 +0900 Subject: [PATCH 70/94] rewrite double array lexicon test --- .../dictionary/DictionaryBuilderTest.kt | 3 - .../dictionary/DoubleArrayLexiconTest.java | 103 ------------- .../dictionary/DoubleArrayLexiconTest.kt | 143 ++++++++++++++++++ 3 files changed, 143 insertions(+), 106 deletions(-) delete mode 100644 src/test/java/com/worksap/nlp/sudachi/dictionary/DoubleArrayLexiconTest.java create mode 100644 src/test/java/com/worksap/nlp/sudachi/dictionary/DoubleArrayLexiconTest.kt diff --git a/src/test/java/com/worksap/nlp/sudachi/dictionary/DictionaryBuilderTest.kt b/src/test/java/com/worksap/nlp/sudachi/dictionary/DictionaryBuilderTest.kt index 35e81eee..27621281 100644 --- a/src/test/java/com/worksap/nlp/sudachi/dictionary/DictionaryBuilderTest.kt +++ b/src/test/java/com/worksap/nlp/sudachi/dictionary/DictionaryBuilderTest.kt @@ -31,9 +31,6 @@ class DictionaryBuilderTest { @BeforeTest fun setup() { tempDir = createTempDirectory() - // TestDictionary.systemDictData.writeData(tempDir.resolve("system.dic")) - // TestDictionary.userDict1Data.writeData(tempDir.resolve("user.dic")) - // Utils.copyResource(tempDir, "/unk.def") } @Test diff --git 
a/src/test/java/com/worksap/nlp/sudachi/dictionary/DoubleArrayLexiconTest.java b/src/test/java/com/worksap/nlp/sudachi/dictionary/DoubleArrayLexiconTest.java deleted file mode 100644 index 1f4452ba..00000000 --- a/src/test/java/com/worksap/nlp/sudachi/dictionary/DoubleArrayLexiconTest.java +++ /dev/null @@ -1,103 +0,0 @@ -/* - * Copyright (c) 2017-2022 Works Applications Co., Ltd. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package com.worksap.nlp.sudachi.dictionary; - -import static org.junit.Assert.assertArrayEquals; -import static org.junit.Assert.assertEquals; - -import java.io.IOException; -import java.nio.ByteBuffer; -import java.nio.charset.StandardCharsets; -import java.util.ArrayList; -import java.util.Iterator; -import java.util.List; - -import com.worksap.nlp.sudachi.TestDictionary; -import org.junit.Before; -import org.junit.Test; - -public class DoubleArrayLexiconTest { - /* - * static final int GRAMMAR_SIZE = 470; - * - * DoubleArrayLexicon lexicon; - * - * @Before public void setUp() throws IOException { ByteBuffer bytes = - * TestDictionary.INSTANCE.getSystemDictData().buffer(); DictionaryHeader header - * = new DictionaryHeader(bytes, 0); lexicon = new DoubleArrayLexicon(bytes, - * header.storageSize() + GRAMMAR_SIZE, true); } - * - * @Test public void lookup() { List results = - * iteratorToList(lexicon.lookup("東京都".getBytes(StandardCharsets.UTF_8), 0)); - * - * assertEquals(3, results.size()); assertArrayEquals(new int[] { 4, 
3 }, - * results.get(0)); // 東 assertArrayEquals(new int[] { 5, 6 }, results.get(1)); - * // 東京 assertArrayEquals(new int[] { 6, 9 }, results.get(2)); // 東京都 - * - * results = - * iteratorToList(lexicon.lookup("東京都に".getBytes(StandardCharsets.UTF_8), 9)); - * assertEquals(2, results.size()); assertArrayEquals(new int[] { 1, 12 }, - * results.get(0)); // に(接続助詞) assertArrayEquals(new int[] { 2, 12 }, - * results.get(1)); // に(格助詞) - * - * results = - * iteratorToList(lexicon.lookup("あれ".getBytes(StandardCharsets.UTF_8), 0)); - * assertEquals(0, results.size()); } - * - * @Test public void parameters() { // た assertEquals(1, lexicon.getLeftId(0)); - * assertEquals(1, lexicon.getRightId(0)); assertEquals(8729, - * lexicon.getCost(0)); - * - * // 東京都 assertEquals(6, lexicon.getLeftId(6)); assertEquals(8, - * lexicon.getRightId(6)); assertEquals(5320, lexicon.getCost(6)); - * - * // 都 assertEquals(8, lexicon.getLeftId(9)); assertEquals(8, - * lexicon.getRightId(9)); assertEquals(2914, lexicon.getCost(9)); } - * - * @Test public void wordInfo() { // た WordInfo wi = lexicon.getWordInfo(0); - * assertEquals("た", wi.getSurface()); assertEquals(3, wi.getLength()); - * assertEquals(0, wi.getPOSId()); assertEquals("た", wi.getNormalizedForm()); - * assertEquals(-1, wi.getDictionaryForm()); assertEquals("た", - * wi.getDictionaryForm()); assertEquals("タ", wi.getReadingForm()); - * assertArrayEquals(new int[0], wi.getAunitSplit()); assertArrayEquals(new - * int[0], wi.getBunitSplit()); assertArrayEquals(new int[0], - * wi.getWordStructure()); - * - * // 行っ wi = lexicon.getWordInfo(8); assertEquals("行っ", wi.getSurface()); - * assertEquals("行く", wi.getNormalizedForm()); assertEquals(7, - * wi.getDictionaryForm()); assertEquals("行く", wi.getDictionaryForm()); - * - * // 東京都 wi = lexicon.getWordInfo(6); assertEquals("東京都", wi.getSurface()); - * assertArrayEquals(new int[] { 5, 9 }, wi.getAunitSplit()); - * assertArrayEquals(new int[0], wi.getBunitSplit()); assertArrayEquals(new - * 
int[] { 5, 9 }, wi.getWordStructure()); assertArrayEquals(new int[0], - * wi.getSynonymGroupIds()); } - * - * @Test public void wordInfoWithLongWord() { // 0123456789 * 30 WordInfo wi = - * lexicon.getWordInfo(36); assertEquals(300, wi.getSurface().length()); - * assertEquals(300, wi.getLength()); assertEquals(300, - * wi.getNormalizedForm().length()); assertEquals(-1, wi.getDictionaryForm()); - * assertEquals(300, wi.getDictionaryForm().length()); assertEquals(570, - * wi.getReadingForm().length()); } - * - * @Test public void size() { assertEquals(39, lexicon.size()); } - * - * static List iteratorToList(Iterator iterator) { List result = - * new ArrayList<>(); while (iterator.hasNext()) { result.add(iterator.next()); - * } return result; } - */ -} diff --git a/src/test/java/com/worksap/nlp/sudachi/dictionary/DoubleArrayLexiconTest.kt b/src/test/java/com/worksap/nlp/sudachi/dictionary/DoubleArrayLexiconTest.kt new file mode 100644 index 00000000..aa1ee5cb --- /dev/null +++ b/src/test/java/com/worksap/nlp/sudachi/dictionary/DoubleArrayLexiconTest.kt @@ -0,0 +1,143 @@ +/* + * Copyright (c) 2022 Works Applications Co., Ltd. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package com.worksap.nlp.sudachi.dictionary + +import com.worksap.nlp.sudachi.TestDictionary +import kotlin.test.BeforeTest +import kotlin.test.Test +import kotlin.test.assertEquals +import kotlin.test.assertFalse + +class DoubleArrayLexiconTest { + lateinit var lexicon: DoubleArrayLexicon + lateinit var systemWordIds: Ints + + @BeforeTest + fun setup() { + val bytes = TestDictionary.systemDictData.buffer() + val desc = Description.load(bytes) + lexicon = DoubleArrayLexicon.load(bytes, desc) + + val wids = Ints(lexicon.size()) + for (ints: Ints in lexicon.wordIds(0)) { + wids.appendAll(ints) + } + wids.sort() + systemWordIds = wids + } + + fun getWordId(idx: Int): Int { + return systemWordIds.get(idx) + } + + @Test + fun iterWordIds() { + assertEquals(39, systemWordIds.length()) + for (i in 0..(systemWordIds.length() - 1)) { + lexicon.getWordInfo(systemWordIds.get(i)) + } + } + + @Test + fun lookup() { + var iter = lexicon.lookup("東京都".toByteArray(), 0) + assertEquals(listOf(getWordId(4), 3), iter.next().toList()) + assertEquals(listOf(getWordId(5), 6), iter.next().toList()) + assertEquals(listOf(getWordId(6), 9), iter.next().toList()) + assertFalse(iter.hasNext()) + + iter = lexicon.lookup("東京都に".toByteArray(), 9) + assertEquals(listOf(getWordId(1), 12), iter.next().toList()) // に(接続助詞) + assertEquals(listOf(getWordId(2), 12), iter.next().toList()) // に(格助詞) + assertFalse(iter.hasNext()) + + iter = lexicon.lookup("あれ".toByteArray(), 0) + assertFalse(iter.hasNext()) + } + + @Test + fun parameters() { + // た + var param = lexicon.parameters(getWordId(0)) + assertEquals(1, WordParameters.leftId(param)) + assertEquals(1, WordParameters.rightId(param)) + assertEquals(8729, WordParameters.cost(param)) + + // 東京都 + param = lexicon.parameters(getWordId(6)) + assertEquals(6, WordParameters.leftId(param)) + assertEquals(8, WordParameters.rightId(param)) + assertEquals(5320, WordParameters.cost(param)) + + // 都 + param = lexicon.parameters(getWordId(9)) + 
assertEquals(8, WordParameters.leftId(param)) + assertEquals(8, WordParameters.rightId(param)) + assertEquals(2914, WordParameters.cost(param)) + } + + @Test + fun wordInfo() { + // た + var wi = lexicon.getWordInfo(getWordId(0)) + assertEquals("た", lexicon.string(0, wi.getSurface())) + assertEquals(3, wi.getLength()) + assertEquals(0, wi.getPOSId()) + assertEquals("た", lexicon.string(0, lexicon.getWordInfo(wi.getNormalizedForm()).getSurface())) + assertEquals("た", lexicon.string(0, lexicon.getWordInfo(wi.getDictionaryForm()).getSurface())) + assertEquals("タ", lexicon.string(0, wi.getReadingForm())) + assertEquals(listOf(), wi.getAunitSplit().toList()) + assertEquals(listOf(), wi.getBunitSplit().toList()) + assertEquals(listOf(), wi.getWordStructure().toList()) + + // 行っ + wi = lexicon.getWordInfo(getWordId(8)) + assertEquals("行っ", lexicon.string(0, wi.getSurface())) + assertEquals("行く", lexicon.string(0, lexicon.getWordInfo(wi.getNormalizedForm()).getSurface())) + assertEquals("行く", lexicon.string(0, lexicon.getWordInfo(wi.getDictionaryForm()).getSurface())) + + // 東京都 + wi = lexicon.getWordInfo(getWordId(6)) + assertEquals("東京都", lexicon.string(0, wi.getSurface())) + assertEquals(listOf(getWordId(5), getWordId(9)), wi.getAunitSplit().toList()) + assertEquals(listOf(), wi.getBunitSplit().toList()) + assertEquals(listOf(getWordId(5), getWordId(9)), wi.getWordStructure().toList()) + assertEquals(listOf(), wi.getSynonymGroupIds().toList()) + } + + @Test + fun wordInfoLong() { + // 0123456789 * 30 + val wi = lexicon.getWordInfo(getWordId(36)) + val surface = lexicon.string(0, wi.getSurface()) + assertEquals(300, surface.length) + assertEquals(300, wi.getLength()) + val normalizedform = lexicon.string(0, lexicon.getWordInfo(wi.getNormalizedForm()).getSurface()) + assertEquals(300, normalizedform.length) + val dictionaryform = lexicon.string(0, lexicon.getWordInfo(wi.getDictionaryForm()).getSurface()) + assertEquals(300, dictionaryform.length) + val readingform = 
lexicon.string(0, wi.getReadingForm()) + assertEquals(570, readingform.length) + } + + @Test + fun size() { + assertEquals(39, lexicon.size()) + } + + @Test fun string() {} +} From 5602a25bda0e0b73cf178fcfa0fe7a30775a18ea Mon Sep 17 00:00:00 2001 From: mh-northlander Date: Mon, 19 Aug 2024 10:47:27 +0900 Subject: [PATCH 71/94] add constant for wordref split char --- .../sudachi/dictionary/DictionaryPrinter.java | 18 ++++++++---------- .../dictionary/build/RawLexiconReader.java | 9 +++++---- 2 files changed, 13 insertions(+), 14 deletions(-) diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/DictionaryPrinter.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/DictionaryPrinter.java index a43a6f43..a84437c6 100644 --- a/src/main/java/com/worksap/nlp/sudachi/dictionary/DictionaryPrinter.java +++ b/src/main/java/com/worksap/nlp/sudachi/dictionary/DictionaryPrinter.java @@ -18,6 +18,8 @@ import com.worksap.nlp.sudachi.WordId; import com.worksap.nlp.sudachi.dictionary.build.Progress; +import com.worksap.nlp.sudachi.dictionary.build.RawLexiconReader; +import com.worksap.nlp.sudachi.dictionary.build.WordRef; import com.worksap.nlp.sudachi.dictionary.build.RawLexiconReader.Column; import java.io.Console; @@ -30,11 +32,6 @@ import java.util.stream.Collectors; public class DictionaryPrinter { - public static final char wordRefDelimiter = '/'; - public static final String wordRefDelimiterStr = String.valueOf(wordRefDelimiter); - public static final char wordRefJoiner = ','; - public static final String wordRefJoinerStr = String.valueOf(wordRefJoiner); - private final PrintStream output; private final Progress progress = Progress.syserr(20); @@ -176,7 +173,8 @@ String wordRef(int wordId) { parts.addAll(pos); parts.add(reading); - return String.join(wordRefJoinerStr, parts.stream().map(this::maybeEscapeRefPart).collect(Collectors.toList())); + return String.join(String.valueOf(WordRef.Parser.WORDREF_DELIMITER), + 
parts.stream().map(this::maybeEscapeRefPart).collect(Collectors.toList())); } /** encode word entry pointed by the wordId as WordRef.Headword. */ @@ -190,7 +188,7 @@ String wordRefHeadword(int wordId, int reference) { } String wordRefList(int[] wordIds) { - return String.join(wordRefDelimiterStr, + return String.join(String.valueOf(RawLexiconReader.LIST_DELIMITER), Arrays.stream(wordIds).boxed().map(this::wordRef).collect(Collectors.toList())); } @@ -217,12 +215,12 @@ private String maybeEscapeString(String value) { /** escape WordRef.Triple part. */ private String maybeEscapeRefPart(String value) { - boolean hasDelimiter = hasCh(value, wordRefDelimiter); - boolean hasJoiner = hasCh(value, wordRefJoiner); + boolean hasDelimiter = hasCh(value, RawLexiconReader.LIST_DELIMITER); + boolean hasJoiner = hasCh(value, WordRef.Parser.WORDREF_DELIMITER); if (!hasDelimiter && !hasJoiner) { return value; } - return unicodeEscape(value, Arrays.asList(wordRefDelimiter, wordRefJoiner)); + return unicodeEscape(value, Arrays.asList(RawLexiconReader.LIST_DELIMITER, WordRef.Parser.WORDREF_DELIMITER)); } /** escape specified chars as unicode codepoint */ diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/RawLexiconReader.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/RawLexiconReader.java index 5ed4c016..6ab959e4 100644 --- a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/RawLexiconReader.java +++ b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/RawLexiconReader.java @@ -47,6 +47,9 @@ public enum Column { } } + private static final Pattern INTEGER_REGEX = Pattern.compile("^-?\\d+$"); + public static final char LIST_DELIMITER = '/'; + private List cachedRow; private int[] mapping; private final CSVParser parser; @@ -70,8 +73,6 @@ public RawLexiconReader(CSVParser parser, POSTable pos, boolean user) throws IOE } } - private static final Pattern INTEGER_REGEX = Pattern.compile("^-?\\d+$"); - /** assume legacy column layout if header line is 
not present */ private boolean isLegacyColumnLayout() { return mapping == null; @@ -181,7 +182,7 @@ private Ints getInts(List data, Column column) { if (value == null || value.isEmpty() || "*".equals(value)) { return Ints.wrap(Ints.EMPTY_ARRAY); } - String[] parts = value.split("/"); + String[] parts = value.split(String.valueOf(LIST_DELIMITER)); if (parts.length > Byte.MAX_VALUE) { throw new InputFileException(parser.getName(), parser.getRowCount(), column.name(), new IllegalArgumentException("int list contained more than 127 entries: " + value)); @@ -199,7 +200,7 @@ private List getWordRefs(List data, Column column, WordRef.Pars if (value == null || value.isEmpty() || "*".equals(value)) { return new ArrayList<>(); } - String[] parts = value.split("/"); + String[] parts = value.split(String.valueOf(LIST_DELIMITER)); if (parts.length > Byte.MAX_VALUE) { throw new InputFileException(parser.getName(), parser.getRowCount(), column.name(), new IllegalArgumentException("reference list contained more than 127 entries: " + value)); From 1411fdaecd8995a11d76266924a9888cb08d4523 Mon Sep 17 00:00:00 2001 From: mh-northlander Date: Wed, 28 Aug 2024 11:43:50 +0900 Subject: [PATCH 72/94] add header to test dict lexicons, add read lexicon test --- .../worksap/nlp/sudachi/dictionary/Ints.java | 18 +++++ .../dictionary/build/RawLexiconReaderTest.kt | 4 + .../sudachi/dictionary/build/headers-all.csv | 6 +- .../sudachi/dictionary/build/legacy-full.csv | 2 +- src/test/resources/dict/lex.csv | 77 ++++++++++--------- src/test/resources/dict/user.csv | 9 ++- src/test/resources/dict/user2.csv | 5 +- 7 files changed, 73 insertions(+), 48 deletions(-) diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/Ints.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/Ints.java index 5b362d62..a1c3f1fd 100644 --- a/src/main/java/com/worksap/nlp/sudachi/dictionary/Ints.java +++ b/src/main/java/com/worksap/nlp/sudachi/dictionary/Ints.java @@ -120,6 +120,24 @@ public String toString() { 
return joiner.toString(); } + @Override + public boolean equals(Object other) { + if (this == other) + return true; + if (other == null || getClass() != other.getClass()) + return false; + Ints o = (Ints) other; + if (this.length != o.length) { + return false; + } + for (int i = 0; i < this.length; i++) { + if (this.get(i) != o.get(i)) { + return false; + } + } + return true; + } + /** * Make sure the internal buffer has enough capacity for the specified size. * This also increases length and they should be filled using {@code set} or diff --git a/src/test/java/com/worksap/nlp/sudachi/dictionary/build/RawLexiconReaderTest.kt b/src/test/java/com/worksap/nlp/sudachi/dictionary/build/RawLexiconReaderTest.kt index 91122254..3b0c091e 100644 --- a/src/test/java/com/worksap/nlp/sudachi/dictionary/build/RawLexiconReaderTest.kt +++ b/src/test/java/com/worksap/nlp/sudachi/dictionary/build/RawLexiconReaderTest.kt @@ -16,6 +16,7 @@ package com.worksap.nlp.sudachi.dictionary.build +import com.worksap.nlp.sudachi.dictionary.Ints import com.worksap.nlp.sudachi.dictionary.POS import com.worksap.nlp.sudachi.dictionary.StringPtr import com.worksap.nlp.sudachi.resStream @@ -47,6 +48,7 @@ class RawLexiconReaderTest { assertEquals("東京都", e.headword) assertEquals("トウキョウト", e.reading) assertEquals(listOf(WordRef.LineNo(5, false), WordRef.LineNo(9, false)), e.wordStructure) + assertEquals(0, e.synonymGroups.length()) assertTrue(e.cUnitSplit.isEmpty()) assertEquals("", e.userData) } @@ -60,6 +62,7 @@ class RawLexiconReaderTest { assertEquals("東京都", e.headword) assertEquals("トウキョウト", e.reading) assertEquals(listOf(WordRef.LineNo(5, false), WordRef.LineNo(9, false)), e.wordStructure) + assertEquals(Ints.wrap(intArrayOf(6, 7)), e.synonymGroups) assertEquals(listOf(WordRef.LineNo(8, false), WordRef.LineNo(9, false)), e.cUnitSplit) assertEquals("10", e.userData) } @@ -90,6 +93,7 @@ class RawLexiconReaderTest { assertEquals(listOf(WordRef.LineNo(5, false), WordRef.LineNo(10, false)), 
e.bUnitSplit) assertEquals(listOf(WordRef.LineNo(5, false), WordRef.LineNo(11, false)), e.cUnitSplit) assertEquals(listOf(WordRef.LineNo(6, false), WordRef.LineNo(7, false)), e.wordStructure) + assertEquals(Ints.wrap(intArrayOf(8, 9)), e.synonymGroups) assertEquals("10", e.userData) } assertNotNull(reader.nextEntry()) diff --git a/src/test/resources/com/worksap/nlp/sudachi/dictionary/build/headers-all.csv b/src/test/resources/com/worksap/nlp/sudachi/dictionary/build/headers-all.csv index d8c80a71..9f827c21 100644 --- a/src/test/resources/com/worksap/nlp/sudachi/dictionary/build/headers-all.csv +++ b/src/test/resources/com/worksap/nlp/sudachi/dictionary/build/headers-all.csv @@ -1,3 +1,3 @@ -Surface,LeftId,RightId,Cost,pos1,pos2,pos3,pos4,pos5,pos6,reading_form,normalized_form,DictionaryForm,mode,splita,splitb,splitc,wordstructure,synonymgroups,userdata -東京都,6,8,5320,名詞,固有名詞,地名,一般,*,*,トウキョウト,,,B,5/9,5/10,5/11,6/7,8/9,10 -行く,4,4,5105,動詞,非自立可能,*,*,五段-カ行,終止形-一般,イク,,,A,,,,,, +Surface,LeftId,RightId,Cost,pos1,pos2,pos3,pos4,pos5,pos6,reading_form,normalized_form,DictionaryForm,mode,splita,splitb,splitc,wordstructure,synonymgroups,userdata,pos_id +東京都,6,8,5320,名詞,固有名詞,地名,一般,*,*,トウキョウト,,,B,5/9,5/10,5/11,6/7,8/9,10, +行く,4,4,5105,動詞,非自立可能,*,*,五段-カ行,終止形-一般,イク,,,A,,,,,,, diff --git a/src/test/resources/com/worksap/nlp/sudachi/dictionary/build/legacy-full.csv b/src/test/resources/com/worksap/nlp/sudachi/dictionary/build/legacy-full.csv index 98cc00c5..caed502c 100644 --- a/src/test/resources/com/worksap/nlp/sudachi/dictionary/build/legacy-full.csv +++ b/src/test/resources/com/worksap/nlp/sudachi/dictionary/build/legacy-full.csv @@ -1 +1 @@ -東京都,6,8,5320,東京都,名詞,固有名詞,地名,一般,*,*,トウキョウト,東京都,*,B,5/9,*,5/9,6/7,8/9,10 \ No newline at end of file +東京都,6,8,5320,東京都,名詞,固有名詞,地名,一般,*,*,トウキョウト,東京都,*,B,5/9,*,5/9,6/7,8/9,10, \ No newline at end of file diff --git a/src/test/resources/dict/lex.csv b/src/test/resources/dict/lex.csv index 31d43125..6fd1bbae 100644 --- 
a/src/test/resources/dict/lex.csv +++ b/src/test/resources/dict/lex.csv @@ -1,39 +1,40 @@ -た,1,1,8729,た,助動詞,*,*,*,助動詞-タ,終止形-一般,タ,た,*,A,*,*,*,* -に,2,2,11406,に,助詞,接続助詞,*,*,*,*,ニ,に,*,A,*,*,*,* -に,3,3,4481,に,助詞,格助詞,*,*,*,*,ニ,に,*,A,*,*,*,* -京都,6,6,5293,京都,名詞,固有名詞,地名,一般,*,*,キョウト,京都,*,A,*,*,*,1/5 -東,7,7,4675,東,名詞,普通名詞,一般,*,*,*,ヒガシ,東,*,A,*,*,*,* -東京,6,6,2816,東京,名詞,固有名詞,地名,一般,*,*,トウキョウ,東京,*,A,*,*,*,* -東京都,6,8,5320,東京都,名詞,固有名詞,地名,一般,*,*,トウキョウト,東京都,*,B,5/9,*,5/9,* -行く,4,4,5105,行く,動詞,非自立可能,*,*,五段-カ行,終止形-一般,イク,行く,*,A,*,*,*,* -行っ,5,5,5122,行っ,動詞,非自立可能,*,*,五段-カ行,連用形-促音便,イッ,行く,7,A,*,*,*,* -都,8,8,2914,都,名詞,普通名詞,一般,*,*,*,ト,都,*,A,*,*,*,* -アイ,7,7,4675,アイ,名詞,普通名詞,一般,*,*,*,アイ,アイ,*,A,*,*,*,* -アイウ,7,7,4675,アイウ,名詞,普通名詞,一般,*,*,*,アイウ,アイウ,*,A,*,*,*,* -アイアイウ,6,6,32766,アイウ,名詞,固有名詞,地名,一般,*,*,アイアイウ,アイアイウ,*,A,*,*,*,* -0,9,9,2478,0,名詞,数詞,*,*,*,*,ゼロ,0,*,A,*,*,*,* -1,9,9,2478,1,名詞,数詞,*,*,*,*,イチ,1,*,A,*,*,*,* -2,9,9,2478,2,名詞,数詞,*,*,*,*,ニ,2,*,A,*,*,*,* -3,9,9,2478,3,名詞,数詞,*,*,*,*,サン,3,*,A,*,*,*,* -4,9,9,2478,4,名詞,数詞,*,*,*,*,ヨン,4,*,A,*,*,*,* -5,9,9,2478,5,名詞,数詞,*,*,*,*,ゴ,5,*,A,*,*,*,* -6,9,9,2478,6,名詞,数詞,*,*,*,*,ロク,6,*,A,*,*,*,* -7,9,9,2478,7,名詞,数詞,*,*,*,*,ナナ,7,*,A,*,*,*,* -8,9,9,2478,8,名詞,数詞,*,*,*,*,ハチ,8,*,A,*,*,*,* -9,9,9,2478,9,名詞,数詞,*,*,*,*,キュウ,9,*,A,*,*,*,* -〇,9,9,2478,〇,名詞,数詞,*,*,*,*,ゼロ,〇,*,A,*,*,*,* -一,9,9,2478,一,名詞,数詞,*,*,*,*,イチ,一,*,A,*,*,*,* -二,9,9,2478,二,名詞,数詞,*,*,*,*,ニ,二,*,A,*,*,*,* -三,9,9,2478,三,名詞,数詞,*,*,*,*,サン,三,*,A,*,*,*,* -四,9,9,2478,四,名詞,数詞,*,*,*,*,ヨン,四,*,A,*,*,*,* -五,9,9,2478,五,名詞,数詞,*,*,*,*,ゴ,五,*,A,*,*,*,* -六,9,9,2478,六,名詞,数詞,*,*,*,*,ロク,六,*,A,*,*,*,* -七,9,9,2478,七,名詞,数詞,*,*,*,*,ナナ,七,*,A,*,*,*,* -八,9,9,2478,八,名詞,数詞,*,*,*,*,ハチ,八,*,A,*,*,*,* -九,9,9,2478,九,名詞,数詞,*,*,*,*,キュウ,九,*,A,*,*,*,* -六三四,6,6,0,六三四,名詞,固有名詞,地名,一般,*,*,ムサシ,六三四,*,A,*,*,*,* -いく,4,4,5105,いく,動詞,非自立可能,*,*,五段-カ行,終止形-一般,イク,行く,*,A,*,*,*,* +Surface,LeftId,RightId,Cost,Writing,POS1,POS2,POS3,POS4,POS5,POS6,Reading_Form,Normalized_Form,Dictionary_Form,Mode,Split_A,Split_B,WordStructure,SynonymGroups 
+た,1,1,8729,た,助動詞,*,*,*,助動詞-タ,終止形-一般,タ,た,,A,,,, +に,2,2,11406,に,助詞,接続助詞,*,*,*,*,ニ,に,,A,,,, +に,3,3,4481,に,助詞,格助詞,*,*,*,*,ニ,に,,A,,,, +京都,6,6,5293,京都,名詞,固有名詞,地名,一般,*,*,キョウト,京都,,A,,,,1/5 +東,7,7,4675,東,名詞,普通名詞,一般,*,*,*,ヒガシ,東,,A,,,, +東京,6,6,2816,東京,名詞,固有名詞,地名,一般,*,*,トウキョウ,東京,,A,,,, +東京都,6,8,5320,東京都,名詞,固有名詞,地名,一般,*,*,トウキョウト,東京都,,B,5/9,,5/9, +行く,4,4,5105,行く,動詞,非自立可能,*,*,五段-カ行,終止形-一般,イク,行く,,A,,,, +行っ,5,5,5122,行っ,動詞,非自立可能,*,*,五段-カ行,連用形-促音便,イッ,行く,7,A,,,, +都,8,8,2914,都,名詞,普通名詞,一般,*,*,*,ト,都,,A,,,, +アイ,7,7,4675,アイ,名詞,普通名詞,一般,*,*,*,アイ,アイ,,A,,,, +アイウ,7,7,4675,アイウ,名詞,普通名詞,一般,*,*,*,アイウ,アイウ,,A,,,, +アイアイウ,6,6,32766,アイウ,名詞,固有名詞,地名,一般,*,*,アイアイウ,アイアイウ,,A,,,, +0,9,9,2478,0,名詞,数詞,*,*,*,*,ゼロ,0,,A,,,, +1,9,9,2478,1,名詞,数詞,*,*,*,*,イチ,1,,A,,,, +2,9,9,2478,2,名詞,数詞,*,*,*,*,ニ,2,,A,,,, +3,9,9,2478,3,名詞,数詞,*,*,*,*,サン,3,,A,,,, +4,9,9,2478,4,名詞,数詞,*,*,*,*,ヨン,4,,A,,,, +5,9,9,2478,5,名詞,数詞,*,*,*,*,ゴ,5,,A,,,, +6,9,9,2478,6,名詞,数詞,*,*,*,*,ロク,6,,A,,,, +7,9,9,2478,7,名詞,数詞,*,*,*,*,ナナ,7,,A,,,, +8,9,9,2478,8,名詞,数詞,*,*,*,*,ハチ,8,,A,,,, +9,9,9,2478,9,名詞,数詞,*,*,*,*,キュウ,9,,A,,,, +〇,9,9,2478,〇,名詞,数詞,*,*,*,*,ゼロ,〇,,A,,,, +一,9,9,2478,一,名詞,数詞,*,*,*,*,イチ,一,,A,,,, +二,9,9,2478,二,名詞,数詞,*,*,*,*,ニ,二,,A,,,, +三,9,9,2478,三,名詞,数詞,*,*,*,*,サン,三,,A,,,, +四,9,9,2478,四,名詞,数詞,*,*,*,*,ヨン,四,,A,,,, +五,9,9,2478,五,名詞,数詞,*,*,*,*,ゴ,五,,A,,,, +六,9,9,2478,六,名詞,数詞,*,*,*,*,ロク,六,,A,,,, +七,9,9,2478,七,名詞,数詞,*,*,*,*,ナナ,七,,A,,,, +八,9,9,2478,八,名詞,数詞,*,*,*,*,ハチ,八,,A,,,, +九,9,9,2478,九,名詞,数詞,*,*,*,*,キュウ,九,,A,,,, +六三四,6,6,0,六三四,名詞,固有名詞,地名,一般,*,*,ムサシ,六三四,,A,,,, +いく,4,4,5105,いく,動詞,非自立可能,*,*,五段-カ行,終止形-一般,イク,行く,,A,,,, いっ,5,5,5122,いっ,動詞,非自立可能,*,*,五段-カ行,連用形-促音便,イッ,行く,34,A,*,*,*,* 
-012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789,9,9,-9000,012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789,名詞,数詞,*,*,*,*,ゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウ,012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789,*,A,*,*,*,* -特a,8,8,2914,特a,名詞,普通名詞,一般,*,*,*,トクエー,特a,*,A,*,*,*,* -な。な,8,8,2914,な。な,名詞,普通名詞,一般,*,*,*,ナナ,な。な,*,C,11,11,*,* \ No newline at end of file 
+012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789,9,9,-9000,012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789,名詞,数詞,*,*,*,*,ゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウ,012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789,,A,,,, +特a,8,8,2914,特a,名詞,普通名詞,一般,*,*,*,トクエー,特a,,A,,,, +な。な,8,8,2914,な。な,名詞,普通名詞,一般,*,*,*,ナナ,な。な,,C,11,11,, diff --git a/src/test/resources/dict/user.csv b/src/test/resources/dict/user.csv index a81d53c5..f35b2546 100644 --- a/src/test/resources/dict/user.csv +++ b/src/test/resources/dict/user.csv @@ -1,4 +1,5 @@ -ぴらる,8,8,-32768,ぴらる,名詞,普通名詞,一般,*,*,*,ピラル,ぴらる,*,A,*,*,*,* -府,8,8,2914,府,名詞,普通名詞,一般,*,*,*,フ,府,*,A,*,*,*,* -東京府,6,6,2816,東京府,名詞,固有名詞,地名,一般,*,*,トウキョウフ,東京府,*,B,5/U1,*,5/U1,1/3 
-すだち,6,6,2816,すだち,被子植物門,双子葉植物綱,ムクロジ目,ミカン科,ミカン属,スダチ,スダチ,すだち,*,A,*,*,*,* +Surface,LeftId,RightId,Cost,Writing,POS1,POS2,POS3,POS4,POS5,POS6,Reading_Form,Normalized_Form,Dictionary_Form,Mode,Split_A,Split_B,WordStructure,SynonymGroups +ぴらる,8,8,-32768,ぴらる,名詞,普通名詞,一般,*,*,*,ピラル,ぴらる,,A,,,, +府,8,8,2914,府,名詞,普通名詞,一般,*,*,*,フ,府,,A,,,, +東京府,6,6,2816,東京府,名詞,固有名詞,地名,一般,*,*,トウキョウフ,東京府,,B,"東京,名詞,固有名詞,地名,一般,*,*,トウキョウ/府,名詞,普通名詞,一般,*,*,*,フ",,"東京,名詞,固有名詞,地名,一般,*,*,トウキョウ/府,名詞,普通名詞,一般,*,*,*,フ",1/3 +すだち,6,6,2816,すだち,被子植物門,双子葉植物綱,ムクロジ目,ミカン科,ミカン属,スダチ,スダチ,すだち,,A,,,, diff --git a/src/test/resources/dict/user2.csv b/src/test/resources/dict/user2.csv index f675ddd9..52e083f4 100644 --- a/src/test/resources/dict/user2.csv +++ b/src/test/resources/dict/user2.csv @@ -1,2 +1,3 @@ -ぴさる,8,8,-32768,ぴさる,名詞,普通名詞,一般,*,*,*,ピサル,ぴさる,*,A,*,*,*,* -かぼす,6,6,2816,かぼす,被子植物門,双子葉植物綱,ムクロジ目,ミカン科,ミカン属,カボス,カボス,かぼす,*,A,*,*,*,* +Surface,LeftId,RightId,Cost,Writing,POS1,POS2,POS3,POS4,POS5,POS6,Reading_Form,Normalized_Form,Dictionary_Form,Mode,Split_A,Split_B,WordStructure,SynonymGroups +ぴさる,8,8,-32768,ぴさる,名詞,普通名詞,一般,*,*,*,ピサル,ぴさる,,A,,,, +かぼす,6,6,2816,かぼす,被子植物門,双子葉植物綱,ムクロジ目,ミカン科,ミカン属,カボス,カボス,かぼす,,A,,,, From f894d93e6012bcec28724a6c56e16139fb22182c Mon Sep 17 00:00:00 2001 From: mh-northlander Date: Wed, 28 Aug 2024 13:40:27 +0900 Subject: [PATCH 73/94] let split-c work and add test for that --- .../nlp/sudachi/JapaneseTokenizer.java | 4 +- .../nlp/sudachi/JapaneseTokenizerTest.java | 8 ++ .../dictionary/DictionaryPrinterTest.kt | 2 +- .../dictionary/DoubleArrayLexiconTest.kt | 4 +- src/test/resources/dict/lex.csv | 81 ++++++++++--------- 5 files changed, 53 insertions(+), 46 deletions(-) diff --git a/src/main/java/com/worksap/nlp/sudachi/JapaneseTokenizer.java b/src/main/java/com/worksap/nlp/sudachi/JapaneseTokenizer.java index fcb21822..effc858b 100644 --- a/src/main/java/com/worksap/nlp/sudachi/JapaneseTokenizer.java +++ b/src/main/java/com/worksap/nlp/sudachi/JapaneseTokenizer.java @@ -188,9 +188,7 @@ MorphemeList 
tokenizeSentence(Tokenizer.SplitMode mode, UTF8InputText input) { } lattice.clear(); - if (mode != Tokenizer.SplitMode.C) { - path = splitPath(path, mode); - } + path = splitPath(path, mode); if (dumpOutput != null) { dumpOutput.println("=== After rewriting:"); diff --git a/src/test/java/com/worksap/nlp/sudachi/JapaneseTokenizerTest.java b/src/test/java/com/worksap/nlp/sudachi/JapaneseTokenizerTest.java index 1ee167c9..ad9321ad 100644 --- a/src/test/java/com/worksap/nlp/sudachi/JapaneseTokenizerTest.java +++ b/src/test/java/com/worksap/nlp/sudachi/JapaneseTokenizerTest.java @@ -371,6 +371,14 @@ public void disableEmptyMorpheme() throws IOException { assertThat(s.get(2).end(), is(1)); } + @Test + public void splitC() { + MorphemeList morphemesC = tokenizer.tokenize(Tokenizer.SplitMode.C, "東東京都"); + assertThat(morphemesC.get(0).surface(), is("東")); + assertThat(morphemesC.get(1).surface(), is("東")); + assertThat(morphemesC.get(2).surface(), is("京都")); + } + @Test public void splitAfterTokenizeCtoA() { MorphemeList morphemesC = tokenizer.tokenize(Tokenizer.SplitMode.C, "東京都"); diff --git a/src/test/java/com/worksap/nlp/sudachi/dictionary/DictionaryPrinterTest.kt b/src/test/java/com/worksap/nlp/sudachi/dictionary/DictionaryPrinterTest.kt index d07d3d61..0f960308 100644 --- a/src/test/java/com/worksap/nlp/sudachi/dictionary/DictionaryPrinterTest.kt +++ b/src/test/java/com/worksap/nlp/sudachi/dictionary/DictionaryPrinterTest.kt @@ -52,7 +52,7 @@ class DictionaryPrinterTest { DictionaryPrinter.printDictionary(filename, null, ps) val lines = output.toString().split(System.lineSeparator()) - assertEquals(41, lines.size) // header + entries + trailing new line + assertEquals(42, lines.size) // header + entries + trailing new line assertEquals( "SURFACE,LEFT_ID,RIGHT_ID,COST,POS1,POS2,POS3,POS4,POS5,POS6,READING_FORM,NORMALIZED_FORM,DICTIONARY_FORM,SPLIT_A,SPLIT_B,SPLIT_C,WORD_STRUCTURE,SYNONYM_GROUPS,USER_DATA", lines[0]) diff --git 
a/src/test/java/com/worksap/nlp/sudachi/dictionary/DoubleArrayLexiconTest.kt b/src/test/java/com/worksap/nlp/sudachi/dictionary/DoubleArrayLexiconTest.kt index aa1ee5cb..0576e1af 100644 --- a/src/test/java/com/worksap/nlp/sudachi/dictionary/DoubleArrayLexiconTest.kt +++ b/src/test/java/com/worksap/nlp/sudachi/dictionary/DoubleArrayLexiconTest.kt @@ -46,7 +46,7 @@ class DoubleArrayLexiconTest { @Test fun iterWordIds() { - assertEquals(39, systemWordIds.length()) + assertEquals(40, systemWordIds.length()) for (i in 0..(systemWordIds.length() - 1)) { lexicon.getWordInfo(systemWordIds.get(i)) } @@ -136,7 +136,7 @@ class DoubleArrayLexiconTest { @Test fun size() { - assertEquals(39, lexicon.size()) + assertEquals(40, lexicon.size()) } @Test fun string() {} diff --git a/src/test/resources/dict/lex.csv b/src/test/resources/dict/lex.csv index 6fd1bbae..8dae2050 100644 --- a/src/test/resources/dict/lex.csv +++ b/src/test/resources/dict/lex.csv @@ -1,40 +1,41 @@ -Surface,LeftId,RightId,Cost,Writing,POS1,POS2,POS3,POS4,POS5,POS6,Reading_Form,Normalized_Form,Dictionary_Form,Mode,Split_A,Split_B,WordStructure,SynonymGroups -た,1,1,8729,た,助動詞,*,*,*,助動詞-タ,終止形-一般,タ,た,,A,,,, -に,2,2,11406,に,助詞,接続助詞,*,*,*,*,ニ,に,,A,,,, -に,3,3,4481,に,助詞,格助詞,*,*,*,*,ニ,に,,A,,,, -京都,6,6,5293,京都,名詞,固有名詞,地名,一般,*,*,キョウト,京都,,A,,,,1/5 -東,7,7,4675,東,名詞,普通名詞,一般,*,*,*,ヒガシ,東,,A,,,, -東京,6,6,2816,東京,名詞,固有名詞,地名,一般,*,*,トウキョウ,東京,,A,,,, -東京都,6,8,5320,東京都,名詞,固有名詞,地名,一般,*,*,トウキョウト,東京都,,B,5/9,,5/9, -行く,4,4,5105,行く,動詞,非自立可能,*,*,五段-カ行,終止形-一般,イク,行く,,A,,,, -行っ,5,5,5122,行っ,動詞,非自立可能,*,*,五段-カ行,連用形-促音便,イッ,行く,7,A,,,, -都,8,8,2914,都,名詞,普通名詞,一般,*,*,*,ト,都,,A,,,, -アイ,7,7,4675,アイ,名詞,普通名詞,一般,*,*,*,アイ,アイ,,A,,,, -アイウ,7,7,4675,アイウ,名詞,普通名詞,一般,*,*,*,アイウ,アイウ,,A,,,, -アイアイウ,6,6,32766,アイウ,名詞,固有名詞,地名,一般,*,*,アイアイウ,アイアイウ,,A,,,, -0,9,9,2478,0,名詞,数詞,*,*,*,*,ゼロ,0,,A,,,, -1,9,9,2478,1,名詞,数詞,*,*,*,*,イチ,1,,A,,,, -2,9,9,2478,2,名詞,数詞,*,*,*,*,ニ,2,,A,,,, -3,9,9,2478,3,名詞,数詞,*,*,*,*,サン,3,,A,,,, -4,9,9,2478,4,名詞,数詞,*,*,*,*,ヨン,4,,A,,,, 
-5,9,9,2478,5,名詞,数詞,*,*,*,*,ゴ,5,,A,,,, -6,9,9,2478,6,名詞,数詞,*,*,*,*,ロク,6,,A,,,, -7,9,9,2478,7,名詞,数詞,*,*,*,*,ナナ,7,,A,,,, -8,9,9,2478,8,名詞,数詞,*,*,*,*,ハチ,8,,A,,,, -9,9,9,2478,9,名詞,数詞,*,*,*,*,キュウ,9,,A,,,, -〇,9,9,2478,〇,名詞,数詞,*,*,*,*,ゼロ,〇,,A,,,, -一,9,9,2478,一,名詞,数詞,*,*,*,*,イチ,一,,A,,,, -二,9,9,2478,二,名詞,数詞,*,*,*,*,ニ,二,,A,,,, -三,9,9,2478,三,名詞,数詞,*,*,*,*,サン,三,,A,,,, -四,9,9,2478,四,名詞,数詞,*,*,*,*,ヨン,四,,A,,,, -五,9,9,2478,五,名詞,数詞,*,*,*,*,ゴ,五,,A,,,, -六,9,9,2478,六,名詞,数詞,*,*,*,*,ロク,六,,A,,,, -七,9,9,2478,七,名詞,数詞,*,*,*,*,ナナ,七,,A,,,, -八,9,9,2478,八,名詞,数詞,*,*,*,*,ハチ,八,,A,,,, -九,9,9,2478,九,名詞,数詞,*,*,*,*,キュウ,九,,A,,,, -六三四,6,6,0,六三四,名詞,固有名詞,地名,一般,*,*,ムサシ,六三四,,A,,,, -いく,4,4,5105,いく,動詞,非自立可能,*,*,五段-カ行,終止形-一般,イク,行く,,A,,,, -いっ,5,5,5122,いっ,動詞,非自立可能,*,*,五段-カ行,連用形-促音便,イッ,行く,34,A,*,*,*,* -012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789,9,9,-9000,012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789,名詞,数詞,*,*,*,*,ゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウ,01234567890123456789012345678901234567
8901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789,,A,,,, -特a,8,8,2914,特a,名詞,普通名詞,一般,*,*,*,トクエー,特a,,A,,,, -な。な,8,8,2914,な。な,名詞,普通名詞,一般,*,*,*,ナナ,な。な,,C,11,11,, +Surface,LeftId,RightId,Cost,Writing,POS1,POS2,POS3,POS4,POS5,POS6,Reading_Form,Normalized_Form,Dictionary_Form,Mode,Split_A,Split_B,Split_C,WordStructure,SynonymGroups +た,1,1,8729,た,助動詞,*,*,*,助動詞-タ,終止形-一般,タ,た,,A,,,,, +に,2,2,11406,に,助詞,接続助詞,*,*,*,*,ニ,に,,A,,,,, +に,3,3,4481,に,助詞,格助詞,*,*,*,*,ニ,に,,A,,,,, +京都,6,6,5293,京都,名詞,固有名詞,地名,一般,*,*,キョウト,京都,,A,,,,,1/5 +東,7,7,4675,東,名詞,普通名詞,一般,*,*,*,ヒガシ,東,,A,,,,, +東京,6,6,2816,東京,名詞,固有名詞,地名,一般,*,*,トウキョウ,東京,,A,,,,, +東京都,6,8,5320,東京都,名詞,固有名詞,地名,一般,*,*,トウキョウト,東京都,,B,5/9,,,5/9, +行く,4,4,5105,行く,動詞,非自立可能,*,*,五段-カ行,終止形-一般,イク,行く,,A,,,,, +行っ,5,5,5122,行っ,動詞,非自立可能,*,*,五段-カ行,連用形-促音便,イッ,行く,7,A,,,,, +都,8,8,2914,都,名詞,普通名詞,一般,*,*,*,ト,都,,A,,,,, +アイ,7,7,4675,アイ,名詞,普通名詞,一般,*,*,*,アイ,アイ,,A,,,,, +アイウ,7,7,4675,アイウ,名詞,普通名詞,一般,*,*,*,アイウ,アイウ,,A,,,,, +アイアイウ,6,6,32766,アイウ,名詞,固有名詞,地名,一般,*,*,アイアイウ,アイアイウ,,A,,,,, +0,9,9,2478,0,名詞,数詞,*,*,*,*,ゼロ,0,,A,,,,, +1,9,9,2478,1,名詞,数詞,*,*,*,*,イチ,1,,A,,,,, +2,9,9,2478,2,名詞,数詞,*,*,*,*,ニ,2,,A,,,,, +3,9,9,2478,3,名詞,数詞,*,*,*,*,サン,3,,A,,,,, +4,9,9,2478,4,名詞,数詞,*,*,*,*,ヨン,4,,A,,,,, +5,9,9,2478,5,名詞,数詞,*,*,*,*,ゴ,5,,A,,,,, +6,9,9,2478,6,名詞,数詞,*,*,*,*,ロク,6,,A,,,,, +7,9,9,2478,7,名詞,数詞,*,*,*,*,ナナ,7,,A,,,,, +8,9,9,2478,8,名詞,数詞,*,*,*,*,ハチ,8,,A,,,,, +9,9,9,2478,9,名詞,数詞,*,*,*,*,キュウ,9,,A,,,,, +〇,9,9,2478,〇,名詞,数詞,*,*,*,*,ゼロ,〇,,A,,,,, +一,9,9,2478,一,名詞,数詞,*,*,*,*,イチ,一,,A,,,,, +二,9,9,2478,二,名詞,数詞,*,*,*,*,ニ,二,,A,,,,, +三,9,9,2478,三,名詞,数詞,*,*,*,*,サン,三,,A,,,,, +四,9,9,2478,四,名詞,数詞,*,*,*,*,ヨン,四,,A,,,,, +五,9,9,2478,五,名詞,数詞,*,*,*,*,ゴ,五,,A,,,,, +六,9,9,2478,六,名詞,数詞,*,*,*,*,ロク,六,,A,,,,, +七,9,9,2478,七,名詞,数詞,*,*,*,*,ナナ,七,,A,,,,, +八,9,9,2478,八,名詞,数詞,*,*,*,*,ハチ,八,,A,,,,, 
+九,9,9,2478,九,名詞,数詞,*,*,*,*,キュウ,九,,A,,,,, +六三四,6,6,0,六三四,名詞,固有名詞,地名,一般,*,*,ムサシ,六三四,,A,,,,, +いく,4,4,5105,いく,動詞,非自立可能,*,*,五段-カ行,終止形-一般,イク,行く,,A,,,,, +いっ,5,5,5122,いっ,動詞,非自立可能,*,*,五段-カ行,連用形-促音便,イッ,行く,34,A,,,,, +012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789,9,9,-9000,012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789,名詞,数詞,*,*,*,*,ゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウ,012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789,,A,,,,, +特a,8,8,2914,特a,名詞,普通名詞,一般,*,*,*,トクエー,特a,,A,,,,, +な。な,8,8,2914,な。な,名詞,普通名詞,一般,*,*,*,ナナ,な。な,,C,11,11,,, +東東京都,6,8,6320,東東京都,名詞,固有名詞,地名,一般,*,*,トウトウキョウト,東東京都,,C,,,4/4/3,, From b0afe068e421a87d65d98db28a6147a5e9c43d14 Mon Sep 17 00:00:00 2001 From: mh-northlander Date: Thu, 29 Aug 2024 
11:15:13 +0900 Subject: [PATCH 74/94] Add pos-id column for POS csv --- .../dictionary/DictionaryGrammarPrinter.java | 17 +- .../sudachi/dictionary/build/POSTable.java | 243 ++++++++++++++++-- .../DictionaryGrammarPrinterTest.kt | 3 + .../sudachi/dictionary/build/POSTableTest.kt | 70 ++++- src/test/resources/dict/pos.csv | 17 +- 5 files changed, 318 insertions(+), 32 deletions(-) diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/DictionaryGrammarPrinter.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/DictionaryGrammarPrinter.java index 5c17ef42..3abad3f7 100644 --- a/src/main/java/com/worksap/nlp/sudachi/dictionary/DictionaryGrammarPrinter.java +++ b/src/main/java/com/worksap/nlp/sudachi/dictionary/DictionaryGrammarPrinter.java @@ -19,6 +19,11 @@ import java.io.Console; import java.io.IOException; import java.io.PrintStream; +import java.util.Arrays; +import java.util.List; +import java.util.stream.Collectors; + +import com.worksap.nlp.sudachi.dictionary.build.POSTable; /** * A dictionary grammar printing tool. @@ -32,11 +37,17 @@ static void printUsage() { console.printf("usage: DictionaryGrammarPrinter files... 
\n"); } + static void printHeader(PrintStream output) { + List columnNames = Arrays.asList(POSTable.POSCSVReader.Column.values()).stream().map(c -> c.name()) + .collect(Collectors.toList()); + output.println(String.join(",", columnNames)); + } + static void printPos(GrammarImpl grammar, PrintStream output) { int numPos = grammar.getPartOfSpeechSize(); for (int i = 0; i < numPos; i++) { POS pos = grammar.getPartOfSpeechString((short) i); - output.println(pos.toString()); + output.println(i + "," + pos); } } @@ -59,7 +70,9 @@ public static void main(String[] args) throws IOException { BinaryDictionary dict = new BinaryDictionary(args[i]); GrammarImpl grammar = dict.getGrammar(); - printPos(grammar, System.out); + PrintStream output = System.out; + printHeader(output); + printPos(grammar, output); dict.close(); } } diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/POSTable.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/POSTable.java index 1a6e3312..5bcc32dd 100644 --- a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/POSTable.java +++ b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/POSTable.java @@ -22,6 +22,8 @@ import java.io.*; import java.nio.charset.StandardCharsets; import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collections; import java.util.HashMap; import java.util.List; @@ -35,7 +37,7 @@ public class POSTable { private final HashMap lookup = new HashMap<>(); public boolean allowNewPos = true; // number of pos loaded from the system dictionary. - private int builtin = 0; + private short builtin = 0; /** * Returns the id of given POS, updating table if it's not in the list. @@ -76,6 +78,53 @@ public int ownedLength() { return table.size() - builtin; } + /** + * Add pos at the index `id` of the table. This may creates null entry in the + * table. 
+ * + * @param pos + * POS to add + * @param id + * pos-id (index) to add POS at + * @return pos-id + */ + private short addPosAt(POS pos, short id) { + if (!allowNewPos) { + throw new IllegalArgumentException(String.format("new POS is not allowed", pos)); + } + if (id >= MAX_POS_NUMBER) { + throw new IllegalArgumentException("id " + id + " exceeds the maximum POS number"); + } + + if (table.size() <= id) { + table.addAll(Collections.nCopies(id - table.size() + 1, null)); + } + POS current = table.get(id); + if (current != null) { + throw new IllegalArgumentException(String.format("POS already exists (%s): %s", id, current)); + } + table.set(id, pos); + lookup.put(pos, id); + return id; + } + + /** + * Assure that the table has no null entry. + * + * Should be called after addPosAt is used. + */ + private void assureNoEmptyEntry() { + List nullIndices = new ArrayList<>(); + for (int i = 0; i < table.size(); i++) { + if (table.get(i) == null) { + nullIndices.add(i); + } + } + if (!nullIndices.isEmpty()) { + throw new IllegalStateException(String.format("Missing POS-Ids found: %s", nullIndices)); + } + } + /** * Load pos table from the grammar (of the system dictionary). They are * considered as built-in pos. @@ -84,47 +133,201 @@ public int ownedLength() { * @return number read. */ public int preloadFrom(Grammar grammar) { - int partOfSpeechSize = grammar.getPartOfSpeechSize(); + if (!table.isEmpty()) { + throw new IllegalStateException("POSTable.preloadFrom must be called before any other POS is added."); + } + + short partOfSpeechSize = (short) grammar.getPartOfSpeechSize(); for (short i = 0; i < partOfSpeechSize; ++i) { POS pos = grammar.getPartOfSpeechString(i); table.add(pos); lookup.put(pos, i); } - builtin += partOfSpeechSize; + + builtin = partOfSpeechSize; return partOfSpeechSize; } /** - * Load pos table from the text. Assume 6-column csv without header. + * Load pos table from the text. 
+ * + * Assume csv format, with POS_Id (not required) and 6 POS parts columns. * * @param data * @return number read. */ public int readEntries(InputStream data) throws IOException { - LineNumberReader reader = new LineNumberReader(new InputStreamReader(data, StandardCharsets.UTF_8)); + POSCSVReader reader = new POSCSVReader(data); int baseSize = table.size(); - int numLines = 0; - while (true) { - String line = reader.readLine(); - if (line == null) { - break; + int numAdded = 0; + POSWithId posWithId; + while ((posWithId = reader.nextPos()) != null) { + if (!reader.hasIdColumn) { + int posId = getId(posWithId.pos); + if (posId != baseSize + numAdded) { + throw new InputFileException(numAdded, new IllegalArgumentException( + String.format("POS already exists (%s): %s", posId, table.get(posId).toString()))); + } + } else { + addPosAt(posWithId.pos, posWithId.id); } + numAdded += 1; + } + assureNoEmptyEntry(); + return numAdded; + } - String[] cols = line.split(","); - if (cols.length != 6) { - throw new InputFileException(numLines, - new IllegalArgumentException(String.format("each POS must have 6 columns: %s", line))); + /** + * Data class for pos read from csv. + */ + static class POSWithId { + public POS pos; + public short id = -1; + public int sourceLine; + + POSWithId(POS pos, short id) { + this.pos = pos; + this.id = id; + } + + POSWithId(POS pos) { + this.pos = pos; + } + } + + /** + * POS CSV reader. + * + * Each row must have 6 pos parts. Pos id can be missing (filled with -1). 
+ */ + public static class POSCSVReader { + private CSVParser parser; + private int[] columnMapping; + private List cachedRow; + public boolean hasIdColumn = true; + + public Column[] PART_COLUMNS = { Column.POS1, Column.POS2, Column.POS3, Column.POS4, Column.POS5, Column.POS6 }; + + public enum Column { + POS_ID(false), POS1(true), POS2(true), POS3(true), POS4(true), POS5(true), POS6(true); + + private final boolean required; + + Column(boolean required) { + this.required = required; } - int posid = getId(new POS(cols)); - if (posid != baseSize + numLines) { - throw new InputFileException(numLines, - new IllegalArgumentException(String.format("POS already exists (%s): %s", posid, line))); + /** + * Parse string as Column, ignoring "_" and cases. + */ + public static Column fromString(String str) { + String processed = str.replace("_", ""); + for (Column col : Column.values()) { + if (col.name().replace("_", "").equalsIgnoreCase(processed)) { + return col; + } + } + return null; + } + } + + POSCSVReader(InputStream data) throws IOException { + this.parser = new CSVParser(new InputStreamReader(data, StandardCharsets.UTF_8)); + parser.setName("POS csv"); + resolveColumnLayout(); + } + + /** + * Resolve header line. + * + * POS id column can be missing even if there are no header. 
+ */ + private void resolveColumnLayout() throws IOException { + List row = parser.getNextRow(); + Column parsed = Column.fromString(row.get(0)); + if (parsed == null) { // no header line + this.cachedRow = row; + this.columnMapping = null; + if (row.size() == 6) { + this.columnMapping = new int[] { -1, 0, 1, 2, 3, 4, 5 }; + this.hasIdColumn = false; + } + return; + } + + List remaining = new ArrayList<>(Arrays.asList(Column.values())); + columnMapping = new int[Column.values().length]; + Arrays.fill(columnMapping, -1); + for (int colIdx = 0; colIdx < row.size(); colIdx++) { + String elem = row.get(colIdx); + parsed = Column.fromString(elem); + if (!remaining.contains(parsed)) { + throw new InputFileException(parser.getName(), 0, elem, + new IllegalArgumentException("Invalid column name")); + } + columnMapping[parsed.ordinal()] = colIdx; + remaining.remove(remaining.indexOf(parsed)); + } + for (Column col : remaining) { + if (col.required) { + throw new InputFileException(parser.getName(), 0, col.name(), + new IllegalArgumentException("Required column is missing")); + } + } + if (remaining.contains(Column.POS_ID)) { + hasIdColumn = false; + } + } + + private int getIdx(List data, Column column) { + int idx = column.ordinal(); + if (columnMapping != null) { + idx = columnMapping[idx]; + } + if ((idx < 0 && column.required) || idx >= data.size()) { + throw new InputFileException(parser.getName(), parser.getRowCount(), column.name(), + new IllegalArgumentException(String.format("column [%s] was not present", column.name()))); + } + return idx; + } + + private POS getPos(List data) { + String[] parts = new String[6]; + for (int idx = 0; idx < PART_COLUMNS.length; idx++) { + parts[idx] = data.get(getIdx(data, PART_COLUMNS[idx])); + } + return new POS(parts); + } + + private POSWithId convertRow(List data) { + POS pos = getPos(data); + int idIdx = getIdx(data, Column.POS_ID); + if (idIdx < 0) { + return new POSWithId(pos); + } + return new POSWithId(pos, 
Short.parseShort(data.get(idIdx))); + } + + /** + * Parse next line as a set of POS and id. + * + * returned pos-id is -1 when pos-id column is missing. + */ + POSWithId nextPos() throws IOException { + List row = cachedRow; + if (row == null) { + row = parser.getNextRow(); + } else { + cachedRow = null; + } + if (row == null) { + return null; } - numLines += 1; + POSWithId pos = convertRow(row); + pos.sourceLine = parser.getRowCount(); + return pos; } - return numLines; } /** diff --git a/src/test/java/com/worksap/nlp/sudachi/dictionary/DictionaryGrammarPrinterTest.kt b/src/test/java/com/worksap/nlp/sudachi/dictionary/DictionaryGrammarPrinterTest.kt index 77021a00..065d81cc 100644 --- a/src/test/java/com/worksap/nlp/sudachi/dictionary/DictionaryGrammarPrinterTest.kt +++ b/src/test/java/com/worksap/nlp/sudachi/dictionary/DictionaryGrammarPrinterTest.kt @@ -32,6 +32,9 @@ class DictionaryGrammarPrinterTest { val lines = output.toString().split(System.lineSeparator()) assertEquals(8 + 1, lines.size) // system 8 + last newline + + val cols = lines.get(0).split(",") + assertEquals(7, cols.size) // id + 6 parts } @Test diff --git a/src/test/java/com/worksap/nlp/sudachi/dictionary/build/POSTableTest.kt b/src/test/java/com/worksap/nlp/sudachi/dictionary/build/POSTableTest.kt index 7c974b62..3cc2446d 100644 --- a/src/test/java/com/worksap/nlp/sudachi/dictionary/build/POSTableTest.kt +++ b/src/test/java/com/worksap/nlp/sudachi/dictionary/build/POSTableTest.kt @@ -36,9 +36,75 @@ class POSTableTest { } @Test - fun inhibitReadingDuplicatePos() { - val dupPoss = """名詞,普通名詞,一般,*,*,*\n名詞,普通名詞,一般,*,*,*""" + fun allowNoHeaderWithPosId() { + val poss = """0,名詞,普通名詞,一般,*,*,* +1,助詞,接続助詞,*,*,*,*""" + val posTable = POSTable() + posTable.readEntries(poss.byteInputStream()) + + val pos = POS("名詞", "普通名詞", "一般", "*", "*", "*") + assertEquals(0, posTable.getId(pos)) + } + + @Test + fun allowNoHeaderWithoutPosId() { + val poss = """名詞,普通名詞,一般,*,*,* +助詞,接続助詞,*,*,*,*""" + val posTable = 
POSTable() + posTable.readEntries(poss.byteInputStream()) + + val pos = POS("名詞", "普通名詞", "一般", "*", "*", "*") + assertEquals(0, posTable.getId(pos)) + } + + @Test + fun allowNotOrderedColumnsWithPosid() { + val poss = """pos5,pos6,posId,pos1,pos2,pos3,pos4 +*,*,0,名詞,普通名詞,一般,* +*,*,1,助詞,接続助詞,*,*""" + val posTable = POSTable() + posTable.readEntries(poss.byteInputStream()) + + val pos = POS("名詞", "普通名詞", "一般", "*", "*", "*") + assertEquals(0, posTable.getId(pos)) + } + + @Test + fun allowNotOrderedColumnsWithoutPosid() { + val poss = """pos5,pos6,pos1,pos2,pos3,pos4 +*,*,名詞,普通名詞,一般,* +*,*,助詞,接続助詞,*,*""" + val posTable = POSTable() + posTable.readEntries(poss.byteInputStream()) + + val pos = POS("名詞", "普通名詞", "一般", "*", "*", "*") + assertEquals(0, posTable.getId(pos)) + } + @Test + fun allowNotOrderedPosIds() { + val poss = """posId,pos1,pos2,pos3,pos4,pos5,pos6 +1,名詞,普通名詞,一般,*,*,* +0,助詞,接続助詞,*,*,*,*""" + val posTable = POSTable() + posTable.readEntries(poss.byteInputStream()) + + val pos = POS("名詞", "普通名詞", "一般", "*", "*", "*") + assertEquals(1, posTable.getId(pos)) + } + + @Test + fun inhibitMissingPosId() { + val poss = """posId,pos1,pos2,pos3,pos4,pos5,pos6 +1,名詞,普通名詞,一般,*,*,*""" + val posTable = POSTable() + assertFails { posTable.readEntries(poss.byteInputStream()) } + } + + @Test + fun inhibitReadingDuplicatePos() { + val dupPoss = """名詞,普通名詞,一般,*,*,* +名詞,普通名詞,一般,*,*,*""" val posTable = POSTable() assertFails { posTable.readEntries(dupPoss.byteInputStream()) } } diff --git a/src/test/resources/dict/pos.csv b/src/test/resources/dict/pos.csv index 79312c17..4dca1a1f 100644 --- a/src/test/resources/dict/pos.csv +++ b/src/test/resources/dict/pos.csv @@ -1,8 +1,9 @@ -助動詞,*,*,*,助動詞-タ,終止形-一般 -助詞,接続助詞,*,*,*,* -助詞,格助詞,*,*,*,* -動詞,非自立可能,*,*,五段-カ行,終止形-一般 -動詞,非自立可能,*,*,五段-カ行,連用形-促音便 -名詞,固有名詞,地名,一般,*,* -名詞,数詞,*,*,*,* -名詞,普通名詞,一般,*,*,* +posId,pos1,pos2,pos3,pos4,pos5,pos6 +0,助動詞,*,*,*,助動詞-タ,終止形-一般 +1,助詞,接続助詞,*,*,*,* +2,助詞,格助詞,*,*,*,* +3,動詞,非自立可能,*,*,五段-カ行,終止形-一般 
+4,動詞,非自立可能,*,*,五段-カ行,連用形-促音便 +5,名詞,固有名詞,地名,一般,*,* +6,名詞,数詞,*,*,*,* +7,名詞,普通名詞,一般,*,*,* From 73c4311a42c5a525f0fbab8393a8d5f14a9c05ad Mon Sep 17 00:00:00 2001 From: mh-northlander Date: Fri, 30 Aug 2024 11:23:25 +0900 Subject: [PATCH 75/94] allow using posid in the lexicon wordref, and improve dict-printer --- .../sudachi/dictionary/DictionaryPrinter.java | 181 ++++++++++++------ .../dictionary/build/RawLexiconReader.java | 16 +- .../nlp/sudachi/dictionary/build/WordRef.java | 8 + .../dictionary/DictionaryPrinterTest.kt | 115 ++++++++--- .../sudachi/dictionary/build/SystemDicTest.kt | 11 ++ .../sudachi/dictionary/build/UserDicTest.kt | 30 +++ .../sudachi/dictionary/build/wordref-user.csv | 3 + .../nlp/sudachi/dictionary/build/wordref.csv | 9 + 8 files changed, 281 insertions(+), 92 deletions(-) create mode 100644 src/test/resources/com/worksap/nlp/sudachi/dictionary/build/wordref-user.csv create mode 100644 src/test/resources/com/worksap/nlp/sudachi/dictionary/build/wordref.csv diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/DictionaryPrinter.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/DictionaryPrinter.java index a84437c6..f688b5dc 100644 --- a/src/main/java/com/worksap/nlp/sudachi/dictionary/DictionaryPrinter.java +++ b/src/main/java/com/worksap/nlp/sudachi/dictionary/DictionaryPrinter.java @@ -30,17 +30,37 @@ import java.util.Iterator; import java.util.List; import java.util.stream.Collectors; +import java.util.stream.Stream; public class DictionaryPrinter { private final PrintStream output; - private final Progress progress = Progress.syserr(20); + private Progress progress = Progress.syserr(20); private final GrammarImpl grammar; private final LexiconSet lex; // sorted raw word ids taken from the target dict. 
private final Ints wordIds; - private DictionaryPrinter(PrintStream output, BinaryDictionary dic, BinaryDictionary base) { + private POSMode posMode = POSMode.DEFAULT; + private WordRefMode wordRefMode = WordRefMode.DEFAULT; + + public enum POSMode { + PARTS, ID, BOTH; + + public static POSMode DEFAULT = PARTS; + } + + public enum WordRefMode { + TRIPLE_PARTS, TRIPLE_ID; + + public static WordRefMode DEFAULT = TRIPLE_PARTS; + } + + DictionaryPrinter(PrintStream output, BinaryDictionary dic, BinaryDictionary base) { + if (dic.getDictionaryHeader().isUserDictionary() && base == null) { + throw new IllegalArgumentException("System dictionary is required to print user dictionary"); + } + this.output = output; if (base == null) { @@ -67,22 +87,53 @@ private DictionaryPrinter(PrintStream output, BinaryDictionary dic, BinaryDictio wordIds = allIds; } + DictionaryPrinter(PrintStream output, BinaryDictionary dic, BinaryDictionary base, POSMode posMode, + WordRefMode wordRefMode) { + this(output, dic, base); + this.posMode = posMode; + this.wordRefMode = wordRefMode; + } + + void setProgress(Progress progress) { + this.progress = progress; + } + static void printUsage() { Console console = System.console(); - console.printf("usage: PrintDictionary [-s file] file\n"); - console.printf("\t-s file\tsystem dictionary\n"); + console.printf("usage: PrintDictionary [-s file] [--posMode mode] [--wordRefMode mode] file\n"); + console.printf("\t-s file\tsystem dictionary. 
required to print user dictionary.\n"); + console.printf("\t--posMode [PARTS, ID, BOTH]\tprint specified POS column (default PARTS).\n"); + console.printf( + "\t--wordRefMode [TRIPLE_PARTS, TRIPLE_ID]\tprint word-reference in specified format (default TRIPLE_PARTS).\n"); + } + + void printDictionary() { + printHeader(); + printEntries(); } void printHeader() { - // @formatter:off - printColumnHeaders(Column.SURFACE, Column.LEFT_ID, Column.RIGHT_ID, Column.COST, Column.POS1, Column.POS2, - Column.POS3, Column.POS4, Column.POS5, Column.POS6, Column.READING_FORM, Column.NORMALIZED_FORM, - Column.DICTIONARY_FORM, Column.SPLIT_A, Column.SPLIT_B, Column.SPLIT_C, Column.WORD_STRUCTURE, - Column.SYNONYM_GROUPS, Column.USER_DATA); - // @formatter:on + List posColumns; + if (posMode == POSMode.PARTS) { + posColumns = Arrays.asList(Column.POS1, Column.POS2, Column.POS3, Column.POS4, Column.POS5, Column.POS6); + } else if (posMode == POSMode.ID) { + posColumns = Arrays.asList(Column.POS_ID); + } else { // BOTH + posColumns = Arrays.asList(Column.POS_ID, Column.POS1, Column.POS2, Column.POS3, Column.POS4, Column.POS5, + Column.POS6); + } + + List headerColumns = Stream + .of(Arrays.asList(Column.SURFACE, Column.LEFT_ID, Column.RIGHT_ID, Column.COST), posColumns, + Arrays.asList(Column.READING_FORM, Column.NORMALIZED_FORM, Column.DICTIONARY_FORM, + Column.SPLIT_A, Column.SPLIT_B, Column.SPLIT_C, Column.WORD_STRUCTURE, + Column.SYNONYM_GROUPS, Column.USER_DATA)) + .flatMap(l -> l.stream()).collect(Collectors.toList()); + + printColumnHeaders(headerColumns); } - void printColumnHeaders(Column... headers) { + void printColumnHeaders(List headers) { boolean isFirst = true; for (Column c : headers) { if (isFirst) { @@ -98,7 +149,7 @@ void printColumnHeaders(Column... 
headers) { private void printEntries() { progress.startBlock("Entries", System.nanoTime(), Progress.Kind.ENTRY); long size = wordIds.length(); - for (int i = 0; i < wordIds.length(); ++i) { + for (int i = 0; i < size; ++i) { printEntry(wordIds.get(i)); progress.progress(i, size); } @@ -108,7 +159,8 @@ private void printEntries() { void printEntry(int wordId) { int dic = WordId.dic(wordId); WordInfo info = lex.getWordInfo(wordId); - POS pos = grammar.getPartOfSpeechString(info.getPOSId()); + short posId = info.getPOSId(); + POS pos = grammar.getPartOfSpeechString(posId); long params = lex.parameters(wordId); short leftId = WordParameters.leftId(params); short rightId = WordParameters.rightId(params); @@ -117,12 +169,17 @@ void printEntry(int wordId) { field(leftId); field(rightId); field(cost); - field(pos.get(0)); - field(pos.get(1)); - field(pos.get(2)); - field(pos.get(3)); - field(pos.get(4)); - field(pos.get(5)); + if (posMode == POSMode.ID || posMode == POSMode.BOTH) { + field(posId); + } + if (posMode == POSMode.PARTS || posMode == POSMode.BOTH) { + field(pos.get(0)); + field(pos.get(1)); + field(pos.get(2)); + field(pos.get(3)); + field(pos.get(4)); + field(pos.get(5)); + } field(lex.string(dic, info.getReadingForm())); field(wordRefHeadword(info.getNormalizedForm(), wordId)); field(wordRef(info.getDictionaryForm(), wordId)); @@ -163,15 +220,21 @@ String wordRef(int wordId, int reference) { /** encode word entry pointed by the wordId as WordRef.Triple. 
*/ String wordRef(int wordId) { WordInfo info = lex.getWordInfo(wordId); - POS pos = grammar.getPartOfSpeechString(info.getPOSId()); int dic = WordId.dic(wordId); String surface = lex.string(dic, info.getSurface()); + short posId = info.getPOSId(); String reading = lex.string(dic, info.getReadingForm()); - List parts = new ArrayList<>(1 + POS.DEPTH + 1); - parts.add(surface); - parts.addAll(pos); - parts.add(reading); + List parts; + if (wordRefMode == WordRefMode.TRIPLE_ID) { + parts = Arrays.asList(surface, String.valueOf(posId), reading); + } else { + POS pos = grammar.getPartOfSpeechString(posId); + parts = new ArrayList<>(1 + POS.DEPTH + 1); + parts.add(surface); + parts.addAll(pos); + parts.add(reading); + } return String.join(String.valueOf(WordRef.Parser.WORDREF_DELIMITER), parts.stream().map(this::maybeEscapeRefPart).collect(Collectors.toList())); @@ -258,27 +321,6 @@ static String splitToString(int[] split) { } } - static void printDictionary(String filename, BinaryDictionary systemDict, PrintStream output) throws IOException { - try (BinaryDictionary dictionary = new BinaryDictionary(filename)) { - DictionaryPrinter dp; - if (dictionary.getDictionaryHeader().isUserDictionary()) { - if (systemDict == null) { - throw new IllegalArgumentException( - "System dictionary (`-s` option) is required to print user dictionary: " + filename); - } - dp = new DictionaryPrinter(output, dictionary, systemDict); - } else if (dictionary.getDictionaryHeader().isSystemDictionary()) { - dp = new DictionaryPrinter(output, dictionary, null); - } else { - // should not happen - throw new IllegalStateException("Invalid dictionary"); - } - - dp.printHeader(); - dp.printEntries(); - } - } - /** * Prints the contents of dictionary. * @@ -294,29 +336,44 @@ static void printDictionary(String filename, BinaryDictionary systemDict, PrintS * This tool requires the system dictionary when it dumps an user dictionary. 
* * @param args - * the option and the input filename + * the option and the input filename * @throws IOException - * if IO + * if IO */ public static void main(String[] args) throws IOException { - BinaryDictionary systemDict = null; - - try { - int i = 0; - for (i = 0; i < args.length; i++) { - if (args[i].equals("-s") && i + 1 < args.length) { - systemDict = BinaryDictionary.loadSystem(args[++i]); - } else if (args[i].equals("-h")) { - printUsage(); - return; - } else { - break; - } + String systemDictPath = null; + POSMode posMode = POSMode.PARTS; + WordRefMode wordRefMode = WordRefMode.TRIPLE_PARTS; + + int i = 0; + for (i = 0; i < args.length; i++) { + if (args[i].equals("-h")) { + printUsage(); + return; + } else if (args[i].equals("-s") && i + 1 < args.length) { + systemDictPath = args[++i]; + } else if (args[i].equals("--posMode") && i + 1 < args.length) { + posMode = POSMode.valueOf(args[++i]); + } else if (args[i].equals("--wordRefMode") && i + 1 < args.length) { + wordRefMode = WordRefMode.valueOf(args[++i]); + } else { + break; } + } + if (i >= args.length) { + System.console().printf("target dictionary file is missing"); + return; + } - if (i < args.length) { - printDictionary(args[i], systemDict, System.out); + String dictPath = args[i]; + BinaryDictionary systemDict = null; + try (BinaryDictionary dict = new BinaryDictionary(dictPath)) { + if (systemDictPath != null) { + systemDict = BinaryDictionary.loadSystem(systemDictPath); } + + DictionaryPrinter printer = new DictionaryPrinter(System.out, dict, systemDict, posMode, wordRefMode); + printer.printDictionary(); } finally { if (systemDict != null) { systemDict.close(); diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/RawLexiconReader.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/RawLexiconReader.java index 6ab959e4..69ca9797 100644 --- a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/RawLexiconReader.java +++ 
b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/RawLexiconReader.java @@ -256,6 +256,14 @@ private short getPos(List data) { short posId = -1; short posStrId = -1; + if (strColumnExists && (!idColumnExists || !get(data, Column.POS1, false).isEmpty())) { + // if both id/parts exist, allow empty (-1) + POS pos = new POS( + // comment for line break + get(data, Column.POS1, true), get(data, Column.POS2, true), get(data, Column.POS3, true), + get(data, Column.POS4, true), get(data, Column.POS5, true), get(data, Column.POS6, true)); + posStrId = posTable.getId(pos); + } if (idColumnExists && (!strColumnExists || !get(data, Column.POS_ID, false).isEmpty())) { // if both id/parts exist, allow empty (-1) posId = getShort(data, Column.POS_ID); @@ -266,14 +274,6 @@ private short getPos(List data) { String.format("POS for id %d is not present in the table.", posId))); } } - if (strColumnExists && (!idColumnExists || !get(data, Column.POS1, false).isEmpty())) { - // if both id/parts exist, allow empty (-1) - POS pos = new POS( - // comment for line break - get(data, Column.POS1, true), get(data, Column.POS2, true), get(data, Column.POS3, true), - get(data, Column.POS4, true), get(data, Column.POS5, true), get(data, Column.POS6, true)); - posStrId = posTable.getId(pos); - } if (idColumnExists && strColumnExists) { if (posId < 0 && posStrId < 0) { diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/WordRef.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/WordRef.java index 9eff2797..69e59eec 100644 --- a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/WordRef.java +++ b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/WordRef.java @@ -241,6 +241,14 @@ public WordRef parse(String text) { return new Triple(headword, posId, reading); } + if (StringUtil.count(text, WORDREF_DELIMITER) == 2) { + String[] cols = text.split(String.valueOf(WORDREF_DELIMITER), 3); + String headword = Unescape.unescape(cols[0]); + short posId = 
Short.parseShort(cols[1]); + String reading = Unescape.unescape(cols[2]); + return new Triple(headword, posId, reading); + } + if (allowHeadword) { return new Headword(Unescape.unescape(text)); } else { diff --git a/src/test/java/com/worksap/nlp/sudachi/dictionary/DictionaryPrinterTest.kt b/src/test/java/com/worksap/nlp/sudachi/dictionary/DictionaryPrinterTest.kt index 0f960308..6ce841b1 100644 --- a/src/test/java/com/worksap/nlp/sudachi/dictionary/DictionaryPrinterTest.kt +++ b/src/test/java/com/worksap/nlp/sudachi/dictionary/DictionaryPrinterTest.kt @@ -20,9 +20,11 @@ import com.worksap.nlp.sudachi.TestDictionary import com.worksap.nlp.sudachi.Utils import com.worksap.nlp.sudachi.dictionary.build.DicBuilder import com.worksap.nlp.sudachi.dictionary.build.MemChannel +import com.worksap.nlp.sudachi.dictionary.build.Progress import com.worksap.nlp.sudachi.res import java.io.ByteArrayOutputStream import java.io.FileOutputStream +import java.io.OutputStream import java.io.PrintStream import java.nio.file.Files import java.nio.file.Path @@ -44,12 +46,26 @@ class DictionaryPrinterTest { Utils.copyResource(tempDir, "/unk.def") } + fun printDictionary( + output: OutputStream, + filename: String, + system: BinaryDictionary? 
= null, + posMode: DictionaryPrinter.POSMode = DictionaryPrinter.POSMode.DEFAULT, + wordRefMode: DictionaryPrinter.WordRefMode = DictionaryPrinter.WordRefMode.DEFAULT + ) { + val ps = PrintStream(output) + val filepath = tempDir.resolve(filename).toString() + val dict = BinaryDictionary(filepath) + val printer = DictionaryPrinter(ps, dict, system, posMode, wordRefMode) + printer.setProgress(Progress.NOOP) // suppress progress + printer.printDictionary() + dict.close() + } + @Test fun printSystemDict() { - val filename = tempDir.resolve("system.dic").toString() val output = ByteArrayOutputStream() - val ps = PrintStream(output) - DictionaryPrinter.printDictionary(filename, null, ps) + printDictionary(output, "system.dic") val lines = output.toString().split(System.lineSeparator()) assertEquals(42, lines.size) // header + entries + trailing new line @@ -58,14 +74,58 @@ class DictionaryPrinterTest { lines[0]) assertEquals("た,1,1,8729,助動詞,*,*,*,助動詞-タ,終止形-一般,タ,,,,,,,,", lines[1]) assertEquals("に,2,2,11406,助詞,接続助詞,*,*,*,*,ニ,,,,,,,,", lines[2]) + assertEquals( + "東京都,6,8,5320,名詞,固有名詞,地名,一般,*,*,トウキョウト,,,\"東京,名詞,固有名詞,地名,一般,*,*,トウキョウ/都,名詞,普通名詞,一般,*,*,*,ト\",,,\"東京,名詞,固有名詞,地名,一般,*,*,トウキョウ/都,名詞,普通名詞,一般,*,*,*,ト\",,", + lines[7]) + } + + @Test + fun printSystemDictPosIdColumn() { + val output = ByteArrayOutputStream() + printDictionary(output, "system.dic", posMode = DictionaryPrinter.POSMode.ID) + val lines = output.toString().split(System.lineSeparator()) + + assertEquals(42, lines.size) // header + entries + trailing new line + assertEquals( + "SURFACE,LEFT_ID,RIGHT_ID,COST,POS_ID,READING_FORM,NORMALIZED_FORM,DICTIONARY_FORM,SPLIT_A,SPLIT_B,SPLIT_C,WORD_STRUCTURE,SYNONYM_GROUPS,USER_DATA", + lines[0]) + assertEquals("た,1,1,8729,0,タ,,,,,,,,", lines[1]) + assertEquals("に,2,2,11406,1,ニ,,,,,,,,", lines[2]) + } + + @Test + fun printSystemDictBothPosColumn() { + val output = ByteArrayOutputStream() + printDictionary(output, "system.dic", posMode = DictionaryPrinter.POSMode.BOTH) + 
val lines = output.toString().split(System.lineSeparator()) + + assertEquals(42, lines.size) // header + entries + trailing new line + assertEquals( + "SURFACE,LEFT_ID,RIGHT_ID,COST,POS_ID,POS1,POS2,POS3,POS4,POS5,POS6,READING_FORM,NORMALIZED_FORM,DICTIONARY_FORM,SPLIT_A,SPLIT_B,SPLIT_C,WORD_STRUCTURE,SYNONYM_GROUPS,USER_DATA", + lines[0]) + assertEquals("た,1,1,8729,0,助動詞,*,*,*,助動詞-タ,終止形-一般,タ,,,,,,,,", lines[1]) + assertEquals("に,2,2,11406,1,助詞,接続助詞,*,*,*,*,ニ,,,,,,,,", lines[2]) + } + + @Test + fun printSystemDictPosIdRef() { + val output = ByteArrayOutputStream() + printDictionary(output, "system.dic", wordRefMode = DictionaryPrinter.WordRefMode.TRIPLE_ID) + val lines = output.toString().split(System.lineSeparator()) + + assertEquals(42, lines.size) // header + entries + trailing new line + assertEquals( + "SURFACE,LEFT_ID,RIGHT_ID,COST,POS1,POS2,POS3,POS4,POS5,POS6,READING_FORM,NORMALIZED_FORM,DICTIONARY_FORM,SPLIT_A,SPLIT_B,SPLIT_C,WORD_STRUCTURE,SYNONYM_GROUPS,USER_DATA", + lines[0]) + assertEquals( + "東京都,6,8,5320,名詞,固有名詞,地名,一般,*,*,トウキョウト,,,\"東京,3,トウキョウ/都,4,ト\",,,\"東京,3,トウキョウ/都,4,ト\",,", + lines[7]) } @Test fun printUserDict() { - val filename = tempDir.resolve("user.dic").toString() val output = ByteArrayOutputStream() - val ps = PrintStream(output) - DictionaryPrinter.printDictionary(filename, TestDictionary.systemDict, ps) + printDictionary(output, "user.dic", TestDictionary.systemDict) val lines = output.toString().split(System.lineSeparator()) assertEquals(6, lines.size) // header + entries + trailing new line @@ -79,21 +139,34 @@ class DictionaryPrinterTest { } @Test - fun printUserDictWithoutSystem() { - val filename = tempDir.resolve("user.dic").toString() + fun printUserDictPosIdRef() { val output = ByteArrayOutputStream() - val ps = PrintStream(output) + printDictionary( + output, + "user.dic", + TestDictionary.systemDict, + wordRefMode = DictionaryPrinter.WordRefMode.TRIPLE_ID) + val lines = output.toString().split(System.lineSeparator()) + + 
assertEquals(6, lines.size) // header + entries + trailing new line + assertEquals( + "SURFACE,LEFT_ID,RIGHT_ID,COST,POS1,POS2,POS3,POS4,POS5,POS6,READING_FORM,NORMALIZED_FORM,DICTIONARY_FORM,SPLIT_A,SPLIT_B,SPLIT_C,WORD_STRUCTURE,SYNONYM_GROUPS,USER_DATA", + lines[0]) + assertEquals( + "東京府,6,6,2816,名詞,固有名詞,地名,一般,*,*,トウキョウフ,,,\"東京,3,トウキョウ/府,4,フ\",,,\"東京,3,トウキョウ/府,4,フ\",1/3,", + lines[3]) + } - assertFails { DictionaryPrinter.printDictionary(filename, null, ps) } + @Test + fun printUserDictWithoutSystem() { + val output = ByteArrayOutputStream() + assertFails { printDictionary(output, "user.dic", null) } } @Test fun failToPrintInvalidFile() { - val filename = tempDir.resolve("unk.def").toString() val output = ByteArrayOutputStream() - val ps = PrintStream(output) - - assertFails { DictionaryPrinter.printDictionary(filename, TestDictionary.systemDict, ps) } + assertFails { printDictionary(output, "unk.def", TestDictionary.systemDict) } } @Test @@ -102,9 +175,9 @@ class DictionaryPrinterTest { val lexfile = tempDir.resolve("system_lex.csv") val output1 = FileOutputStream(lexfile.toFile()) - val ps1 = PrintStream(output1) - DictionaryPrinter.printDictionary(dicfile, null, ps1) + printDictionary(output1, "system.dic") output1.close() + val printed = Files.readString(lexfile).split(System.lineSeparator()) val dicfile2 = tempDir.resolve("system.dic2") @@ -113,8 +186,7 @@ class DictionaryPrinterTest { reload.writeData(dicfile2) val output2 = ByteArrayOutputStream() - val ps2 = PrintStream(output2) - DictionaryPrinter.printDictionary(dicfile2.toString(), null, ps2) + printDictionary(output2, "system.dic2") val reprinted = output2.toString().split(System.lineSeparator()) assertContentEquals(printed, reprinted) @@ -126,9 +198,9 @@ class DictionaryPrinterTest { val lexfile = tempDir.resolve("user_lex.csv") val output1 = FileOutputStream(lexfile.toFile()) - val ps1 = PrintStream(output1) - DictionaryPrinter.printDictionary(dicfile, TestDictionary.systemDict, ps1) + 
printDictionary(output1, "user.dic", TestDictionary.systemDict) output1.close() + val printed = Files.readString(lexfile).split(System.lineSeparator()) val dicfile2 = tempDir.resolve("user.dic2") @@ -137,8 +209,7 @@ class DictionaryPrinterTest { reload.writeData(dicfile2) val output2 = ByteArrayOutputStream() - val ps2 = PrintStream(output2) - DictionaryPrinter.printDictionary(dicfile2.toString(), TestDictionary.systemDict, ps2) + printDictionary(output2, "user.dic2", TestDictionary.systemDict) val reprinted = output2.toString().split(System.lineSeparator()) assertContentEquals(printed, reprinted) diff --git a/src/test/java/com/worksap/nlp/sudachi/dictionary/build/SystemDicTest.kt b/src/test/java/com/worksap/nlp/sudachi/dictionary/build/SystemDicTest.kt index d6ded7e4..5b4e383f 100644 --- a/src/test/java/com/worksap/nlp/sudachi/dictionary/build/SystemDicTest.kt +++ b/src/test/java/com/worksap/nlp/sudachi/dictionary/build/SystemDicTest.kt @@ -166,6 +166,17 @@ class SystemDicTest { assertContentEquals(intArrayOf(wordIds[0], wordIds[2]), wi.wordStructure) } + @Test + fun variousWordReferences() { + val dictData = MemChannel() + val bldr = DicBuilder.system().matrix(res("test.matrix")) + bldr.lexicon(javaClass.getResource("wordref.csv")).build(dictData) + + val wordIds = intArrayOf(4, 8, 12, 16, 20, 24, 28, 33) + val dic = BinaryDictionary(dictData.buffer()) + assertEquals(wordIds.size, dic.lexicon.size()) + } + @Test fun failSplitBoundsCheck() { val bldr = DicBuilder.system().matrix(res("test.matrix")) diff --git a/src/test/java/com/worksap/nlp/sudachi/dictionary/build/UserDicTest.kt b/src/test/java/com/worksap/nlp/sudachi/dictionary/build/UserDicTest.kt index ff172787..58361e79 100644 --- a/src/test/java/com/worksap/nlp/sudachi/dictionary/build/UserDicTest.kt +++ b/src/test/java/com/worksap/nlp/sudachi/dictionary/build/UserDicTest.kt @@ -44,6 +44,14 @@ class TestDic { return this } + fun systemUrl(url: URL): TestDic { + val bldr = 
DicBuilder.system().matrix(matrixUrl).lexicon(url) + val ch = MemChannel() + bldr.build(ch) + this.systemDic = BinaryDictionary(ch.buffer()) + return this + } + fun user(data: String): TestDic { val bldr = DicBuilder.user().system(systemDic).lexicon(data) val ch = MemChannel() @@ -52,6 +60,14 @@ class TestDic { return this } + fun userUrl(url: URL): TestDic { + val bldr = DicBuilder.user().system(systemDic).lexicon(url) + val ch = MemChannel() + bldr.build(ch) + this.userDics.add(BinaryDictionary(ch.buffer())) + return this + } + fun load(): Dictionary { val config = Config.fromClasspath(config).systemDictionary(systemDic) userDics.forEach { config.addUserDictionary(it) } @@ -139,6 +155,20 @@ class UserDicTest { assertEquals("a,b,c,d,e,f".pos, m.partOfSpeech()) } + @Test + fun variousWordReferences() { + val dictData = MemChannel() + val dic = + TestDic() + .systemUrl(javaClass.getResource("wordref.csv")) + .userUrl(javaClass.getResource("wordref-user.csv")) + .load() + + val da = dic as DictionaryAccess + val m = da.morpheme(WordId.make(1, 8)) + assertEquals("東京府", m.surface()) + } + @Test fun failWithNonExistingWordInSystem() { val bldr = diff --git a/src/test/resources/com/worksap/nlp/sudachi/dictionary/build/wordref-user.csv b/src/test/resources/com/worksap/nlp/sudachi/dictionary/build/wordref-user.csv new file mode 100644 index 00000000..41002a00 --- /dev/null +++ b/src/test/resources/com/worksap/nlp/sudachi/dictionary/build/wordref-user.csv @@ -0,0 +1,3 @@ +Surface,LeftId,RightId,Cost,pos_id,POS1,POS2,POS3,POS4,POS5,POS6,Reading_Form,Normalized_Form,Dictionary_Form,Mode,Split_A,Split_B,Split_C,WordStructure,SynonymGroups +府,2,2,2914,1,名詞,普通名詞,一般,*,*,*,フ,,,A,,,,, +東京府,2,2,2816,0,名詞,固有名詞,地名,一般,*,*,トウキョウフ,,,B,"東京,名詞,固有名詞,地名,一般,*,*,トウキョウ/府,1,フ",,,, diff --git a/src/test/resources/com/worksap/nlp/sudachi/dictionary/build/wordref.csv b/src/test/resources/com/worksap/nlp/sudachi/dictionary/build/wordref.csv new file mode 100644 index 00000000..5c4e81d2 --- /dev/null 
+++ b/src/test/resources/com/worksap/nlp/sudachi/dictionary/build/wordref.csv @@ -0,0 +1,9 @@ +Surface,LeftId,RightId,Cost,pos_id,POS1,POS2,POS3,POS4,POS5,POS6,Reading_Form,Normalized_Form,Dictionary_Form,Mode,Split_A,Split_B,Split_C,WordStructure,SynonymGroups +京都,0,0,5293,0,名詞,固有名詞,地名,一般,*,*,キョウト,,,A,,,,, +東,1,1,4675,1,名詞,普通名詞,一般,*,*,*,ヒガシ,,,A,,,,, +東京,0,0,2816,0,名詞,固有名詞,地名,一般,*,*,トウキョウ,,,A,,,,, +トウキョウ,0,0,2816,0,名詞,固有名詞,地名,一般,*,*,トウキョウ,2,2,A,,,,, +トウキョウ,1,1,3000,2,名詞,固有名詞,一般,*,*,*,トウキョウ,,,A,,,,, +都,2,2,2914,1,名詞,普通名詞,一般,*,*,*,ト,都,,A,,,,, +東京都,0,2,5320,0,名詞,固有名詞,地名,一般,*,*,トウキョウト,,,B,2/5,,,, +東トウキョウ,0,1,5320,2,名詞,固有名詞,一般,*,*,*,ヒガシトウキョウ,,,C,,,"東,名詞,普通名詞,一般,*,*,*,ヒガシ/トウキョウ,2,トウキョウ",, From 2f1680b49dc064f7831ce4e9bfc7574b9868360a Mon Sep 17 00:00:00 2001 From: mh-northlander Date: Fri, 30 Aug 2024 15:54:16 +0900 Subject: [PATCH 76/94] test contents of rebuilt dictionary --- .../sudachi/dictionary/DictionaryPrinter.java | 4 +- .../dictionary/DictionaryPrinterTest.kt | 101 ++++++++++++++---- .../dictionary/DoubleArrayLexiconTest.kt | 4 +- src/test/resources/dict/lex.csv | 1 + 4 files changed, 87 insertions(+), 23 deletions(-) diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/DictionaryPrinter.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/DictionaryPrinter.java index f688b5dc..8a7fad4f 100644 --- a/src/main/java/com/worksap/nlp/sudachi/dictionary/DictionaryPrinter.java +++ b/src/main/java/com/worksap/nlp/sudachi/dictionary/DictionaryPrinter.java @@ -336,9 +336,9 @@ static String splitToString(int[] split) { * This tool requires the system dictionary when it dumps an user dictionary. 
* * @param args - * the option and the input filename + * the option and the input filename * @throws IOException - * if IO + * if IO */ public static void main(String[] args) throws IOException { String systemDictPath = null; diff --git a/src/test/java/com/worksap/nlp/sudachi/dictionary/DictionaryPrinterTest.kt b/src/test/java/com/worksap/nlp/sudachi/dictionary/DictionaryPrinterTest.kt index 6ce841b1..1dbcc324 100644 --- a/src/test/java/com/worksap/nlp/sudachi/dictionary/DictionaryPrinterTest.kt +++ b/src/test/java/com/worksap/nlp/sudachi/dictionary/DictionaryPrinterTest.kt @@ -28,12 +28,14 @@ import java.io.OutputStream import java.io.PrintStream import java.nio.file.Files import java.nio.file.Path +import java.util.Arrays import kotlin.io.path.createTempDirectory import kotlin.test.BeforeTest import kotlin.test.Test -import kotlin.test.assertContentEquals import kotlin.test.assertEquals import kotlin.test.assertFails +import kotlin.test.assertFalse +import kotlin.test.assertTrue class DictionaryPrinterTest { lateinit var tempDir: Path @@ -62,13 +64,18 @@ class DictionaryPrinterTest { dict.close() } + fun wordInfoString(lex: DoubleArrayLexicon, wordId: Int): String { + val wi = lex.getWordInfo(wordId) + return "${wordId}, ${lex.string(0, wi.getSurface())}, ${wi.getLength()}, ${wi.getPOSId()}, ${wi.getNormalizedForm()}, ${wi.getDictionaryForm()}, ${lex.string(0, wi.getReadingForm())}, ${Arrays.toString(wi.getAunitSplit())}, ${Arrays.toString(wi.getBunitSplit())}, ${Arrays.toString(wi.getCunitSplit())}, ${Arrays.toString(wi.getWordStructure())}, ${Arrays.toString(wi.getSynonymGroupIds())}, ${wi.getUserData()}" + } + @Test fun printSystemDict() { val output = ByteArrayOutputStream() printDictionary(output, "system.dic") val lines = output.toString().split(System.lineSeparator()) - assertEquals(42, lines.size) // header + entries + trailing new line + assertEquals(43, lines.size) // header + entries + trailing new line assertEquals( 
"SURFACE,LEFT_ID,RIGHT_ID,COST,POS1,POS2,POS3,POS4,POS5,POS6,READING_FORM,NORMALIZED_FORM,DICTIONARY_FORM,SPLIT_A,SPLIT_B,SPLIT_C,WORD_STRUCTURE,SYNONYM_GROUPS,USER_DATA", lines[0]) @@ -85,7 +92,7 @@ class DictionaryPrinterTest { printDictionary(output, "system.dic", posMode = DictionaryPrinter.POSMode.ID) val lines = output.toString().split(System.lineSeparator()) - assertEquals(42, lines.size) // header + entries + trailing new line + assertEquals(43, lines.size) // header + entries + trailing new line assertEquals( "SURFACE,LEFT_ID,RIGHT_ID,COST,POS_ID,READING_FORM,NORMALIZED_FORM,DICTIONARY_FORM,SPLIT_A,SPLIT_B,SPLIT_C,WORD_STRUCTURE,SYNONYM_GROUPS,USER_DATA", lines[0]) @@ -99,7 +106,7 @@ class DictionaryPrinterTest { printDictionary(output, "system.dic", posMode = DictionaryPrinter.POSMode.BOTH) val lines = output.toString().split(System.lineSeparator()) - assertEquals(42, lines.size) // header + entries + trailing new line + assertEquals(43, lines.size) // header + entries + trailing new line assertEquals( "SURFACE,LEFT_ID,RIGHT_ID,COST,POS_ID,POS1,POS2,POS3,POS4,POS5,POS6,READING_FORM,NORMALIZED_FORM,DICTIONARY_FORM,SPLIT_A,SPLIT_B,SPLIT_C,WORD_STRUCTURE,SYNONYM_GROUPS,USER_DATA", lines[0]) @@ -113,7 +120,7 @@ class DictionaryPrinterTest { printDictionary(output, "system.dic", wordRefMode = DictionaryPrinter.WordRefMode.TRIPLE_ID) val lines = output.toString().split(System.lineSeparator()) - assertEquals(42, lines.size) // header + entries + trailing new line + assertEquals(43, lines.size) // header + entries + trailing new line assertEquals( "SURFACE,LEFT_ID,RIGHT_ID,COST,POS1,POS2,POS3,POS4,POS5,POS6,READING_FORM,NORMALIZED_FORM,DICTIONARY_FORM,SPLIT_A,SPLIT_B,SPLIT_C,WORD_STRUCTURE,SYNONYM_GROUPS,USER_DATA", lines[0]) @@ -170,9 +177,7 @@ class DictionaryPrinterTest { } @Test - fun rebuildAndReprintSystem() { - val dicfile = tempDir.resolve("system.dic").toString() - + fun rebuildSystem() { val lexfile = tempDir.resolve("system_lex.csv") val output1 = 
FileOutputStream(lexfile.toFile()) printDictionary(output1, "system.dic") @@ -185,17 +190,45 @@ class DictionaryPrinterTest { DicBuilder.system().matrix(res("/dict/matrix.def")).lexicon(lexfile).build(reload) reload.writeData(dicfile2) - val output2 = ByteArrayOutputStream() - printDictionary(output2, "system.dic2") - val reprinted = output2.toString().split(System.lineSeparator()) + val original = BinaryDictionary(tempDir.resolve("system.dic").toString()) + val rebuilt = BinaryDictionary(tempDir.resolve("system.dic2").toString()) + + val headerO = original.getDictionaryHeader() + val headerR = rebuilt.getDictionaryHeader() + assertEquals(headerO.getReference(), headerR.getReference()) + assertEquals(headerO.isRuntimeCosts(), headerR.isRuntimeCosts()) + assertEquals(headerO.getNumTotalEntries(), headerR.getNumTotalEntries()) + assertEquals(headerO.getNumIndexedEntries(), headerR.getNumIndexedEntries()) + + val grammarO = original.getGrammar() + val grammarR = rebuilt.getGrammar() + val posSize = grammarO.getPartOfSpeechSize() + assertEquals(posSize, grammarR.getPartOfSpeechSize()) + for (i in 0..(posSize - 1)) { + assertEquals( + grammarO.getPartOfSpeechString(i.toShort()), grammarR.getPartOfSpeechString(i.toShort())) + } + + val lexO = original.getLexicon() + val lexR = rebuilt.getLexicon() + val wiIterO = lexO.wordIds(0) + val wiIterR = lexR.wordIds(0) - assertContentEquals(printed, reprinted) + while (wiIterO.hasNext()) { + assertTrue(wiIterR.hasNext()) + val wisO = wiIterO.next() + val wisR = wiIterR.next() + + assertEquals(wisO, wisR) + for (i in 0..(wisO.length() - 1)) { + assertEquals(wordInfoString(lexO, wisO.get(i)), wordInfoString(lexR, wisR.get(i))) + } + } + assertFalse(wiIterR.hasNext()) } @Test - fun rebuildAndReprintUser() { - val dicfile = tempDir.resolve("user.dic").toString() - + fun rebuildUser() { val lexfile = tempDir.resolve("user_lex.csv") val output1 = FileOutputStream(lexfile.toFile()) printDictionary(output1, "user.dic", 
TestDictionary.systemDict) @@ -208,10 +241,40 @@ class DictionaryPrinterTest { DicBuilder.user().system(TestDictionary.systemDict).lexicon(lexfile).build(reload) reload.writeData(dicfile2) - val output2 = ByteArrayOutputStream() - printDictionary(output2, "user.dic2", TestDictionary.systemDict) - val reprinted = output2.toString().split(System.lineSeparator()) + val original = BinaryDictionary(tempDir.resolve("user.dic").toString()) + val rebuilt = BinaryDictionary(tempDir.resolve("user.dic2").toString()) + + val headerO = original.getDictionaryHeader() + val headerR = rebuilt.getDictionaryHeader() + assertEquals(headerO.getReference(), headerR.getReference()) + assertEquals(headerO.isRuntimeCosts(), headerR.isRuntimeCosts()) + assertEquals(headerO.getNumTotalEntries(), headerR.getNumTotalEntries()) + assertEquals(headerO.getNumIndexedEntries(), headerR.getNumIndexedEntries()) + + val grammarO = original.getGrammar() + val grammarR = rebuilt.getGrammar() + val posSize = grammarO.getPartOfSpeechSize() + assertEquals(posSize, grammarR.getPartOfSpeechSize()) + for (i in 0..(posSize - 1)) { + assertEquals( + grammarO.getPartOfSpeechString(i.toShort()), grammarR.getPartOfSpeechString(i.toShort())) + } + + val lexO = original.getLexicon() + val lexR = rebuilt.getLexicon() + val wiIterO = lexO.wordIds(0) + val wiIterR = lexR.wordIds(0) + + while (wiIterO.hasNext()) { + assertTrue(wiIterR.hasNext()) + val wisO = wiIterO.next() + val wisR = wiIterR.next() - assertContentEquals(printed, reprinted) + assertEquals(wisO, wisR) + for (i in 0..(wisO.length() - 1)) { + assertEquals(wordInfoString(lexO, wisO.get(i)), wordInfoString(lexR, wisR.get(i))) + } + } + assertFalse(wiIterR.hasNext()) } } diff --git a/src/test/java/com/worksap/nlp/sudachi/dictionary/DoubleArrayLexiconTest.kt b/src/test/java/com/worksap/nlp/sudachi/dictionary/DoubleArrayLexiconTest.kt index 0576e1af..8099d2e1 100644 --- a/src/test/java/com/worksap/nlp/sudachi/dictionary/DoubleArrayLexiconTest.kt +++ 
b/src/test/java/com/worksap/nlp/sudachi/dictionary/DoubleArrayLexiconTest.kt @@ -46,7 +46,7 @@ class DoubleArrayLexiconTest { @Test fun iterWordIds() { - assertEquals(40, systemWordIds.length()) + assertEquals(41, systemWordIds.length()) for (i in 0..(systemWordIds.length() - 1)) { lexicon.getWordInfo(systemWordIds.get(i)) } @@ -136,7 +136,7 @@ class DoubleArrayLexiconTest { @Test fun size() { - assertEquals(40, lexicon.size()) + assertEquals(41, lexicon.size()) } @Test fun string() {} diff --git a/src/test/resources/dict/lex.csv b/src/test/resources/dict/lex.csv index 8dae2050..9bb4c85a 100644 --- a/src/test/resources/dict/lex.csv +++ b/src/test/resources/dict/lex.csv @@ -37,5 +37,6 @@ Surface,LeftId,RightId,Cost,Writing,POS1,POS2,POS3,POS4,POS5,POS6,Reading_Form,N いっ,5,5,5122,いっ,動詞,非自立可能,*,*,五段-カ行,連用形-促音便,イッ,行く,34,A,,,,, 012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789,9,9,-9000,012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789,名詞,数詞,*,*,*,*,ゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサン
ヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウ,012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789,,A,,,,, 特a,8,8,2914,特a,名詞,普通名詞,一般,*,*,*,トクエー,特a,,A,,,,, +隠し,-1,-1,0,隠し,名詞,普通名詞,一般,*,*,*,カクシ,隠し,,A,,,,, な。な,8,8,2914,な。な,名詞,普通名詞,一般,*,*,*,ナナ,な。な,,C,11,11,,, 東東京都,6,8,6320,東東京都,名詞,固有名詞,地名,一般,*,*,トウトウキョウト,東東京都,,C,,,4/4/3,, From 463617e09d02a219a92bc238179e0f0dfe665ca3 Mon Sep 17 00:00:00 2001 From: mh-northlander Date: Fri, 30 Aug 2024 16:29:03 +0900 Subject: [PATCH 77/94] update test dictionary lexicon --- .../dictionary/DictionaryPrinterTest.kt | 2 +- .../dictionary/DoubleArrayLexiconTest.kt | 13 +++ .../dictionary/build/RawLexiconReaderTest.kt | 3 +- src/test/resources/dict/lex.csv | 84 +++++++++---------- src/test/resources/dict/user.csv | 10 +-- src/test/resources/dict/user2.csv | 6 +- 6 files changed, 65 insertions(+), 53 deletions(-) diff --git a/src/test/java/com/worksap/nlp/sudachi/dictionary/DictionaryPrinterTest.kt b/src/test/java/com/worksap/nlp/sudachi/dictionary/DictionaryPrinterTest.kt index 1dbcc324..e20f42f4 100644 --- a/src/test/java/com/worksap/nlp/sudachi/dictionary/DictionaryPrinterTest.kt +++ b/src/test/java/com/worksap/nlp/sudachi/dictionary/DictionaryPrinterTest.kt @@ -142,7 +142,7 @@ class DictionaryPrinterTest { assertEquals( "東京府,6,6,2816,名詞,固有名詞,地名,一般,*,*,トウキョウフ,,,\"東京,名詞,固有名詞,地名,一般,*,*,トウキョウ/府,名詞,普通名詞,一般,*,*,*,フ\",,,\"東京,名詞,固有名詞,地名,一般,*,*,トウキョウ/府,名詞,普通名詞,一般,*,*,*,フ\",1/3,", lines[3]) - assertEquals("すだち,6,6,2816,被子植物門,双子葉植物綱,ムクロジ目,ミカン科,ミカン属,スダチ,スダチ,,,,,,,,", lines[4]) + assertEquals("すだち,6,6,2816,被子植物門,双子葉植物綱,ムクロジ目,ミカン科,ミカン属,スダチ,スダチ,,,,,,,,徳島県産", lines[4]) } @Test diff --git a/src/test/java/com/worksap/nlp/sudachi/dictionary/DoubleArrayLexiconTest.kt 
b/src/test/java/com/worksap/nlp/sudachi/dictionary/DoubleArrayLexiconTest.kt index 8099d2e1..136ec402 100644 --- a/src/test/java/com/worksap/nlp/sudachi/dictionary/DoubleArrayLexiconTest.kt +++ b/src/test/java/com/worksap/nlp/sudachi/dictionary/DoubleArrayLexiconTest.kt @@ -119,6 +119,19 @@ class DoubleArrayLexiconTest { assertEquals(listOf(), wi.getSynonymGroupIds().toList()) } + @Test + fun userWordInfo() { + val bytes = TestDictionary.userDict1Data.buffer() + val desc = Description.load(bytes) + val userlex = DoubleArrayLexicon.load(bytes, desc) + + // すだち + val wi = userlex.getWordInfo(18) + assertEquals("すだち", userlex.string(0, wi.getSurface())) + assertEquals(8, wi.getPOSId()) + assertEquals("徳島県産", wi.getUserData()) + } + @Test fun wordInfoLong() { // 0123456789 * 30 diff --git a/src/test/java/com/worksap/nlp/sudachi/dictionary/build/RawLexiconReaderTest.kt b/src/test/java/com/worksap/nlp/sudachi/dictionary/build/RawLexiconReaderTest.kt index 3b0c091e..17c352f7 100644 --- a/src/test/java/com/worksap/nlp/sudachi/dictionary/build/RawLexiconReaderTest.kt +++ b/src/test/java/com/worksap/nlp/sudachi/dictionary/build/RawLexiconReaderTest.kt @@ -133,7 +133,7 @@ class RawLexiconReaderTest { } @Test - fun failNewPosId() { + fun failNonExistingPosId() { val text = """Surface,LeftId,RightId,Cost,pos_id,reading_form,normalized_form,DictionaryForm,splita,splitb,wordstructure 東京都,6,8,5320,1,トウキョウト,,,,,""" @@ -152,7 +152,6 @@ class RawLexiconReaderTest { """Surface,LeftId,RightId,Cost,pos1,pos2,pos3,pos4,pos5,pos6,pos_id,reading_form,normalized_form,DictionaryForm,splita,splitb,wordstructure 東京都,6,8,5320,名詞,固有名詞,地名,一般,*,*,0,トウキョウト,,,,,""" val posTable = POSTable() - posTable.getId(POS("名詞", "固有名詞", "地名", "一般", "*", "*")) val reader = RawLexiconReader(csvtext(text), posTable, false) assertNotNull(reader.nextEntry()).let { e -> assertEquals(0, e.posId) } diff --git a/src/test/resources/dict/lex.csv b/src/test/resources/dict/lex.csv index 9bb4c85a..c1abba0d 100644 --- 
a/src/test/resources/dict/lex.csv +++ b/src/test/resources/dict/lex.csv @@ -1,42 +1,42 @@ -Surface,LeftId,RightId,Cost,Writing,POS1,POS2,POS3,POS4,POS5,POS6,Reading_Form,Normalized_Form,Dictionary_Form,Mode,Split_A,Split_B,Split_C,WordStructure,SynonymGroups -た,1,1,8729,た,助動詞,*,*,*,助動詞-タ,終止形-一般,タ,た,,A,,,,, -に,2,2,11406,に,助詞,接続助詞,*,*,*,*,ニ,に,,A,,,,, -に,3,3,4481,に,助詞,格助詞,*,*,*,*,ニ,に,,A,,,,, -京都,6,6,5293,京都,名詞,固有名詞,地名,一般,*,*,キョウト,京都,,A,,,,,1/5 -東,7,7,4675,東,名詞,普通名詞,一般,*,*,*,ヒガシ,東,,A,,,,, -東京,6,6,2816,東京,名詞,固有名詞,地名,一般,*,*,トウキョウ,東京,,A,,,,, -東京都,6,8,5320,東京都,名詞,固有名詞,地名,一般,*,*,トウキョウト,東京都,,B,5/9,,,5/9, -行く,4,4,5105,行く,動詞,非自立可能,*,*,五段-カ行,終止形-一般,イク,行く,,A,,,,, -行っ,5,5,5122,行っ,動詞,非自立可能,*,*,五段-カ行,連用形-促音便,イッ,行く,7,A,,,,, -都,8,8,2914,都,名詞,普通名詞,一般,*,*,*,ト,都,,A,,,,, -アイ,7,7,4675,アイ,名詞,普通名詞,一般,*,*,*,アイ,アイ,,A,,,,, -アイウ,7,7,4675,アイウ,名詞,普通名詞,一般,*,*,*,アイウ,アイウ,,A,,,,, -アイアイウ,6,6,32766,アイウ,名詞,固有名詞,地名,一般,*,*,アイアイウ,アイアイウ,,A,,,,, -0,9,9,2478,0,名詞,数詞,*,*,*,*,ゼロ,0,,A,,,,, -1,9,9,2478,1,名詞,数詞,*,*,*,*,イチ,1,,A,,,,, -2,9,9,2478,2,名詞,数詞,*,*,*,*,ニ,2,,A,,,,, -3,9,9,2478,3,名詞,数詞,*,*,*,*,サン,3,,A,,,,, -4,9,9,2478,4,名詞,数詞,*,*,*,*,ヨン,4,,A,,,,, -5,9,9,2478,5,名詞,数詞,*,*,*,*,ゴ,5,,A,,,,, -6,9,9,2478,6,名詞,数詞,*,*,*,*,ロク,6,,A,,,,, -7,9,9,2478,7,名詞,数詞,*,*,*,*,ナナ,7,,A,,,,, -8,9,9,2478,8,名詞,数詞,*,*,*,*,ハチ,8,,A,,,,, -9,9,9,2478,9,名詞,数詞,*,*,*,*,キュウ,9,,A,,,,, -〇,9,9,2478,〇,名詞,数詞,*,*,*,*,ゼロ,〇,,A,,,,, -一,9,9,2478,一,名詞,数詞,*,*,*,*,イチ,一,,A,,,,, -二,9,9,2478,二,名詞,数詞,*,*,*,*,ニ,二,,A,,,,, -三,9,9,2478,三,名詞,数詞,*,*,*,*,サン,三,,A,,,,, -四,9,9,2478,四,名詞,数詞,*,*,*,*,ヨン,四,,A,,,,, -五,9,9,2478,五,名詞,数詞,*,*,*,*,ゴ,五,,A,,,,, -六,9,9,2478,六,名詞,数詞,*,*,*,*,ロク,六,,A,,,,, -七,9,9,2478,七,名詞,数詞,*,*,*,*,ナナ,七,,A,,,,, -八,9,9,2478,八,名詞,数詞,*,*,*,*,ハチ,八,,A,,,,, -九,9,9,2478,九,名詞,数詞,*,*,*,*,キュウ,九,,A,,,,, -六三四,6,6,0,六三四,名詞,固有名詞,地名,一般,*,*,ムサシ,六三四,,A,,,,, -いく,4,4,5105,いく,動詞,非自立可能,*,*,五段-カ行,終止形-一般,イク,行く,,A,,,,, -いっ,5,5,5122,いっ,動詞,非自立可能,*,*,五段-カ行,連用形-促音便,イッ,行く,34,A,,,,, 
-012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789,9,9,-9000,012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789,名詞,数詞,*,*,*,*,ゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウ,012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789,,A,,,,, -特a,8,8,2914,特a,名詞,普通名詞,一般,*,*,*,トクエー,特a,,A,,,,, -隠し,-1,-1,0,隠し,名詞,普通名詞,一般,*,*,*,カクシ,隠し,,A,,,,, -な。な,8,8,2914,な。な,名詞,普通名詞,一般,*,*,*,ナナ,な。な,,C,11,11,,, -東東京都,6,8,6320,東東京都,名詞,固有名詞,地名,一般,*,*,トウトウキョウト,東東京都,,C,,,4/4/3,, +Surface,LeftId,RightId,Cost,POS1,POS2,POS3,POS4,POS5,POS6,Reading_Form,Normalized_Form,Dictionary_Form,Split_A,Split_B,Split_C,WordStructure,SynonymGroups +た,1,1,8729,助動詞,*,*,*,助動詞-タ,終止形-一般,タ,た,,,,,, +に,2,2,11406,助詞,接続助詞,*,*,*,*,ニ,に,,,,,, +に,3,3,4481,助詞,格助詞,*,*,*,*,ニ,に,,,,,, 
+京都,6,6,5293,名詞,固有名詞,地名,一般,*,*,キョウト,京都,,,,,,1/5 +東,7,7,4675,名詞,普通名詞,一般,*,*,*,ヒガシ,東,,,,,, +東京,6,6,2816,名詞,固有名詞,地名,一般,*,*,トウキョウ,東京,,,,,, +東京都,6,8,5320,名詞,固有名詞,地名,一般,*,*,トウキョウト,東京都,,5/9,,,5/9, +行く,4,4,5105,動詞,非自立可能,*,*,五段-カ行,終止形-一般,イク,行く,,,,,, +行っ,5,5,5122,動詞,非自立可能,*,*,五段-カ行,連用形-促音便,イッ,行く,7,,,,, +都,8,8,2914,名詞,普通名詞,一般,*,*,*,ト,都,,,,,, +アイ,7,7,4675,名詞,普通名詞,一般,*,*,*,アイ,アイ,,,,,, +アイウ,7,7,4675,名詞,普通名詞,一般,*,*,*,アイウ,アイウ,,,,,, +アイアイウ,6,6,32766,名詞,固有名詞,地名,一般,*,*,アイアイウ,アイアイウ,,,,,, +0,9,9,2478,名詞,数詞,*,*,*,*,ゼロ,0,,,,,, +1,9,9,2478,名詞,数詞,*,*,*,*,イチ,1,,,,,, +2,9,9,2478,名詞,数詞,*,*,*,*,ニ,2,,,,,, +3,9,9,2478,名詞,数詞,*,*,*,*,サン,3,,,,,, +4,9,9,2478,名詞,数詞,*,*,*,*,ヨン,4,,,,,, +5,9,9,2478,名詞,数詞,*,*,*,*,ゴ,5,,,,,, +6,9,9,2478,名詞,数詞,*,*,*,*,ロク,6,,,,,, +7,9,9,2478,名詞,数詞,*,*,*,*,ナナ,7,,,,,, +8,9,9,2478,名詞,数詞,*,*,*,*,ハチ,8,,,,,, +9,9,9,2478,名詞,数詞,*,*,*,*,キュウ,9,,,,,, +〇,9,9,2478,名詞,数詞,*,*,*,*,ゼロ,〇,,,,,, +一,9,9,2478,名詞,数詞,*,*,*,*,イチ,一,,,,,, +二,9,9,2478,名詞,数詞,*,*,*,*,ニ,二,,,,,, +三,9,9,2478,名詞,数詞,*,*,*,*,サン,三,,,,,, +四,9,9,2478,名詞,数詞,*,*,*,*,ヨン,四,,,,,, +五,9,9,2478,名詞,数詞,*,*,*,*,ゴ,五,,,,,, +六,9,9,2478,名詞,数詞,*,*,*,*,ロク,六,,,,,, +七,9,9,2478,名詞,数詞,*,*,*,*,ナナ,七,,,,,, +八,9,9,2478,名詞,数詞,*,*,*,*,ハチ,八,,,,,, +九,9,9,2478,名詞,数詞,*,*,*,*,キュウ,九,,,,,, +六三四,6,6,0,名詞,固有名詞,地名,一般,*,*,ムサシ,六三四,,,,,, +いく,4,4,5105,動詞,非自立可能,*,*,五段-カ行,終止形-一般,イク,行く,,,,,, +いっ,5,5,5122,動詞,非自立可能,*,*,五段-カ行,連用形-促音便,イッ,行く,34,,,,, 
+012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789,9,9,-9000,名詞,数詞,*,*,*,*,ゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウ,012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789,,,,,, +特a,8,8,2914,名詞,普通名詞,一般,*,*,*,トクエー,特a,,,,,, +隠し,-1,-1,0,名詞,普通名詞,一般,*,*,*,カクシ,隠し,,,,,, +な。な,8,8,2914,名詞,普通名詞,一般,*,*,*,ナナ,な。な,,11,11,,, +東東京都,6,8,6320,名詞,固有名詞,地名,一般,*,*,トウトウキョウト,東東京都,,,,4/4/3,, diff --git a/src/test/resources/dict/user.csv b/src/test/resources/dict/user.csv index f35b2546..d835d99e 100644 --- a/src/test/resources/dict/user.csv +++ b/src/test/resources/dict/user.csv @@ -1,5 +1,5 @@ -Surface,LeftId,RightId,Cost,Writing,POS1,POS2,POS3,POS4,POS5,POS6,Reading_Form,Normalized_Form,Dictionary_Form,Mode,Split_A,Split_B,WordStructure,SynonymGroups -ぴらる,8,8,-32768,ぴらる,名詞,普通名詞,一般,*,*,*,ピラル,ぴらる,,A,,,, -府,8,8,2914,府,名詞,普通名詞,一般,*,*,*,フ,府,,A,,,, 
-東京府,6,6,2816,東京府,名詞,固有名詞,地名,一般,*,*,トウキョウフ,東京府,,B,"東京,名詞,固有名詞,地名,一般,*,*,トウキョウ/府,名詞,普通名詞,一般,*,*,*,フ",,"東京,名詞,固有名詞,地名,一般,*,*,トウキョウ/府,名詞,普通名詞,一般,*,*,*,フ",1/3 -すだち,6,6,2816,すだち,被子植物門,双子葉植物綱,ムクロジ目,ミカン科,ミカン属,スダチ,スダチ,すだち,,A,,,, +Surface,LeftId,RightId,Cost,POS1,POS2,POS3,POS4,POS5,POS6,Reading_Form,Normalized_Form,Dictionary_Form,Split_A,Split_B,WordStructure,SynonymGroups,UserData +ぴらる,8,8,-32768,名詞,普通名詞,一般,*,*,*,ピラル,ぴらる,,,,,, +府,8,8,2914,名詞,普通名詞,一般,*,*,*,フ,府,,,,,, +東京府,6,6,2816,名詞,固有名詞,地名,一般,*,*,トウキョウフ,東京府,,"東京,名詞,固有名詞,地名,一般,*,*,トウキョウ/府,名詞,普通名詞,一般,*,*,*,フ",,"東京,名詞,固有名詞,地名,一般,*,*,トウキョウ/府,名詞,普通名詞,一般,*,*,*,フ",1/3, +すだち,6,6,2816,被子植物門,双子葉植物綱,ムクロジ目,ミカン科,ミカン属,スダチ,スダチ,すだち,,,,,,徳島県産 diff --git a/src/test/resources/dict/user2.csv b/src/test/resources/dict/user2.csv index 52e083f4..fd0f6215 100644 --- a/src/test/resources/dict/user2.csv +++ b/src/test/resources/dict/user2.csv @@ -1,3 +1,3 @@ -Surface,LeftId,RightId,Cost,Writing,POS1,POS2,POS3,POS4,POS5,POS6,Reading_Form,Normalized_Form,Dictionary_Form,Mode,Split_A,Split_B,WordStructure,SynonymGroups -ぴさる,8,8,-32768,ぴさる,名詞,普通名詞,一般,*,*,*,ピサル,ぴさる,,A,,,, -かぼす,6,6,2816,かぼす,被子植物門,双子葉植物綱,ムクロジ目,ミカン科,ミカン属,カボス,カボス,かぼす,,A,,,, +Surface,LeftId,RightId,Cost,POS1,POS2,POS3,POS4,POS5,POS6,Reading_Form,Normalized_Form,Dictionary_Form,Split_A,Split_B,WordStructure,SynonymGroups +ぴさる,8,8,-32768,名詞,普通名詞,一般,*,*,*,ピサル,ぴさる,,,,, +かぼす,6,6,2816,被子植物門,双子葉植物綱,ムクロジ目,ミカン科,ミカン属,カボス,カボス,かぼす,,,,, From 9b72b46d4b5bdc3e43ad42916b9985f1fbec968a Mon Sep 17 00:00:00 2001 From: mh-northlander Date: Fri, 30 Aug 2024 18:24:37 +0900 Subject: [PATCH 78/94] add new user_dict.md --- docs/user_dict.md | 294 ++++++++++++++++++++++++--------------- docs/user_dict_legacy.md | 231 ++++++++++++++++++++++++++++++ 2 files changed, 409 insertions(+), 116 deletions(-) create mode 100644 docs/user_dict_legacy.md diff --git a/docs/user_dict.md b/docs/user_dict.md index 0280ec9b..9010cb53 100644 --- a/docs/user_dict.md +++ b/docs/user_dict.md @@ -6,81 +6,107 @@ Sudachi 
ではユーザー辞書をもちいて、システム辞書で不足し ユーザー辞書の作成は、登録したい見出しを記述したユーザー辞書ソースを用いて行います。 ユーザー辞書ソースのフォーマットは以下の通りです。 -なお、このファイルは、CSV形式 (RFC 4180) で保存します。文字コードは、UTF-8を使用します。 - -- 0 見出し (TRIE 用) -- 1 左連接ID -- 2 右連接ID -- 3 コスト -- 4 見出し (解析結果表示用) -- 5 品詞1 -- 6 品詞2 -- 7 品詞3 -- 8 品詞4 -- 9 品詞 (活用型) -- 10 品詞 (活用形) -- 11 読み -- 12 正規化表記 -- 13 辞書形ID -- 14 分割タイプ -- 15 A単位分割情報 -- 16 B単位分割情報 -- 17 ※未使用 - -各項目について以下に説明します。 - -### 0 見出し (TRIE 用) +なお、このファイルは、CSV 形式 (RFC 4180) で保存します。文字コードは、UTF-8 を使用します。 + +- 0 Surface: 見出し表記 +- 1 Left_Id: 左連接 ID +- 2 Right_Id: 右連接 ID +- 3 Cost: コスト +- 4 Writing: 見出し (解析結果表示用) +- 5 POS1: 品詞 1 +- 6 POS2: 品詞 2 +- 7 POS3: 品詞 3 +- 8 POS4: 品詞 4 +- 9 POS5: 品詞 (活用型) +- 10 POS6: 品詞 (活用形) +- 11 Reading_Form: 読み +- 12 Normalized_Form: 正規化表記 +- 13 Dictionary_Form: 辞書形 +- 14 Mode: 分割タイプ +- 15 Split_A: A 単位分割情報 +- 16 Split_B: B 単位分割情報 +- 17 Word_Structure: 語構成 +- 18 Synonym_Groups: 同義語グループ ID 情報 +- 19 Split_C: C 単位分割情報 +- 20 User_Data: ユーザーデータ +- 21 POS_Id: 品詞 ID + +各項目については以下に説明します。 + +### ヘッダー行 + +ファイルの一行目はヘッダーを記述します。 +ヘッダー行に記述されたカラムの順序でファイル全体がパースされます。 +カラム名の記述では、"\_" の有無および大文字・小文字の違いは無視されます。 + +ヘッダー行が検出されなかった場合は、[旧辞書フォーマット](./user_dict_legacy.md)に従って処理されます。 +ただし、この場合でも上記のカラム順に従えばすべてのカラムを記述できます。 +旧フォーマットにないカラムについてはこのドキュメントを参照してください。 + +カラムのうち、Writing、Mode、Synonym_Groups、Split_C、User_Data は非必須項目です。 +また、POS_Id と POS1 - POS6 の組はいずれか一方のみが必須となります。 + +### 語参照 + +一部の項目では、辞書内の他の語への参照を記述することがあります。 +以下ではこれを語参照と呼称します。 + +語参照は対象語の「見出し表記、品詞 1-4、品詞 (活用型)、品詞 (活用形)、読み」もしくは「見出し表記、品詞 ID、読み」を "," (カンマ) で区切った文字列で記述します。 +語参照を記述するときはその項目のフィールド全体を " (ダブルクォーテーション) で囲む必要があります。 +語参照の対象語は別途記述されている必要があります。対象語がシステム辞書内にあるかユーザー辞書内にあるかは自動的に判別します。 + +### 0 Surface: 見出し表記 形態素解析に使用される見出し表記です。 -表記の長さは、255文字まで登録できます。 +表記の長さは、255 文字まで登録できます。 #### 文字正規化 -見出しは、「Sudachiの文字正規化がおこなわれた後の形」で登録してください。 +見出しは、「Sudachi の文字正規化がおこなわれた後の形」で登録してください。 -Sudachiでは、文字正規化が行われた**後に**見出しを引きます。そのため、「正規化後に現れない形」で見出しが表記されている場合、その語はどのような場合でもマッチすることがありません。例えば、「ラテン文字の大文字」で見出しを表記した場合、Sudachi内部では正規化後の「小文字」になったもので見出しを探すため、この大文字のものとマッチすることがありません。 
+Sudachi では、文字正規化が行われた**後に**見出しを引きます。そのため、「正規化後に現れない形」で見出しが表記されている場合、その語はどのような場合でもマッチすることがありません。例えば、「ラテン文字の大文字」で見出しを表記した場合、Sudachi 内部では正規化後の「小文字」になったもので見出しを探すため、この大文字のものとマッチすることがありません。 -Sudachiでは、以下の文字正規化を行っています。挙動の詳細は、[Sudachiドキュメントの該当箇所](https://github.com/WorksApplications/Sudachi#%E6%96%87%E5%AD%97%E6%AD%A3%E8%A6%8F%E5%8C%96)を参照してください。 +Sudachi では、以下の文字正規化を行っています。挙動の詳細は、[Sudachi ドキュメントの該当箇所](https://github.com/WorksApplications/Sudachi#%E6%96%87%E5%AD%97%E6%AD%A3%E8%A6%8F%E5%8C%96)を参照してください。 -* 小文字化 -* NFKC をつかった Unicode 正規化 - * ただし、設定ファイル `rewrite.def` に定義される抑制、置換が優先 +- 小文字化 +- NFKC をつかった Unicode 正規化 + - ただし、設定ファイル `rewrite.def` に定義される抑制、置換が優先 ユーザー辞書の見出しへは、文字正規化は自動的には適用されません。これは、ユーザーが想定しづらい挙動を避けるためです。そのため、ユーザー辞書の作成者が文字正規化を意識して語を表記する必要があります。 -### 1 左連接ID +### 1 Left_Id: 左連接 ID -形態素解析の連接判定(左連接)に使用されるIDです。 -使用できるIDの種類は、unidic-mecab 2.1.2 の left-id.def を参照してください。 +形態素解析の連接判定(左連接)に使用される ID です。 +使用できる ID の種類は、unidic-mecab 2.1.2 の left-id.def を参照してください。 普通名詞の登録であれば、以下のいずれかを推奨 -- 5146 名詞,普通名詞,一般,*,*,*,*,*,漢 -- 5133 名詞,普通名詞,サ変可能,*,*,*,*,*,漢 +- 5146 `名詞,普通名詞,一般,*,*,*,*,*,漢` +- 5133 `名詞,普通名詞,サ変可能,*,*,*,*,*,漢` 固有名詞の登録であれば、以下のいずれかを推奨 -- 4786 名詞,固有名詞,一般,*,*,*,*,*,固 -- 4789 名詞,固有名詞,人名,名,*,*,*,*,固 -- 4790 名詞,固有名詞,人名,姓,*,*,*,*,固 +- 4786 `名詞,固有名詞,一般,*,*,*,*,*,固` +- 4789 `名詞,固有名詞,人名,名,*,*,*,*,固` +- 4790 `名詞,固有名詞,人名,姓,*,*,*,*,固` -### 2 右連接ID +### 2 Right_Id: 右連接 ID -形態素解析の連接判定(右連接)に使用されるIDです。 -使用できるIDの種類は、unidic-mecab 2.1.2 の right-id.def を参照してください。 +形態素解析の連接判定(右連接)に使用される ID です。 +使用できる ID の種類は、unidic-mecab 2.1.2 の right-id.def を参照してください。 普通名詞の登録であれば、以下のいずれかを推奨 -- 5146 名詞,普通名詞,一般,*,*,*,*,*,漢 -- 5133 名詞,普通名詞,サ変可能,*,*,*,*,*,漢 +- 5146 `名詞,普通名詞,一般,*,*,*,*,*,漢` +- 5133 `名詞,普通名詞,サ変可能,*,*,*,*,*,漢` 固有名詞の登録であれば、以下のいずれかを推奨 -- 4786 名詞,固有名詞,一般,*,*,*,*,*,固 -- 4789 名詞,固有名詞,人名,名,*,*,*,*,固 -- 4790 名詞,固有名詞,人名,姓,*,*,*,*,固 +- 4786 `名詞,固有名詞,一般,*,*,*,*,*,固` +- 4789 `名詞,固有名詞,人名,名,*,*,*,*,固` +- 4790 `名詞,固有名詞,人名,姓,*,*,*,*,固` -### 3 コスト +### 3 Cost: コスト 形態素解析に使用される見出し表記のコスト値です。 "-32767 ~ 32767" までの整数値で指定できます。 @@
-89,131 +115,167 @@ Sudachiでは、以下の文字正規化を行っています。挙動の詳細 名詞類の登録であれば、"5000 ~ 9000" を推奨 -### 4 見出し (解析結果表示用) +### 4 Writing: 見出し (解析結果表示用) -「0 見出し (TRIE 用)」と同じものを指定してください。 +この項目は使用されません。 -### 5 品詞1 +空文字列とするか、見出し表記と同じものを記述してください。 + +### 5 POS1: 品詞 1 システム辞書の品詞、あるいはユーザー定義の任意の品詞を記述できます。 -システム辞書で使用する品詞1の種類は、unidic-mecab 2.1.2 の 品詞体系を参照してください。 +システム辞書で使用する品詞は、[pos.csv](../src/main/resources/pos.csv) を参照してください。 -### 6 品詞2 +### 6 POS2: 品詞 2 システム辞書の品詞、あるいはユーザー定義の任意の品詞を記述できます。 -システム辞書で使用する品詞2の種類は、unidic-mecab 2.1.2 の 品詞体系を参照してください。 +システム辞書で使用する品詞は、[pos.csv](../src/main/resources/pos.csv) を参照してください。 -### 7 品詞3 +### 7 POS3: 品詞 3 システム辞書の品詞、あるいはユーザー定義の任意の品詞を記述できます。 -システム辞書で使用する品詞3の種類は、unidic-mecab 2.1.2 の 品詞体系を参照してください。 +システム辞書で使用する品詞は、[pos.csv](../src/main/resources/pos.csv) を参照してください。 -### 8 品詞4 +### 8 POS4: 品詞 4 システム辞書の品詞、あるいはユーザー定義の任意の品詞を記述できます。 -システム辞書で使用する品詞4の種類は、unidic-mecab 2.1.2 の 品詞体系を参照してください。 +システム辞書で使用する品詞は、[pos.csv](../src/main/resources/pos.csv) を参照してください。 -### 9 品詞 (活用型) +### 9 POS5: 品詞 (活用型) システム辞書の品詞、あるいはユーザー定義の任意の品詞を記述できます。 -システム辞書で使用する品詞 (活用型)の種類は、unidic-mecab 2.1.2 の 品詞体系を参照してください。 +システム辞書で使用する品詞は、[pos.csv](../src/main/resources/pos.csv) を参照してください。 -### 10 品詞 (活用形) +### 10 POS6: 品詞 (活用形) システム辞書の品詞、あるいはユーザー定義の任意の品詞を記述できます。 -システム辞書で使用する品詞 (活用形)の種類は、unidic-mecab 2.1.2 の 品詞体系を参照してください。 +システム辞書で使用する品詞は、[pos.csv](../src/main/resources/pos.csv) を参照してください。 + +### 21 POS_Id: 品詞 ID + +システム辞書の品詞、あるいはユーザー定義の任意の品詞の ID を記述できます。 +品詞 ID の値は辞書内のものに依存します。 +ユーザ定義の品詞での利用は推奨しません。 + +システム辞書で使用する品詞は、[pos.csv](../src/main/resources/pos.csv) を参照してください。 +これは ID をのぞいて unidic-mecab 2.1.2 の 品詞体系を参照しています。 + +この項目を記述する場合、POS1 - POS6 の 6 項目を省略、もしくは空欄とすることができます。 +両方を記述する場合は、両者の内容が一致している必要があります。 -### 11 読み +### 11 Reading_Form: 読み 見出し表記の読みを記述します。 全角カタカナで記述します。 省略することもできます。(その場合は、何も記述しない) -### 12 正規化表記 +### 12 Normalized_Form: 正規化形 ID -見出し表記の正規化表記を記述します。 -「見出し表記=正規化表記」の場合は、見出し表記を記述します。 +表記にぶれのある語に対して、その語の正規化形を指定するための情報です。 -### 13 辞書形ID +対象となる語への語参照もしくは見出し表記を記述します。 
-活用のある語に対して、その語の辞書形(終止形表記)を指定するための情報です。 +見出し表記での記述では、対象となる語が記述されていない場合でも文字列のみを正規化形として登録します。 + +「見出し表記=正規化表記」の場合は、空文字列とすることができます。 -ユーザー辞書ソースの行数(0始まりで何行目か)がIDです。 -対象となる語の辞書形のIDを記述します。 +### 13 Dictionary_Form: 辞書形 ID -よって、辞書形IDの情報をつけた後、ファイル内の行の並びが変わるような変更(ソートや挿入など)は加えないでください。 +活用のある語に対して、その語の辞書形(終止形表記)を指定するための情報です。 -なお、活用のない語については、このフィールドは、"*"(半角アスタリスク)を記入しておいてください。 +対象となる語への語参照を記述します。 + +活用のない語については、このフィールドは空文字列としてください。 ### 14 分割タイプ -語の分割単位タイプ (A / B / C) を記述します。後述の分割情報を記述しない場合は "*" (半角アスタリスク) でもかまいません。 +この項目は使用されません。 + +語の分割単位タイプ (A / B / C) を記述します。後述の分割情報を記述しない場合は "\*" (半角アスタリスク) もしくは空文字列でもかまいません。 + +### 15 Split_A: A 単位分割情報 + +分割単位タイプ B または C の語について、A 単位に分割するための情報です。 + +構成語への語参照を "/" (半角スラッシュ) で区切って記述します。 + +なお構成語としてのみ利用される語は連接 ID に-1 を記述すると、単独の語として出現しなくなります。 + +### 16 Split_B: B 単位分割情報 + +分割単位タイプ C の語について、B 単位に分割するための情報です。 + +フォーマットは A 単位分割情報と同じです。 + +### 19 Split_C: C 単位分割情報 + +分割単位タイプ C よりも長い語句を C 単位への分割情報と共に登録する際に使用します。 -### 15 A単位分割情報 +フォーマットは A 単位分割情報と同じです。 -分割単位タイプ B または C の語について、A単位に分割するための情報です。 +この項目が登録された語句は、解析後に自動的に C 単位(A/B が指定されている場合はそちら)に分割されます。 -構成語のIDまたは構成語情報を "/" (半角スラッシュ) で区切って記述します。 +### 17 WordStructure: 語構成情報 -構成語のIDはその語が記述されている行番号 (0始まり) か、その先頭に "U" を加えた文字列です。ユーザー辞書内の語を参照するときに "U" をつけます。 +現在は未使用の参考情報です。 -構成語情報は語の見出し (解析結果表示用)、品詞1-4、品詞 (活用型)、品詞 (活用形)、読みを "," (カンマ) で区切った文字列です。 -構成語情報を記述するときは分割情報のフィールド全体を " (ダブルクォーテーション) で囲む必要があります。 -構成語情報に記述する語は別途記述されている必要があります。構成語がシステム辞書内にあるかユーザー辞書内にあるかは自動的に判別します。 +空文字列とするか、A 単位分割情報と同じ内容を記述してください。 -なお構成語としてのみ利用される語は連接IDに-1を記述すると、単独の語として出現しなくなります。 +### 18 Synonym_Groups: 同義語グループ ID -### 16 B単位分割情報 +Sudachi 同義語辞書における同義語グループ ID 情報です。 -分割単位タイプ C の語について、B単位に分割するための情報です。 +対象となる同義語グループ ID を "/" (半角スラッシュ) で区切って記述します。 -フォーマットはA単位分割情報と同じです。 +### 20 User_Data: ユーザーデータ -### 17 ※未使用 +語に対して任意の文字列を登録します。 -このフィールドは未使用です。 -"*"(半角アスタリスク)を記入しておいてください。 +表記の長さは、32767 文字まで登録できます。 ### 例 以下にユーザー辞書ソースの例を記述します。 - 舞台藝術,5146,5146,8000,舞台藝術,名詞,普通名詞,一般,*,*,*,ブタイゲイジュツ,舞台芸術,*,*,*,*,* -
舞台芸術,5146,5146,8000,舞台芸術,名詞,普通名詞,一般,*,*,*,ブタイゲイジュツ,舞台芸術,*,*,*,*,* - コンピュータ学院,4786,5146,8000,コンピュータ学院,名詞,固有名詞,一般,*,*,*,コンピュータガクイン,コンピューター学院,*,*,*,*,* - コンピューター学院,4786,5146,8000,コンピューター学院,名詞,固有名詞,一般,*,*,*,コンピューターガクイン,コンピューター学院,*,*,*,*,* - モゲラ東京,5144,4792,4561,モゲラ東京,名詞,固有名詞,一般,*,*,*,モゲラトウキョウ,モゲラ東京,*,B,"U5/東京,名詞,固有名詞,地名,一般,*,*,トウキョウ",*,* - モゲラ,-1,-1,0,モゲラ,名詞,固有名詞,一般,*,*,*,モゲラ,モゲラ,*,*,*,*,* - 回っ,1431,1431,12016,回っ,動詞,一般,*,*,五段-ラ行,連用形-促音便,マワッ,回っ,11,*,*,*,* - 回ら,1408,1408,13113,回ら,動詞,一般,*,*,五段-ラ行,未然形-一般,マワラ,回ら,11,*,*,*,* - 回り,1428,1428,10995,回り,動詞,一般,*,*,五段-ラ行,連用形-一般,マワリ,回り,11,*,*,*,* - 回りゃ,1399,1399,12918,回りゃ,動詞,一般,*,*,五段-ラ行,仮定形-融合,マワリャ,回りゃ,11,*,*,*,* - 回りゃ,1437,1437,13113,回りゃ,動詞,一般,*,*,五段-ラ行,連用形-融合,マワリャ,回りゃ,11,*,*,*,* - 回る,1414,1414,11824,回る,動詞,一般,*,*,五段-ラ行,終止形-一般,マワル,回る,11,*,*,*,* - 回る,1421,1421,11979,回る,動詞,一般,*,*,五段-ラ行,連体形-一般,マワル,回る,11,*,*,*,* - 回れ,1396,1396,15651,回れ,動詞,一般,*,*,五段-ラ行,仮定形-一般,マワレ,回れ,11,*,*,*,* - 回れ,1402,1402,13180,回れ,動詞,一般,*,*,五段-ラ行,命令形,マワレ,回れ,11,*,*,*,* - 回ろ,1402,1402,13180,回ろ,動詞,一般,*,*,五段-ラ行,命令形,マワロ,回ろ,11,*,*,*,* - 回ろ,1405,1405,12745,回ろ,動詞,一般,*,*,五段-ラ行,意志推量形,マワロ,回ろ,11,*,*,*,* - 回ろう,1405,1405,12745,回ろう,動詞,一般,*,*,五段-ラ行,意志推量形,マワロウ,回ろう,11,*,*,*,* - 回ろっ,1405,1405,12745,回ろっ,動詞,一般,*,*,五段-ラ行,意志推量形,マワロッ,回ろっ,11,*,*,*,* - 回ん,1411,1411,13812,回ん,動詞,一般,*,*,五段-ラ行,未然形-撥音便,マワン,回ん,11,*,*,*,* - 回ん,1417,1417,14370,回ん,動詞,一般,*,*,五段-ラ行,終止形-撥音便,マワン,回ん,11,*,*,*,* - 回ん,1424,1424,14139,回ん,動詞,一般,*,*,五段-ラ行,連体形-撥音便,マワン,回ん,11,*,*,*,* - 回ん,1434,1434,13641,回ん,動詞,一般,*,*,五段-ラ行,連用形-撥音便,マワン,回ん,11,*,*,*,* +```csv +Surface,LeftId,RightId,Cost,POS1,POS2,POS3,POS4,POS5,POS6,ReadingForm,NormalizedForm,DictionaryForm,Split_A,Split_B,WordStructure +舞台藝術,5146,5146,8000,名詞,普通名詞,一般,*,*,*,ブタイゲイジュツ,,,,, +舞台芸術,5146,5146,8000,名詞,普通名詞,一般,*,*,*,ブタイゲイジュツ,,,,, +コンピュータ学院,4786,5146,8000,名詞,固有名詞,一般,*,*,*,コンピュータガクイン,コンピューター学院,,,, +コンピューター学院,4786,5146,8000,名詞,固有名詞,一般,*,*,*,コンピューターガクイン,,,,, +モゲラ東京,5144,4792,4561,名詞,固有名詞,一般,*,*,*,モゲラトウキョウ,,,"モゲラ,名詞,固有名詞,一般,*,*,*,モゲラ/東京,名詞,固有名詞,地名,一般,*,*,トウキョウ",, 
+モゲラ,-1,-1,0,名詞,固有名詞,一般,*,*,*,モゲラ,,,,, +回っ,1431,1431,12016,動詞,一般,*,*,五段-ラ行,連用形-促音便,マワッ,回っ,"回る,動詞,一般,*,*,五段-ラ行,終止形-一般,マワル",,, +回ら,1408,1408,13113,動詞,一般,*,*,五段-ラ行,未然形-一般,マワラ,回ら,"回る,動詞,一般,*,*,五段-ラ行,終止形-一般,マワル",,, +回り,1428,1428,10995,動詞,一般,*,*,五段-ラ行,連用形-一般,マワリ,回り,"回る,動詞,一般,*,*,五段-ラ行,終止形-一般,マワル",,, +回りゃ,1399,1399,12918,動詞,一般,*,*,五段-ラ行,仮定形-融合,マワリャ,回りゃ,"回る,動詞,一般,*,*,五段-ラ行,終止形-一般,マワル",,, +回りゃ,1437,1437,13113,動詞,一般,*,*,五段-ラ行,連用形-融合,マワリャ,回りゃ,"回る,動詞,一般,*,*,五段-ラ行,終止形-一般,マワル",,, +回る,1414,1414,11824,動詞,一般,*,*,五段-ラ行,終止形-一般,マワル,回る,"回る,動詞,一般,*,*,五段-ラ行,終止形-一般,マワル",,, +回る,1421,1421,11979,動詞,一般,*,*,五段-ラ行,連体形-一般,マワル,回る,"回る,動詞,一般,*,*,五段-ラ行,終止形-一般,マワル",,, +回れ,1396,1396,15651,動詞,一般,*,*,五段-ラ行,仮定形-一般,マワレ,回れ,"回る,動詞,一般,*,*,五段-ラ行,終止形-一般,マワル",,, +回れ,1402,1402,13180,動詞,一般,*,*,五段-ラ行,命令形,マワレ,回れ,"回る,動詞,一般,*,*,五段-ラ行,終止形-一般,マワル",,, +回ろ,1402,1402,13180,動詞,一般,*,*,五段-ラ行,命令形,マワロ,回ろ,"回る,動詞,一般,*,*,五段-ラ行,終止形-一般,マワル",,, +回ろ,1405,1405,12745,動詞,一般,*,*,五段-ラ行,意志推量形,マワロ,回ろ,"回る,動詞,一般,*,*,五段-ラ行,終止形-一般,マワル",,, +回ろう,1405,1405,12745,動詞,一般,*,*,五段-ラ行,意志推量形,マワロウ,回ろう,"回る,動詞,一般,*,*,五段-ラ行,終止形-一般,マワル",,, +回ろっ,1405,1405,12745,動詞,一般,*,*,五段-ラ行,意志推量形,マワロッ,回ろっ,"回る,動詞,一般,*,*,五段-ラ行,終止形-一般,マワル",,, +回ん,1411,1411,13812,動詞,一般,*,*,五段-ラ行,未然形-撥音便,マワン,回ん,"回る,動詞,一般,*,*,五段-ラ行,終止形-一般,マワル",,, +回ん,1417,1417,14370,動詞,一般,*,*,五段-ラ行,終止形-撥音便,マワン,回ん,"回る,動詞,一般,*,*,五段-ラ行,終止形-一般,マワル",,, +回ん,1424,1424,14139,動詞,一般,*,*,五段-ラ行,連体形-撥音便,マワン,回ん,"回る,動詞,一般,*,*,五段-ラ行,終止形-一般,マワル",,, +回ん,1434,1434,13641,動詞,一般,*,*,五段-ラ行,連用形-撥音便,マワン,回ん,"回る,動詞,一般,*,*,五段-ラ行,終止形-一般,マワル",,, +``` ## バイナリ辞書の作成 ユーザー辞書ソースファイルからバイナリ辞書ファイルを作成します。 -`$ java -Dfile.encoding=UTF-8 -cp sudachi-XX.jar com.worksap.nlp.sudachi.dictionary.UserDictionaryBuilder -o output.dic -s system_core.dic [-d comment] input.csv` +`$ java -Dfile.encoding=UTF-8 -cp sudachi-XX.jar com.worksap.nlp.sudachi.dictionary.UserDictionaryBuilder -o output.dic -s system_core.dic [-d description] input.csv` ### 引数 -- output.dic 出力するバイナリ辞書ファイル名 -- system_core.dic Sudachi のシステム辞書 -- comment バイナリ辞書のヘッダーに埋め込むコメント -- input.csv 
ユーザ辞書ソースファイル名 +- output.dic: 出力するバイナリ辞書ファイル名 +- system_core.dic: Sudachi のシステム辞書 +- description: バイナリ辞書のヘッダーに埋め込むコメント +- input.csv: ユーザ辞書ソースファイル名 ## ユーザー辞書の利用法 @@ -228,4 +290,4 @@ Sudachi コマンドラインツールでユーザ辞書設定を指定する場 ### ユーザー辞書の優先順位 Sudachi は基本的にユーザー辞書のエントリーをシステム辞書のエントリーより優先します。 -また、複数のユーザー辞書があるとき、後ろにあればあるほど優先順位が高いです。 \ No newline at end of file +また、複数のユーザー辞書があるとき、後ろにあるものほど優先順位が高くなります。 diff --git a/docs/user_dict_legacy.md b/docs/user_dict_legacy.md new file mode 100644 index 00000000..3d881349 --- /dev/null +++ b/docs/user_dict_legacy.md @@ -0,0 +1,231 @@ +# Sudachi ユーザー辞書作成方法 + +Sudachi ではユーザー辞書をもちいて、システム辞書で不足している語を補ったり、システム辞書の語と同一見出しの語を登録することによって解析結果上の品詞などの情報を差し替えることができます。 + +## ユーザー辞書ソースのフォーマット + +ユーザー辞書の作成は、登録したい見出しを記述したユーザー辞書ソースを用いて行います。 +ユーザー辞書ソースのフォーマットは以下の通りです。 +なお、このファイルは、CSV 形式 (RFC 4180) で保存します。文字コードは、UTF-8 を使用します。 + +- 0 見出し (TRIE 用) +- 1 左連接 ID +- 2 右連接 ID +- 3 コスト +- 4 見出し (解析結果表示用) +- 5 品詞 1 +- 6 品詞 2 +- 7 品詞 3 +- 8 品詞 4 +- 9 品詞 (活用型) +- 10 品詞 (活用形) +- 11 読み +- 12 正規化表記 +- 13 辞書形 ID +- 14 分割タイプ +- 15 A 単位分割情報 +- 16 B 単位分割情報 +- 17 ※未使用 + +各項目について以下に説明します。 + +### 0 見出し (TRIE 用) + +形態素解析に使用される見出し表記です。 +表記の長さは、255 文字まで登録できます。 + +#### 文字正規化 + +見出しは、「Sudachi の文字正規化がおこなわれた後の形」で登録してください。 + +Sudachi では、文字正規化が行われた**後に**見出しを引きます。そのため、「正規化後に現れない形」で見出しが表記されている場合、その語はどのような場合でもマッチすることがありません。例えば、「ラテン文字の大文字」で見出しを表記した場合、Sudachi 内部では正規化後の「小文字」になったもので見出しを探すため、この大文字のものとマッチすることがありません。 + +Sudachi では、以下の文字正規化を行っています。挙動の詳細は、[Sudachi ドキュメントの該当箇所](https://github.com/WorksApplications/Sudachi#%E6%96%87%E5%AD%97%E6%AD%A3%E8%A6%8F%E5%8C%96)を参照してください。 + +- 小文字化 +- NFKC をつかった Unicode 正規化 + - ただし、設定ファイル `rewrite.def` に定義される抑制、置換が優先 + +ユーザー辞書の見出しへは、文字正規化は自動的には適用されません。これは、ユーザーが想定しづらい挙動を避けるためです。そのため、ユーザー辞書の作成者が文字正規化を意識して語を表記する必要があります。 + +### 1 左連接 ID + +形態素解析の連接判定(左連接)に使用される ID です。 +使用できる ID の種類は、unidic-mecab 2.1.2 の left-id.def を参照してください。 + +普通名詞の登録であれば、以下のいずれかを推奨 + +- 5146 名詞,普通名詞,一般,_,_,_,_,\*,漢 +- 5133 名詞,普通名詞,サ変可能,_,_,_,_,\*,漢 + +固有名詞の登録であれば、以下のいずれかを推奨 + +- 4786 
名詞,固有名詞,一般,_,_,_,_,\*,固 +- 4789 名詞,固有名詞,人名,名,_,_,_,_,固 +- 4790 名詞,固有名詞,人名,姓,_,_,_,_,固 + +### 2 右連接 ID + +形態素解析の連接判定(右連接)に使用される ID です。 +使用できる ID の種類は、unidic-mecab 2.1.2 の right-id.def を参照してください。 + +普通名詞の登録であれば、以下のいずれかを推奨 + +- 5146 名詞,普通名詞,一般,_,_,_,_,\*,漢 +- 5133 名詞,普通名詞,サ変可能,_,_,_,_,\*,漢 + +固有名詞の登録であれば、以下のいずれかを推奨 + +- 4786 名詞,固有名詞,一般,_,_,_,_,\*,固 +- 4789 名詞,固有名詞,人名,名,_,_,_,_,固 +- 4790 名詞,固有名詞,人名,姓,_,_,_,_,固 + +### 3 コスト + +形態素解析に使用される見出し表記のコスト値です。 +"-32767 ~ 32767" までの整数値で指定できます。 +値を小さくするほど、登録した見出し表記が解析結果として出やすくなります。 +なお、"-32768" を指定すると、ユーザー辞書読み込み時に自動推定した値を付与します。 + +名詞類の登録であれば、"5000 ~ 9000" を推奨 + +### 4 見出し (解析結果表示用) + +「0 見出し (TRIE 用)」と同じものを指定してください。 + +### 5 品詞 1 + +システム辞書の品詞、あるいはユーザー定義の任意の品詞を記述できます。 +システム辞書で使用する品詞 1 の種類は、unidic-mecab 2.1.2 の 品詞体系を参照してください。 + +### 6 品詞 2 + +システム辞書の品詞、あるいはユーザー定義の任意の品詞を記述できます。 +システム辞書で使用する品詞 2 の種類は、unidic-mecab 2.1.2 の 品詞体系を参照してください。 + +### 7 品詞 3 + +システム辞書の品詞、あるいはユーザー定義の任意の品詞を記述できます。 +システム辞書で使用する品詞 3 の種類は、unidic-mecab 2.1.2 の 品詞体系を参照してください。 + +### 8 品詞 4 + +システム辞書の品詞、あるいはユーザー定義の任意の品詞を記述できます。 +システム辞書で使用する品詞 4 の種類は、unidic-mecab 2.1.2 の 品詞体系を参照してください。 + +### 9 品詞 (活用型) + +システム辞書の品詞、あるいはユーザー定義の任意の品詞を記述できます。 +システム辞書で使用する品詞 (活用型)の種類は、unidic-mecab 2.1.2 の 品詞体系を参照してください。 + +### 10 品詞 (活用形) + +システム辞書の品詞、あるいはユーザー定義の任意の品詞を記述できます。 +システム辞書で使用する品詞 (活用形)の種類は、unidic-mecab 2.1.2 の 品詞体系を参照してください。 + +### 11 読み + +見出し表記の読みを記述します。 +全角カタカナで記述します。 +省略することもできます。(その場合は、何も記述しない) + +### 12 正規化表記 + +見出し表記の正規化表記を記述します。 +「見出し表記=正規化表記」の場合は、見出し表記を記述します。 + +### 13 辞書形 ID + +活用のある語に対して、その語の辞書形(終止形表記)を指定するための情報です。 + +ユーザー辞書ソースの行数(0 始まりで何行目か)が ID です。 +対象となる語の辞書形の ID を記述します。 + +よって、辞書形 ID の情報をつけた後、ファイル内の行の並びが変わるような変更(ソートや挿入など)は加えないでください。 + +なお、活用のない語については、このフィールドは、"\*"(半角アスタリスク)を記入しておいてください。 + +### 14 分割タイプ + +語の分割単位タイプ (A / B / C) を記述します。後述の分割情報を記述しない場合は "\*" (半角アスタリスク) でもかまいません。 + +### 15 A 単位分割情報 + +分割単位タイプ B または C の語について、A 単位に分割するための情報です。 + +構成語の ID または構成語情報を "/" (半角スラッシュ) で区切って記述します。 + +構成語の ID はその語が記述されている行番号 (0 始まり) か、その先頭に "U" を加えた文字列です。ユーザー辞書内の語を参照するときに "U" 
をつけます。 + +構成語情報は語の見出し (解析結果表示用)、品詞 1-4、品詞 (活用型)、品詞 (活用形)、読みを "," (カンマ) で区切った文字列です。 +構成語情報を記述するときは分割情報のフィールド全体を " (ダブルクォーテーション) で囲む必要があります。 +構成語情報に記述する語は別途記述されている必要があります。構成語がシステム辞書内にあるかユーザー辞書内にあるかは自動的に判別します。 + +なお構成語としてのみ利用される語は連接 ID に-1 を記述すると、単独の語として出現しなくなります。 + +### 16 B 単位分割情報 + +分割単位タイプ C の語について、B 単位に分割するための情報です。 + +フォーマットは A 単位分割情報と同じです。 + +### 17 ※未使用 + +このフィールドは未使用です。 +"\*"(半角アスタリスク)を記入しておいてください。 + +### 例 + +以下にユーザー辞書ソースの例を記述します。 + + 舞台藝術,5146,5146,8000,舞台藝術,名詞,普通名詞,一般,*,*,*,ブタイゲイジュツ,舞台芸術,*,*,*,*,* + 舞台芸術,5146,5146,8000,舞台芸術,名詞,普通名詞,一般,*,*,*,ブタイゲイジュツ,舞台芸術,*,*,*,*,* + コンピュータ学院,4786,5146,8000,コンピュータ学院,名詞,固有名詞,一般,*,*,*,コンピュータガクイン,コンピューター学院,*,*,*,*,* + コンピューター学院,4786,5146,8000,コンピューター学院,名詞,固有名詞,一般,*,*,*,コンピューターガクイン,コンピューター学院,*,*,*,*,* + モゲラ東京,5144,4792,4561,モゲラ東京,名詞,固有名詞,一般,*,*,*,モゲラトウキョウ,モゲラ東京,*,B,"U5/東京,名詞,固有名詞,地名,一般,*,*,トウキョウ",*,* + モゲラ,-1,-1,0,モゲラ,名詞,固有名詞,一般,*,*,*,モゲラ,モゲラ,*,*,*,*,* + 回っ,1431,1431,12016,回っ,動詞,一般,*,*,五段-ラ行,連用形-促音便,マワッ,回っ,11,*,*,*,* + 回ら,1408,1408,13113,回ら,動詞,一般,*,*,五段-ラ行,未然形-一般,マワラ,回ら,11,*,*,*,* + 回り,1428,1428,10995,回り,動詞,一般,*,*,五段-ラ行,連用形-一般,マワリ,回り,11,*,*,*,* + 回りゃ,1399,1399,12918,回りゃ,動詞,一般,*,*,五段-ラ行,仮定形-融合,マワリャ,回りゃ,11,*,*,*,* + 回りゃ,1437,1437,13113,回りゃ,動詞,一般,*,*,五段-ラ行,連用形-融合,マワリャ,回りゃ,11,*,*,*,* + 回る,1414,1414,11824,回る,動詞,一般,*,*,五段-ラ行,終止形-一般,マワル,回る,11,*,*,*,* + 回る,1421,1421,11979,回る,動詞,一般,*,*,五段-ラ行,連体形-一般,マワル,回る,11,*,*,*,* + 回れ,1396,1396,15651,回れ,動詞,一般,*,*,五段-ラ行,仮定形-一般,マワレ,回れ,11,*,*,*,* + 回れ,1402,1402,13180,回れ,動詞,一般,*,*,五段-ラ行,命令形,マワレ,回れ,11,*,*,*,* + 回ろ,1402,1402,13180,回ろ,動詞,一般,*,*,五段-ラ行,命令形,マワロ,回ろ,11,*,*,*,* + 回ろ,1405,1405,12745,回ろ,動詞,一般,*,*,五段-ラ行,意志推量形,マワロ,回ろ,11,*,*,*,* + 回ろう,1405,1405,12745,回ろう,動詞,一般,*,*,五段-ラ行,意志推量形,マワロウ,回ろう,11,*,*,*,* + 回ろっ,1405,1405,12745,回ろっ,動詞,一般,*,*,五段-ラ行,意志推量形,マワロッ,回ろっ,11,*,*,*,* + 回ん,1411,1411,13812,回ん,動詞,一般,*,*,五段-ラ行,未然形-撥音便,マワン,回ん,11,*,*,*,* + 回ん,1417,1417,14370,回ん,動詞,一般,*,*,五段-ラ行,終止形-撥音便,マワン,回ん,11,*,*,*,* + 回ん,1424,1424,14139,回ん,動詞,一般,*,*,五段-ラ行,連体形-撥音便,マワン,回ん,11,*,*,*,* + 
回ん,1434,1434,13641,回ん,動詞,一般,*,*,五段-ラ行,連用形-撥音便,マワン,回ん,11,*,*,*,* + +## バイナリ辞書の作成 + +ユーザー辞書ソースファイルからバイナリ辞書ファイルを作成します。 + +`$ java -Dfile.encoding=UTF-8 -cp sudachi-XX.jar com.worksap.nlp.sudachi.dictionary.UserDictionaryBuilder -o output.dic -s system_core.dic [-d comment] input.csv` + +### 引数 + +- output.dic 出力するバイナリ辞書ファイル名 +- system_core.dic Sudachi のシステム辞書 +- comment バイナリ辞書のヘッダーに埋め込むコメント +- input.csv ユーザ辞書ソースファイル名 + +## ユーザー辞書の利用法 + +設定の JSON に以下のような項目を追加します。ユーザー辞書は複数指定することができます。 + +`"userDict" : [ "user1.dic", "user2.dic" ]` + +Sudachi コマンドラインツールでユーザ辞書設定を指定する場合は、以下のように実行します。 + +`$ java -jar sudachi-XX.jar -s '{"userDict":["user1.dic","user2.dic"]}'` + +### ユーザー辞書の優先順位 + +Sudachi は基本的にユーザー辞書のエントリーをシステム辞書のエントリーより優先します。 +また、複数のユーザー辞書があるとき、後ろにあればあるほど優先順位が高いです。 From 9bbde2372763a335882c68549f82a6f6471b914a Mon Sep 17 00:00:00 2001 From: mh-northlander Date: Mon, 2 Sep 2024 09:06:27 +0900 Subject: [PATCH 79/94] add pos.csv --- src/main/resources/pos.csv | 1559 ++++++++++++++++++++++++++++++++++++ 1 file changed, 1559 insertions(+) create mode 100644 src/main/resources/pos.csv diff --git a/src/main/resources/pos.csv b/src/main/resources/pos.csv new file mode 100644 index 00000000..b1e8ee3a --- /dev/null +++ b/src/main/resources/pos.csv @@ -0,0 +1,1559 @@ +POS_ID,POS1,POS2,POS3,POS4,POS5,POS6 +0,代名詞,*,*,*,*,* +1,副詞,*,*,*,*,* +2,助動詞,*,*,*,下一段-ア行,仮定形-一般 +3,助動詞,*,*,*,下一段-ア行,仮定形-融合 +4,助動詞,*,*,*,下一段-ア行,命令形 +5,助動詞,*,*,*,下一段-ア行,意志推量形 +6,助動詞,*,*,*,下一段-ア行,未然形-一般 +7,助動詞,*,*,*,下一段-ア行,終止形-一般 +8,助動詞,*,*,*,下一段-ア行,終止形-撥音便 +9,助動詞,*,*,*,下一段-ア行,連体形-一般 +10,助動詞,*,*,*,下一段-ア行,連体形-撥音便 +11,助動詞,*,*,*,下一段-ア行,連用形-一般 +12,助動詞,*,*,*,下一段-カ行,仮定形-一般 +13,助動詞,*,*,*,下一段-カ行,仮定形-融合 +14,助動詞,*,*,*,下一段-カ行,命令形 +15,助動詞,*,*,*,下一段-カ行,意志推量形 +16,助動詞,*,*,*,下一段-カ行,未然形-一般 +17,助動詞,*,*,*,下一段-カ行,終止形-一般 +18,助動詞,*,*,*,下一段-カ行,終止形-撥音便 +19,助動詞,*,*,*,下一段-カ行,連体形-一般 +20,助動詞,*,*,*,下一段-カ行,連体形-撥音便 +21,助動詞,*,*,*,下一段-カ行,連用形-一般 +22,助動詞,*,*,*,下一段-ガ行,仮定形-一般 +23,助動詞,*,*,*,下一段-ガ行,仮定形-融合 +24,助動詞,*,*,*,下一段-ガ行,命令形 +25,助動詞,*,*,*,下一段-ガ行,意志推量形 
+26,助動詞,*,*,*,下一段-ガ行,未然形-一般 +27,助動詞,*,*,*,下一段-ガ行,終止形-一般 +28,助動詞,*,*,*,下一段-ガ行,終止形-撥音便 +29,助動詞,*,*,*,下一段-ガ行,連体形-一般 +30,助動詞,*,*,*,下一段-ガ行,連体形-撥音便 +31,助動詞,*,*,*,下一段-ガ行,連用形-一般 +32,助動詞,*,*,*,下一段-サ行,仮定形-一般 +33,助動詞,*,*,*,下一段-サ行,仮定形-融合 +34,助動詞,*,*,*,下一段-サ行,命令形 +35,助動詞,*,*,*,下一段-サ行,意志推量形 +36,助動詞,*,*,*,下一段-サ行,未然形-一般 +37,助動詞,*,*,*,下一段-サ行,終止形-一般 +38,助動詞,*,*,*,下一段-サ行,終止形-撥音便 +39,助動詞,*,*,*,下一段-サ行,連体形-一般 +40,助動詞,*,*,*,下一段-サ行,連体形-撥音便 +41,助動詞,*,*,*,下一段-サ行,連用形-一般 +42,助動詞,*,*,*,下一段-タ行,仮定形-一般 +43,助動詞,*,*,*,下一段-タ行,仮定形-融合 +44,助動詞,*,*,*,下一段-タ行,命令形 +45,助動詞,*,*,*,下一段-タ行,意志推量形 +46,助動詞,*,*,*,下一段-タ行,未然形-一般 +47,助動詞,*,*,*,下一段-タ行,終止形-一般 +48,助動詞,*,*,*,下一段-タ行,終止形-撥音便 +49,助動詞,*,*,*,下一段-タ行,連体形-一般 +50,助動詞,*,*,*,下一段-タ行,連体形-撥音便 +51,助動詞,*,*,*,下一段-タ行,連体形-省略 +52,助動詞,*,*,*,下一段-タ行,連用形-一般 +53,助動詞,*,*,*,下一段-ダ行,仮定形-一般 +54,助動詞,*,*,*,下一段-ダ行,仮定形-融合 +55,助動詞,*,*,*,下一段-ダ行,命令形 +56,助動詞,*,*,*,下一段-ダ行,意志推量形 +57,助動詞,*,*,*,下一段-ダ行,未然形-一般 +58,助動詞,*,*,*,下一段-ダ行,終止形-一般 +59,助動詞,*,*,*,下一段-ダ行,終止形-撥音便 +60,助動詞,*,*,*,下一段-ダ行,連体形-一般 +61,助動詞,*,*,*,下一段-ダ行,連体形-撥音便 +62,助動詞,*,*,*,下一段-ダ行,連体形-省略 +63,助動詞,*,*,*,下一段-ダ行,連用形-一般 +64,助動詞,*,*,*,下一段-マ行,仮定形-一般 +65,助動詞,*,*,*,下一段-マ行,仮定形-融合 +66,助動詞,*,*,*,下一段-マ行,命令形 +67,助動詞,*,*,*,下一段-マ行,意志推量形 +68,助動詞,*,*,*,下一段-マ行,未然形-一般 +69,助動詞,*,*,*,下一段-マ行,終止形-一般 +70,助動詞,*,*,*,下一段-マ行,終止形-撥音便 +71,助動詞,*,*,*,下一段-マ行,連体形-一般 +72,助動詞,*,*,*,下一段-マ行,連体形-撥音便 +73,助動詞,*,*,*,下一段-マ行,連用形-一般 +74,助動詞,*,*,*,下一段-ラ行,仮定形-一般 +75,助動詞,*,*,*,下一段-ラ行,仮定形-融合 +76,助動詞,*,*,*,下一段-ラ行,命令形 +77,助動詞,*,*,*,下一段-ラ行,意志推量形 +78,助動詞,*,*,*,下一段-ラ行,未然形-一般 +79,助動詞,*,*,*,下一段-ラ行,終止形-一般 +80,助動詞,*,*,*,下一段-ラ行,終止形-撥音便 +81,助動詞,*,*,*,下一段-ラ行,連体形-一般 +82,助動詞,*,*,*,下一段-ラ行,連体形-撥音便 +83,助動詞,*,*,*,下一段-ラ行,連用形-一般 +84,助動詞,*,*,*,五段-カ行,仮定形-一般 +85,助動詞,*,*,*,五段-カ行,仮定形-融合 +86,助動詞,*,*,*,五段-カ行,命令形 +87,助動詞,*,*,*,五段-カ行,意志推量形 +88,助動詞,*,*,*,五段-カ行,未然形-一般 +89,助動詞,*,*,*,五段-カ行,終止形-一般 +90,助動詞,*,*,*,五段-カ行,連体形-一般 +91,助動詞,*,*,*,五段-カ行,連用形-イ音便 +92,助動詞,*,*,*,五段-カ行,連用形-一般 +93,助動詞,*,*,*,五段-カ行,連用形-促音便 +94,助動詞,*,*,*,五段-カ行,連用形-融合 +95,助動詞,*,*,*,五段-サ行,仮定形-一般 +96,助動詞,*,*,*,五段-サ行,仮定形-融合 +97,助動詞,*,*,*,五段-サ行,命令形 
+98,助動詞,*,*,*,五段-サ行,意志推量形 +99,助動詞,*,*,*,五段-サ行,未然形-一般 +100,助動詞,*,*,*,五段-サ行,終止形-一般 +101,助動詞,*,*,*,五段-サ行,連体形-一般 +102,助動詞,*,*,*,五段-サ行,連用形-一般 +103,助動詞,*,*,*,五段-サ行,連用形-融合 +104,助動詞,*,*,*,五段-ラ行,仮定形-一般 +105,助動詞,*,*,*,五段-ラ行,仮定形-融合 +106,助動詞,*,*,*,五段-ラ行,命令形 +107,助動詞,*,*,*,五段-ラ行,意志推量形 +108,助動詞,*,*,*,五段-ラ行,未然形-一般 +109,助動詞,*,*,*,五段-ラ行,未然形-撥音便 +110,助動詞,*,*,*,五段-ラ行,終止形-一般 +111,助動詞,*,*,*,五段-ラ行,終止形-促音便 +112,助動詞,*,*,*,五段-ラ行,終止形-撥音便 +113,助動詞,*,*,*,五段-ラ行,連体形-一般 +114,助動詞,*,*,*,五段-ラ行,連体形-撥音便 +115,助動詞,*,*,*,五段-ラ行,連体形-省略 +116,助動詞,*,*,*,五段-ラ行,連用形-イ音便 +117,助動詞,*,*,*,五段-ラ行,連用形-一般 +118,助動詞,*,*,*,五段-ラ行,連用形-促音便 +119,助動詞,*,*,*,五段-ラ行,連用形-撥音便 +120,助動詞,*,*,*,五段-ラ行,連用形-融合 +121,助動詞,*,*,*,五段-ワア行,仮定形-一般 +122,助動詞,*,*,*,五段-ワア行,命令形 +123,助動詞,*,*,*,五段-ワア行,意志推量形 +124,助動詞,*,*,*,五段-ワア行,未然形-一般 +125,助動詞,*,*,*,五段-ワア行,終止形-一般 +126,助動詞,*,*,*,五段-ワア行,連体形-一般 +127,助動詞,*,*,*,五段-ワア行,連用形-ウ音便 +128,助動詞,*,*,*,五段-ワア行,連用形-一般 +129,助動詞,*,*,*,五段-ワア行,連用形-促音便 +130,助動詞,*,*,*,助動詞-ジャ,意志推量形 +131,助動詞,*,*,*,助動詞-ジャ,終止形-一般 +132,助動詞,*,*,*,助動詞-ジャ,連用形-一般 +133,助動詞,*,*,*,助動詞-タ,仮定形-一般 +134,助動詞,*,*,*,助動詞-タ,意志推量形 +135,助動詞,*,*,*,助動詞-タ,未然形-一般 +136,助動詞,*,*,*,助動詞-タ,終止形-一般 +137,助動詞,*,*,*,助動詞-タ,連体形-一般 +138,助動詞,*,*,*,助動詞-タイ,仮定形-一般 +139,助動詞,*,*,*,助動詞-タイ,仮定形-融合 +140,助動詞,*,*,*,助動詞-タイ,意志推量形 +141,助動詞,*,*,*,助動詞-タイ,未然形-一般 +142,助動詞,*,*,*,助動詞-タイ,終止形-一般 +143,助動詞,*,*,*,助動詞-タイ,語幹-一般 +144,助動詞,*,*,*,助動詞-タイ,連体形-一般 +145,助動詞,*,*,*,助動詞-タイ,連用形-ウ音便 +146,助動詞,*,*,*,助動詞-タイ,連用形-一般 +147,助動詞,*,*,*,助動詞-タイ,連用形-促音便 +148,助動詞,*,*,*,助動詞-タイ,連用形-融合 +149,助動詞,*,*,*,助動詞-ダ,仮定形-一般 +150,助動詞,*,*,*,助動詞-ダ,意志推量形 +151,助動詞,*,*,*,助動詞-ダ,未然形-一般 +152,助動詞,*,*,*,助動詞-ダ,終止形-一般 +153,助動詞,*,*,*,助動詞-ダ,終止形-融合 +154,助動詞,*,*,*,助動詞-ダ,連体形-一般 +155,助動詞,*,*,*,助動詞-ダ,連用形-ニ +156,助動詞,*,*,*,助動詞-ダ,連用形-一般 +157,助動詞,*,*,*,助動詞-ダ,連用形-促音便 +158,助動詞,*,*,*,助動詞-ダ,連用形-撥音便 +159,助動詞,*,*,*,助動詞-ダ,連用形-融合 +160,助動詞,*,*,*,助動詞-デス,意志推量形 +161,助動詞,*,*,*,助動詞-デス,終止形-一般 +162,助動詞,*,*,*,助動詞-デス,終止形-促音便 +163,助動詞,*,*,*,助動詞-デス,終止形-撥音便 +164,助動詞,*,*,*,助動詞-デス,連体形-一般 +165,助動詞,*,*,*,助動詞-デス,連体形-撥音便 +166,助動詞,*,*,*,助動詞-デス,連用形-一般 +167,助動詞,*,*,*,助動詞-ドス,意志推量形 
+168,助動詞,*,*,*,助動詞-ドス,終止形-一般 +169,助動詞,*,*,*,助動詞-ドス,連用形-一般 +170,助動詞,*,*,*,助動詞-ナイ,仮定形-一般 +171,助動詞,*,*,*,助動詞-ナイ,仮定形-融合 +172,助動詞,*,*,*,助動詞-ナイ,意志推量形 +173,助動詞,*,*,*,助動詞-ナイ,未然形-一般 +174,助動詞,*,*,*,助動詞-ナイ,終止形-一般 +175,助動詞,*,*,*,助動詞-ナイ,語幹-一般 +176,助動詞,*,*,*,助動詞-ナイ,連体形-一般 +177,助動詞,*,*,*,助動詞-ナイ,連用形-一般 +178,助動詞,*,*,*,助動詞-ナイ,連用形-促音便 +179,助動詞,*,*,*,助動詞-ナンダ,仮定形-一般 +180,助動詞,*,*,*,助動詞-ナンダ,未然形-一般 +181,助動詞,*,*,*,助動詞-ナンダ,終止形-一般 +182,助動詞,*,*,*,助動詞-ナンダ,連体形-一般 +183,助動詞,*,*,*,助動詞-ナンダ,連用形-一般 +184,助動詞,*,*,*,助動詞-ヌ,仮定形-一般 +185,助動詞,*,*,*,助動詞-ヌ,仮定形-融合 +186,助動詞,*,*,*,助動詞-ヌ,終止形-一般 +187,助動詞,*,*,*,助動詞-ヌ,終止形-撥音便 +188,助動詞,*,*,*,助動詞-ヌ,連体形-一般 +189,助動詞,*,*,*,助動詞-ヌ,連体形-撥音便 +190,助動詞,*,*,*,助動詞-ヌ,連体形-補助 +191,助動詞,*,*,*,助動詞-ヌ,連用形-一般 +192,助動詞,*,*,*,助動詞-ヘン,終止形-一般 +193,助動詞,*,*,*,助動詞-ヘン,連体形-一般 +194,助動詞,*,*,*,助動詞-ヘン,連用形-一般 +195,助動詞,*,*,*,助動詞-マイ,終止形-一般 +196,助動詞,*,*,*,助動詞-マイ,連体形-一般 +197,助動詞,*,*,*,助動詞-マス,仮定形-一般 +198,助動詞,*,*,*,助動詞-マス,命令形 +199,助動詞,*,*,*,助動詞-マス,意志推量形 +200,助動詞,*,*,*,助動詞-マス,未然形-一般 +201,助動詞,*,*,*,助動詞-マス,終止形-一般 +202,助動詞,*,*,*,助動詞-マス,終止形-促音便 +203,助動詞,*,*,*,助動詞-マス,終止形-撥音便 +204,助動詞,*,*,*,助動詞-マス,終止形-融合 +205,助動詞,*,*,*,助動詞-マス,連体形-一般 +206,助動詞,*,*,*,助動詞-マス,連体形-撥音便 +207,助動詞,*,*,*,助動詞-マス,連用形-一般 +208,助動詞,*,*,*,助動詞-ヤ,意志推量形 +209,助動詞,*,*,*,助動詞-ヤ,終止形-一般 +210,助動詞,*,*,*,助動詞-ヤ,連用形-促音便 +211,助動詞,*,*,*,助動詞-ヤス,仮定形-一般 +212,助動詞,*,*,*,助動詞-ヤス,命令形 +213,助動詞,*,*,*,助動詞-ヤス,意志推量形 +214,助動詞,*,*,*,助動詞-ヤス,未然形-一般 +215,助動詞,*,*,*,助動詞-ヤス,終止形-一般 +216,助動詞,*,*,*,助動詞-ヤス,連体形-一般 +217,助動詞,*,*,*,助動詞-ヤス,連用形-一般 +218,助動詞,*,*,*,助動詞-ラシイ,仮定形-一般 +219,助動詞,*,*,*,助動詞-ラシイ,仮定形-融合 +220,助動詞,*,*,*,助動詞-ラシイ,終止形-一般 +221,助動詞,*,*,*,助動詞-ラシイ,語幹-一般 +222,助動詞,*,*,*,助動詞-ラシイ,連体形-一般 +223,助動詞,*,*,*,助動詞-ラシイ,連用形-ウ音便 +224,助動詞,*,*,*,助動詞-ラシイ,連用形-一般 +225,助動詞,*,*,*,助動詞-ラシイ,連用形-促音便 +226,助動詞,*,*,*,助動詞-レル,仮定形-一般 +227,助動詞,*,*,*,助動詞-レル,仮定形-融合 +228,助動詞,*,*,*,助動詞-レル,命令形 +229,助動詞,*,*,*,助動詞-レル,意志推量形 +230,助動詞,*,*,*,助動詞-レル,未然形-一般 +231,助動詞,*,*,*,助動詞-レル,未然形-撥音便 +232,助動詞,*,*,*,助動詞-レル,終止形-一般 +233,助動詞,*,*,*,助動詞-レル,終止形-撥音便 +234,助動詞,*,*,*,助動詞-レル,連体形-一般 +235,助動詞,*,*,*,助動詞-レル,連用形-一般 +236,助動詞,*,*,*,文語下二段-サ行,命令形 
+237,助動詞,*,*,*,文語下二段-サ行,已然形-一般 +238,助動詞,*,*,*,文語下二段-サ行,未然形-一般 +239,助動詞,*,*,*,文語下二段-サ行,終止形-一般 +240,助動詞,*,*,*,文語下二段-サ行,連体形-一般 +241,助動詞,*,*,*,文語下二段-サ行,連用形-一般 +242,助動詞,*,*,*,文語下二段-マ行,命令形 +243,助動詞,*,*,*,文語下二段-マ行,已然形-一般 +244,助動詞,*,*,*,文語下二段-マ行,未然形-一般 +245,助動詞,*,*,*,文語下二段-マ行,終止形-一般 +246,助動詞,*,*,*,文語下二段-マ行,連体形-一般 +247,助動詞,*,*,*,文語下二段-マ行,連用形-一般 +248,助動詞,*,*,*,文語下二段-ラ行,命令形 +249,助動詞,*,*,*,文語下二段-ラ行,已然形-一般 +250,助動詞,*,*,*,文語下二段-ラ行,未然形-一般 +251,助動詞,*,*,*,文語下二段-ラ行,終止形-一般 +252,助動詞,*,*,*,文語下二段-ラ行,連体形-一般 +253,助動詞,*,*,*,文語下二段-ラ行,連用形-一般 +254,助動詞,*,*,*,文語助動詞-キ,已然形-一般 +255,助動詞,*,*,*,文語助動詞-キ,未然形-一般 +256,助動詞,*,*,*,文語助動詞-キ,終止形-一般 +257,助動詞,*,*,*,文語助動詞-キ,連体形-一般 +258,助動詞,*,*,*,文語助動詞-ケム,已然形-一般 +259,助動詞,*,*,*,文語助動詞-ケム,終止形-一般 +260,助動詞,*,*,*,文語助動詞-ケム,終止形-撥音便 +261,助動詞,*,*,*,文語助動詞-ケム,連体形-一般 +262,助動詞,*,*,*,文語助動詞-ケム,連体形-撥音便 +263,助動詞,*,*,*,文語助動詞-ケリ,ク語法 +264,助動詞,*,*,*,文語助動詞-ケリ,已然形-一般 +265,助動詞,*,*,*,文語助動詞-ケリ,未然形-一般 +266,助動詞,*,*,*,文語助動詞-ケリ,終止形-一般 +267,助動詞,*,*,*,文語助動詞-ケリ,連体形-一般 +268,助動詞,*,*,*,文語助動詞-ゴトシ,終止形-一般 +269,助動詞,*,*,*,文語助動詞-ゴトシ,連体形-一般 +270,助動詞,*,*,*,文語助動詞-ゴトシ,連用形-一般 +271,助動詞,*,*,*,文語助動詞-ゴトシ,連用形-補助 +272,助動詞,*,*,*,文語助動詞-ザマス,未然形-一般 +273,助動詞,*,*,*,文語助動詞-ザマス,終止形-一般 +274,助動詞,*,*,*,文語助動詞-ザマス,連体形-一般 +275,助動詞,*,*,*,文語助動詞-ザマス,連用形-一般 +276,助動詞,*,*,*,文語助動詞-ザンス,已然形-一般 +277,助動詞,*,*,*,文語助動詞-ザンス,未然形-一般 +278,助動詞,*,*,*,文語助動詞-ザンス,終止形-一般 +279,助動詞,*,*,*,文語助動詞-ザンス,連体形-一般 +280,助動詞,*,*,*,文語助動詞-ザンス,連用形-一般 +281,助動詞,*,*,*,文語助動詞-ジ,已然形-一般 +282,助動詞,*,*,*,文語助動詞-ジ,終止形-一般 +283,助動詞,*,*,*,文語助動詞-ジ,連体形-一般 +284,助動詞,*,*,*,文語助動詞-ズ,ク語法 +285,助動詞,*,*,*,文語助動詞-ズ,命令形 +286,助動詞,*,*,*,文語助動詞-ズ,已然形-一般 +287,助動詞,*,*,*,文語助動詞-ズ,已然形-補助 +288,助動詞,*,*,*,文語助動詞-ズ,意志推量形 +289,助動詞,*,*,*,文語助動詞-ズ,未然形-一般 +290,助動詞,*,*,*,文語助動詞-ズ,未然形-補助 +291,助動詞,*,*,*,文語助動詞-ズ,終止形-一般 +292,助動詞,*,*,*,文語助動詞-ズ,連体形-一般 +293,助動詞,*,*,*,文語助動詞-ズ,連体形-撥音便 +294,助動詞,*,*,*,文語助動詞-ズ,連体形-補助 +295,助動詞,*,*,*,文語助動詞-ズ,連用形-ニ +296,助動詞,*,*,*,文語助動詞-ズ,連用形-一般 +297,助動詞,*,*,*,文語助動詞-ズ,連用形-補助 +298,助動詞,*,*,*,文語助動詞-タリ-完了,命令形 +299,助動詞,*,*,*,文語助動詞-タリ-完了,已然形-一般 +300,助動詞,*,*,*,文語助動詞-タリ-完了,意志推量形 
+301,助動詞,*,*,*,文語助動詞-タリ-完了,未然形-一般 +302,助動詞,*,*,*,文語助動詞-タリ-完了,終止形-一般 +303,助動詞,*,*,*,文語助動詞-タリ-完了,連体形-一般 +304,助動詞,*,*,*,文語助動詞-タリ-完了,連体形-撥音便 +305,助動詞,*,*,*,文語助動詞-タリ-完了,連用形-一般 +306,助動詞,*,*,*,文語助動詞-タリ-断定,命令形 +307,助動詞,*,*,*,文語助動詞-タリ-断定,已然形-一般 +308,助動詞,*,*,*,文語助動詞-タリ-断定,意志推量形 +309,助動詞,*,*,*,文語助動詞-タリ-断定,未然形-一般 +310,助動詞,*,*,*,文語助動詞-タリ-断定,終止形-一般 +311,助動詞,*,*,*,文語助動詞-タリ-断定,連体形-一般 +312,助動詞,*,*,*,文語助動詞-タリ-断定,連用形-ト +313,助動詞,*,*,*,文語助動詞-タリ-断定,連用形-一般 +314,助動詞,*,*,*,文語助動詞-ツ,命令形 +315,助動詞,*,*,*,文語助動詞-ツ,已然形-一般 +316,助動詞,*,*,*,文語助動詞-ツ,未然形-一般 +317,助動詞,*,*,*,文語助動詞-ツ,終止形-一般 +318,助動詞,*,*,*,文語助動詞-ツ,連体形-一般 +319,助動詞,*,*,*,文語助動詞-ツ,連用形-一般 +320,助動詞,*,*,*,文語助動詞-ナリ-伝聞,ク語法 +321,助動詞,*,*,*,文語助動詞-ナリ-伝聞,已然形-一般 +322,助動詞,*,*,*,文語助動詞-ナリ-伝聞,意志推量形 +323,助動詞,*,*,*,文語助動詞-ナリ-伝聞,未然形-一般 +324,助動詞,*,*,*,文語助動詞-ナリ-伝聞,終止形-一般 +325,助動詞,*,*,*,文語助動詞-ナリ-伝聞,連体形-一般 +326,助動詞,*,*,*,文語助動詞-ナリ-伝聞,連用形-一般 +327,助動詞,*,*,*,文語助動詞-ナリ-断定,命令形 +328,助動詞,*,*,*,文語助動詞-ナリ-断定,已然形-一般 +329,助動詞,*,*,*,文語助動詞-ナリ-断定,意志推量形 +330,助動詞,*,*,*,文語助動詞-ナリ-断定,未然形-一般 +331,助動詞,*,*,*,文語助動詞-ナリ-断定,終止形-一般 +332,助動詞,*,*,*,文語助動詞-ナリ-断定,連体形-一般 +333,助動詞,*,*,*,文語助動詞-ナリ-断定,連体形-撥音便 +334,助動詞,*,*,*,文語助動詞-ナリ-断定,連用形-ニ +335,助動詞,*,*,*,文語助動詞-ナリ-断定,連用形-一般 +336,助動詞,*,*,*,文語助動詞-ヌ,命令形 +337,助動詞,*,*,*,文語助動詞-ヌ,已然形-一般 +338,助動詞,*,*,*,文語助動詞-ヌ,未然形-一般 +339,助動詞,*,*,*,文語助動詞-ヌ,終止形-一般 +340,助動詞,*,*,*,文語助動詞-ヌ,連体形-一般 +341,助動詞,*,*,*,文語助動詞-ヌ,連用形-一般 +342,助動詞,*,*,*,文語助動詞-ベシ,已然形-一般 +343,助動詞,*,*,*,文語助動詞-ベシ,意志推量形 +344,助動詞,*,*,*,文語助動詞-ベシ,未然形-一般 +345,助動詞,*,*,*,文語助動詞-ベシ,未然形-補助 +346,助動詞,*,*,*,文語助動詞-ベシ,終止形-一般 +347,助動詞,*,*,*,文語助動詞-ベシ,連体形-イ音便 +348,助動詞,*,*,*,文語助動詞-ベシ,連体形-一般 +349,助動詞,*,*,*,文語助動詞-ベシ,連体形-撥音便 +350,助動詞,*,*,*,文語助動詞-ベシ,連体形-補助 +351,助動詞,*,*,*,文語助動詞-ベシ,連用形-ウ音便 +352,助動詞,*,*,*,文語助動詞-ベシ,連用形-一般 +353,助動詞,*,*,*,文語助動詞-ベシ,連用形-補助 +354,助動詞,*,*,*,文語助動詞-マジ,已然形-一般 +355,助動詞,*,*,*,文語助動詞-マジ,意志推量形 +356,助動詞,*,*,*,文語助動詞-マジ,未然形-一般 +357,助動詞,*,*,*,文語助動詞-マジ,未然形-補助 +358,助動詞,*,*,*,文語助動詞-マジ,終止形-一般 +359,助動詞,*,*,*,文語助動詞-マジ,連体形-一般 +360,助動詞,*,*,*,文語助動詞-マジ,連体形-撥音便 +361,助動詞,*,*,*,文語助動詞-マジ,連体形-補助 +362,助動詞,*,*,*,文語助動詞-マジ,連用形-ウ音便 
+363,助動詞,*,*,*,文語助動詞-マジ,連用形-一般 +364,助動詞,*,*,*,文語助動詞-マジ,連用形-補助 +365,助動詞,*,*,*,文語助動詞-ム,ク語法 +366,助動詞,*,*,*,文語助動詞-ム,已然形-一般 +367,助動詞,*,*,*,文語助動詞-ム,未然形-一般 +368,助動詞,*,*,*,文語助動詞-ム,終止形-一般 +369,助動詞,*,*,*,文語助動詞-ム,終止形-撥音便 +370,助動詞,*,*,*,文語助動詞-ム,連体形-一般 +371,助動詞,*,*,*,文語助動詞-ム,連体形-撥音便 +372,助動詞,*,*,*,文語助動詞-ラシ,意志推量形 +373,助動詞,*,*,*,文語助動詞-ラシ,未然形-補助 +374,助動詞,*,*,*,文語助動詞-ラシ,終止形-一般 +375,助動詞,*,*,*,文語助動詞-ラシ,連体形-一般 +376,助動詞,*,*,*,文語助動詞-ラシ,連体形-補助 +377,助動詞,*,*,*,文語助動詞-ラシ,連用形-ウ音便 +378,助動詞,*,*,*,文語助動詞-ラシ,連用形-一般 +379,助動詞,*,*,*,文語助動詞-ラシ,連用形-補助 +380,助動詞,*,*,*,文語助動詞-ラム,已然形-一般 +381,助動詞,*,*,*,文語助動詞-ラム,終止形-一般 +382,助動詞,*,*,*,文語助動詞-ラム,終止形-撥音便 +383,助動詞,*,*,*,文語助動詞-ラム,連体形-一般 +384,助動詞,*,*,*,文語助動詞-ラム,連体形-撥音便 +385,助動詞,*,*,*,文語助動詞-リ,ク語法 +386,助動詞,*,*,*,文語助動詞-リ,命令形 +387,助動詞,*,*,*,文語助動詞-リ,已然形-一般 +388,助動詞,*,*,*,文語助動詞-リ,未然形-一般 +389,助動詞,*,*,*,文語助動詞-リ,終止形-一般 +390,助動詞,*,*,*,文語助動詞-リ,連体形-一般 +391,助動詞,*,*,*,文語助動詞-リ,連用形-一般 +392,助動詞,*,*,*,文語助動詞-ンス,命令形 +393,助動詞,*,*,*,文語助動詞-ンス,已然形-一般 +394,助動詞,*,*,*,文語助動詞-ンス,未然形-一般 +395,助動詞,*,*,*,文語助動詞-ンス,終止形-一般 +396,助動詞,*,*,*,文語助動詞-ンス,連体形-一般 +397,助動詞,*,*,*,文語助動詞-ンス,連用形-一般 +398,助動詞,*,*,*,文語形容詞-ク,命令形 +399,助動詞,*,*,*,文語形容詞-ク,已然形-一般 +400,助動詞,*,*,*,文語形容詞-ク,已然形-補助 +401,助動詞,*,*,*,文語形容詞-ク,意志推量形 +402,助動詞,*,*,*,文語形容詞-ク,未然形-一般 +403,助動詞,*,*,*,文語形容詞-ク,未然形-補助 +404,助動詞,*,*,*,文語形容詞-ク,終止形-一般 +405,助動詞,*,*,*,文語形容詞-ク,連体形-イ音便 +406,助動詞,*,*,*,文語形容詞-ク,連体形-一般 +407,助動詞,*,*,*,文語形容詞-ク,連体形-撥音便 +408,助動詞,*,*,*,文語形容詞-ク,連体形-補助 +409,助動詞,*,*,*,文語形容詞-ク,連用形-ウ音便 +410,助動詞,*,*,*,文語形容詞-ク,連用形-一般 +411,助動詞,*,*,*,文語形容詞-ク,連用形-補助 +412,助動詞,*,*,*,無変化型,終止形-一般 +413,助詞,係助詞,*,*,*,* +414,助詞,副助詞,*,*,*,* +415,助詞,接続助詞,*,*,*,* +416,助詞,格助詞,*,*,*,* +417,助詞,準体助詞,*,*,*,* +418,助詞,終助詞,*,*,*,* +419,動詞,一般,*,*,カ行変格,仮定形-一般 +420,動詞,一般,*,*,カ行変格,仮定形-融合 +421,動詞,一般,*,*,カ行変格,命令形 +422,動詞,一般,*,*,カ行変格,意志推量形 +423,動詞,一般,*,*,カ行変格,未然形-一般 +424,動詞,一般,*,*,カ行変格,終止形-一般 +425,動詞,一般,*,*,カ行変格,終止形-撥音便 +426,動詞,一般,*,*,カ行変格,連体形-一般 +427,動詞,一般,*,*,カ行変格,連体形-撥音便 +428,動詞,一般,*,*,カ行変格,連体形-省略 +429,動詞,一般,*,*,カ行変格,連用形-一般 +430,動詞,一般,*,*,サ行変格,仮定形-一般 +431,動詞,一般,*,*,サ行変格,仮定形-融合 
+432,動詞,一般,*,*,サ行変格,命令形 +433,動詞,一般,*,*,サ行変格,意志推量形 +434,動詞,一般,*,*,サ行変格,未然形-セ +435,動詞,一般,*,*,サ行変格,未然形-一般 +436,動詞,一般,*,*,サ行変格,終止形-一般 +437,動詞,一般,*,*,サ行変格,終止形-撥音便 +438,動詞,一般,*,*,サ行変格,連体形-一般 +439,動詞,一般,*,*,サ行変格,連体形-撥音便 +440,動詞,一般,*,*,サ行変格,連用形-一般 +441,動詞,一般,*,*,上一段-ア行,仮定形-一般 +442,動詞,一般,*,*,上一段-ア行,仮定形-融合 +443,動詞,一般,*,*,上一段-ア行,命令形 +444,動詞,一般,*,*,上一段-ア行,意志推量形 +445,動詞,一般,*,*,上一段-ア行,未然形-一般 +446,動詞,一般,*,*,上一段-ア行,終止形-一般 +447,動詞,一般,*,*,上一段-ア行,終止形-撥音便 +448,動詞,一般,*,*,上一段-ア行,連体形-一般 +449,動詞,一般,*,*,上一段-ア行,連体形-撥音便 +450,動詞,一般,*,*,上一段-ア行,連用形-一般 +451,動詞,一般,*,*,上一段-カ行,仮定形-一般 +452,動詞,一般,*,*,上一段-カ行,仮定形-融合 +453,動詞,一般,*,*,上一段-カ行,命令形 +454,動詞,一般,*,*,上一段-カ行,意志推量形 +455,動詞,一般,*,*,上一段-カ行,未然形-一般 +456,動詞,一般,*,*,上一段-カ行,終止形-一般 +457,動詞,一般,*,*,上一段-カ行,終止形-撥音便 +458,動詞,一般,*,*,上一段-カ行,連体形-一般 +459,動詞,一般,*,*,上一段-カ行,連体形-撥音便 +460,動詞,一般,*,*,上一段-カ行,連体形-省略 +461,動詞,一般,*,*,上一段-カ行,連用形-一般 +462,動詞,一般,*,*,上一段-ガ行,仮定形-一般 +463,動詞,一般,*,*,上一段-ガ行,仮定形-融合 +464,動詞,一般,*,*,上一段-ガ行,命令形 +465,動詞,一般,*,*,上一段-ガ行,意志推量形 +466,動詞,一般,*,*,上一段-ガ行,未然形-一般 +467,動詞,一般,*,*,上一段-ガ行,終止形-一般 +468,動詞,一般,*,*,上一段-ガ行,終止形-撥音便 +469,動詞,一般,*,*,上一段-ガ行,連体形-一般 +470,動詞,一般,*,*,上一段-ガ行,連体形-撥音便 +471,動詞,一般,*,*,上一段-ガ行,連用形-一般 +472,動詞,一般,*,*,上一段-ザ行,仮定形-一般 +473,動詞,一般,*,*,上一段-ザ行,仮定形-融合 +474,動詞,一般,*,*,上一段-ザ行,命令形 +475,動詞,一般,*,*,上一段-ザ行,意志推量形 +476,動詞,一般,*,*,上一段-ザ行,未然形-一般 +477,動詞,一般,*,*,上一段-ザ行,終止形-一般 +478,動詞,一般,*,*,上一段-ザ行,終止形-撥音便 +479,動詞,一般,*,*,上一段-ザ行,連体形-一般 +480,動詞,一般,*,*,上一段-ザ行,連体形-撥音便 +481,動詞,一般,*,*,上一段-ザ行,連体形-省略 +482,動詞,一般,*,*,上一段-ザ行,連用形-一般 +483,動詞,一般,*,*,上一段-タ行,仮定形-一般 +484,動詞,一般,*,*,上一段-タ行,仮定形-融合 +485,動詞,一般,*,*,上一段-タ行,命令形 +486,動詞,一般,*,*,上一段-タ行,意志推量形 +487,動詞,一般,*,*,上一段-タ行,未然形-一般 +488,動詞,一般,*,*,上一段-タ行,終止形-一般 +489,動詞,一般,*,*,上一段-タ行,終止形-撥音便 +490,動詞,一般,*,*,上一段-タ行,連体形-一般 +491,動詞,一般,*,*,上一段-タ行,連体形-撥音便 +492,動詞,一般,*,*,上一段-タ行,連用形-一般 +493,動詞,一般,*,*,上一段-タ行,連用形-促音便 +494,動詞,一般,*,*,上一段-ナ行,仮定形-一般 +495,動詞,一般,*,*,上一段-ナ行,仮定形-融合 +496,動詞,一般,*,*,上一段-ナ行,命令形 +497,動詞,一般,*,*,上一段-ナ行,意志推量形 +498,動詞,一般,*,*,上一段-ナ行,未然形-一般 +499,動詞,一般,*,*,上一段-ナ行,終止形-一般 +500,動詞,一般,*,*,上一段-ナ行,終止形-撥音便 +501,動詞,一般,*,*,上一段-ナ行,連体形-一般 
+502,動詞,一般,*,*,上一段-ナ行,連体形-撥音便 +503,動詞,一般,*,*,上一段-ナ行,連用形-一般 +504,動詞,一般,*,*,上一段-ハ行,仮定形-一般 +505,動詞,一般,*,*,上一段-ハ行,仮定形-融合 +506,動詞,一般,*,*,上一段-ハ行,命令形 +507,動詞,一般,*,*,上一段-ハ行,意志推量形 +508,動詞,一般,*,*,上一段-ハ行,未然形-一般 +509,動詞,一般,*,*,上一段-ハ行,終止形-一般 +510,動詞,一般,*,*,上一段-ハ行,終止形-撥音便 +511,動詞,一般,*,*,上一段-ハ行,連体形-一般 +512,動詞,一般,*,*,上一段-ハ行,連体形-撥音便 +513,動詞,一般,*,*,上一段-ハ行,連用形-一般 +514,動詞,一般,*,*,上一段-バ行,仮定形-一般 +515,動詞,一般,*,*,上一段-バ行,仮定形-融合 +516,動詞,一般,*,*,上一段-バ行,命令形 +517,動詞,一般,*,*,上一段-バ行,意志推量形 +518,動詞,一般,*,*,上一段-バ行,未然形-一般 +519,動詞,一般,*,*,上一段-バ行,終止形-一般 +520,動詞,一般,*,*,上一段-バ行,終止形-撥音便 +521,動詞,一般,*,*,上一段-バ行,連体形-一般 +522,動詞,一般,*,*,上一段-バ行,連体形-撥音便 +523,動詞,一般,*,*,上一段-バ行,連用形-一般 +524,動詞,一般,*,*,上一段-バ行,連用形-撥音便 +525,動詞,一般,*,*,上一段-マ行,仮定形-一般 +526,動詞,一般,*,*,上一段-マ行,仮定形-融合 +527,動詞,一般,*,*,上一段-マ行,命令形 +528,動詞,一般,*,*,上一段-マ行,意志推量形 +529,動詞,一般,*,*,上一段-マ行,未然形-一般 +530,動詞,一般,*,*,上一段-マ行,終止形-一般 +531,動詞,一般,*,*,上一段-マ行,終止形-撥音便 +532,動詞,一般,*,*,上一段-マ行,連体形-一般 +533,動詞,一般,*,*,上一段-マ行,連体形-撥音便 +534,動詞,一般,*,*,上一段-マ行,連用形-一般 +535,動詞,一般,*,*,上一段-ラ行,仮定形-一般 +536,動詞,一般,*,*,上一段-ラ行,仮定形-融合 +537,動詞,一般,*,*,上一段-ラ行,命令形 +538,動詞,一般,*,*,上一段-ラ行,意志推量形 +539,動詞,一般,*,*,上一段-ラ行,未然形-一般 +540,動詞,一般,*,*,上一段-ラ行,未然形-撥音便 +541,動詞,一般,*,*,上一段-ラ行,終止形-一般 +542,動詞,一般,*,*,上一段-ラ行,終止形-撥音便 +543,動詞,一般,*,*,上一段-ラ行,連体形-一般 +544,動詞,一般,*,*,上一段-ラ行,連体形-撥音便 +545,動詞,一般,*,*,上一段-ラ行,連用形-一般 +546,動詞,一般,*,*,下一段-ア行,仮定形-一般 +547,動詞,一般,*,*,下一段-ア行,仮定形-融合 +548,動詞,一般,*,*,下一段-ア行,命令形 +549,動詞,一般,*,*,下一段-ア行,意志推量形 +550,動詞,一般,*,*,下一段-ア行,未然形-一般 +551,動詞,一般,*,*,下一段-ア行,終止形-一般 +552,動詞,一般,*,*,下一段-ア行,終止形-促音便 +553,動詞,一般,*,*,下一段-ア行,終止形-撥音便 +554,動詞,一般,*,*,下一段-ア行,連体形-一般 +555,動詞,一般,*,*,下一段-ア行,連体形-撥音便 +556,動詞,一般,*,*,下一段-ア行,連体形-省略 +557,動詞,一般,*,*,下一段-ア行,連用形-一般 +558,動詞,一般,*,*,下一段-ア行,連用形-一般+送り仮名省略 +559,動詞,一般,*,*,下一段-カ行,仮定形-一般 +560,動詞,一般,*,*,下一段-カ行,仮定形-融合 +561,動詞,一般,*,*,下一段-カ行,命令形 +562,動詞,一般,*,*,下一段-カ行,意志推量形 +563,動詞,一般,*,*,下一段-カ行,未然形-一般 +564,動詞,一般,*,*,下一段-カ行,終止形-一般 +565,動詞,一般,*,*,下一段-カ行,終止形-撥音便 +566,動詞,一般,*,*,下一段-カ行,連体形-一般 +567,動詞,一般,*,*,下一段-カ行,連体形-撥音便 +568,動詞,一般,*,*,下一段-カ行,連用形-一般 +569,動詞,一般,*,*,下一段-ガ行,仮定形-一般 +570,動詞,一般,*,*,下一段-ガ行,仮定形-融合 
+571,動詞,一般,*,*,下一段-ガ行,命令形 +572,動詞,一般,*,*,下一段-ガ行,意志推量形 +573,動詞,一般,*,*,下一段-ガ行,未然形-一般 +574,動詞,一般,*,*,下一段-ガ行,終止形-一般 +575,動詞,一般,*,*,下一段-ガ行,終止形-撥音便 +576,動詞,一般,*,*,下一段-ガ行,連体形-一般 +577,動詞,一般,*,*,下一段-ガ行,連体形-撥音便 +578,動詞,一般,*,*,下一段-ガ行,連用形-一般 +579,動詞,一般,*,*,下一段-サ行,仮定形-一般 +580,動詞,一般,*,*,下一段-サ行,仮定形-融合 +581,動詞,一般,*,*,下一段-サ行,命令形 +582,動詞,一般,*,*,下一段-サ行,意志推量形 +583,動詞,一般,*,*,下一段-サ行,未然形-一般 +584,動詞,一般,*,*,下一段-サ行,終止形-一般 +585,動詞,一般,*,*,下一段-サ行,終止形-撥音便 +586,動詞,一般,*,*,下一段-サ行,連体形-一般 +587,動詞,一般,*,*,下一段-サ行,連体形-撥音便 +588,動詞,一般,*,*,下一段-サ行,連体形-省略 +589,動詞,一般,*,*,下一段-サ行,連用形-一般 +590,動詞,一般,*,*,下一段-ザ行,仮定形-一般 +591,動詞,一般,*,*,下一段-ザ行,仮定形-融合 +592,動詞,一般,*,*,下一段-ザ行,命令形 +593,動詞,一般,*,*,下一段-ザ行,意志推量形 +594,動詞,一般,*,*,下一段-ザ行,未然形-一般 +595,動詞,一般,*,*,下一段-ザ行,終止形-一般 +596,動詞,一般,*,*,下一段-ザ行,終止形-撥音便 +597,動詞,一般,*,*,下一段-ザ行,連体形-一般 +598,動詞,一般,*,*,下一段-ザ行,連体形-撥音便 +599,動詞,一般,*,*,下一段-ザ行,連用形-一般 +600,動詞,一般,*,*,下一段-タ行,仮定形-一般 +601,動詞,一般,*,*,下一段-タ行,仮定形-融合 +602,動詞,一般,*,*,下一段-タ行,命令形 +603,動詞,一般,*,*,下一段-タ行,意志推量形 +604,動詞,一般,*,*,下一段-タ行,未然形-一般 +605,動詞,一般,*,*,下一段-タ行,終止形-一般 +606,動詞,一般,*,*,下一段-タ行,終止形-撥音便 +607,動詞,一般,*,*,下一段-タ行,連体形-一般 +608,動詞,一般,*,*,下一段-タ行,連体形-撥音便 +609,動詞,一般,*,*,下一段-タ行,連用形-一般 +610,動詞,一般,*,*,下一段-ダ行,仮定形-一般 +611,動詞,一般,*,*,下一段-ダ行,仮定形-融合 +612,動詞,一般,*,*,下一段-ダ行,命令形 +613,動詞,一般,*,*,下一段-ダ行,意志推量形 +614,動詞,一般,*,*,下一段-ダ行,未然形-一般 +615,動詞,一般,*,*,下一段-ダ行,終止形-一般 +616,動詞,一般,*,*,下一段-ダ行,終止形-撥音便 +617,動詞,一般,*,*,下一段-ダ行,連体形-一般 +618,動詞,一般,*,*,下一段-ダ行,連体形-撥音便 +619,動詞,一般,*,*,下一段-ダ行,連用形-一般 +620,動詞,一般,*,*,下一段-ナ行,仮定形-一般 +621,動詞,一般,*,*,下一段-ナ行,仮定形-融合 +622,動詞,一般,*,*,下一段-ナ行,命令形 +623,動詞,一般,*,*,下一段-ナ行,意志推量形 +624,動詞,一般,*,*,下一段-ナ行,未然形-一般 +625,動詞,一般,*,*,下一段-ナ行,終止形-一般 +626,動詞,一般,*,*,下一段-ナ行,終止形-撥音便 +627,動詞,一般,*,*,下一段-ナ行,連体形-一般 +628,動詞,一般,*,*,下一段-ナ行,連体形-撥音便 +629,動詞,一般,*,*,下一段-ナ行,連用形-一般 +630,動詞,一般,*,*,下一段-ハ行,仮定形-一般 +631,動詞,一般,*,*,下一段-ハ行,仮定形-融合 +632,動詞,一般,*,*,下一段-ハ行,命令形 +633,動詞,一般,*,*,下一段-ハ行,意志推量形 +634,動詞,一般,*,*,下一段-ハ行,未然形-一般 +635,動詞,一般,*,*,下一段-ハ行,終止形-一般 +636,動詞,一般,*,*,下一段-ハ行,終止形-撥音便 +637,動詞,一般,*,*,下一段-ハ行,連体形-一般 +638,動詞,一般,*,*,下一段-ハ行,連体形-撥音便 +639,動詞,一般,*,*,下一段-ハ行,連用形-一般 
+640,動詞,一般,*,*,下一段-バ行,仮定形-一般 +641,動詞,一般,*,*,下一段-バ行,仮定形-融合 +642,動詞,一般,*,*,下一段-バ行,命令形 +643,動詞,一般,*,*,下一段-バ行,意志推量形 +644,動詞,一般,*,*,下一段-バ行,未然形-一般 +645,動詞,一般,*,*,下一段-バ行,終止形-一般 +646,動詞,一般,*,*,下一段-バ行,終止形-撥音便 +647,動詞,一般,*,*,下一段-バ行,連体形-一般 +648,動詞,一般,*,*,下一段-バ行,連体形-撥音便 +649,動詞,一般,*,*,下一段-バ行,連用形-一般 +650,動詞,一般,*,*,下一段-マ行,仮定形-一般 +651,動詞,一般,*,*,下一段-マ行,仮定形-融合 +652,動詞,一般,*,*,下一段-マ行,命令形 +653,動詞,一般,*,*,下一段-マ行,意志推量形 +654,動詞,一般,*,*,下一段-マ行,未然形-一般 +655,動詞,一般,*,*,下一段-マ行,終止形-一般 +656,動詞,一般,*,*,下一段-マ行,終止形-撥音便 +657,動詞,一般,*,*,下一段-マ行,連体形-一般 +658,動詞,一般,*,*,下一段-マ行,連体形-撥音便 +659,動詞,一般,*,*,下一段-マ行,連用形-一般 +660,動詞,一般,*,*,下一段-ラ行,仮定形-一般 +661,動詞,一般,*,*,下一段-ラ行,仮定形-融合 +662,動詞,一般,*,*,下一段-ラ行,命令形 +663,動詞,一般,*,*,下一段-ラ行,意志推量形 +664,動詞,一般,*,*,下一段-ラ行,未然形-一般 +665,動詞,一般,*,*,下一段-ラ行,未然形-撥音便 +666,動詞,一般,*,*,下一段-ラ行,終止形-一般 +667,動詞,一般,*,*,下一段-ラ行,終止形-撥音便 +668,動詞,一般,*,*,下一段-ラ行,連体形-一般 +669,動詞,一般,*,*,下一段-ラ行,連体形-撥音便 +670,動詞,一般,*,*,下一段-ラ行,連体形-省略 +671,動詞,一般,*,*,下一段-ラ行,連用形-一般 +672,動詞,一般,*,*,五段-カ行,仮定形-一般 +673,動詞,一般,*,*,五段-カ行,仮定形-融合 +674,動詞,一般,*,*,五段-カ行,命令形 +675,動詞,一般,*,*,五段-カ行,意志推量形 +676,動詞,一般,*,*,五段-カ行,未然形-一般 +677,動詞,一般,*,*,五段-カ行,終止形-一般 +678,動詞,一般,*,*,五段-カ行,連体形-一般 +679,動詞,一般,*,*,五段-カ行,連用形-イ音便 +680,動詞,一般,*,*,五段-カ行,連用形-一般 +681,動詞,一般,*,*,五段-カ行,連用形-促音便 +682,動詞,一般,*,*,五段-カ行,連用形-融合 +683,動詞,一般,*,*,五段-ガ行,仮定形-一般 +684,動詞,一般,*,*,五段-ガ行,仮定形-融合 +685,動詞,一般,*,*,五段-ガ行,命令形 +686,動詞,一般,*,*,五段-ガ行,意志推量形 +687,動詞,一般,*,*,五段-ガ行,未然形-一般 +688,動詞,一般,*,*,五段-ガ行,終止形-一般 +689,動詞,一般,*,*,五段-ガ行,連体形-一般 +690,動詞,一般,*,*,五段-ガ行,連用形-イ音便 +691,動詞,一般,*,*,五段-ガ行,連用形-一般 +692,動詞,一般,*,*,五段-ガ行,連用形-融合 +693,動詞,一般,*,*,五段-サ行,仮定形-一般 +694,動詞,一般,*,*,五段-サ行,仮定形-融合 +695,動詞,一般,*,*,五段-サ行,命令形 +696,動詞,一般,*,*,五段-サ行,意志推量形 +697,動詞,一般,*,*,五段-サ行,未然形-一般 +698,動詞,一般,*,*,五段-サ行,終止形-一般 +699,動詞,一般,*,*,五段-サ行,連体形-一般 +700,動詞,一般,*,*,五段-サ行,連用形-一般 +701,動詞,一般,*,*,五段-サ行,連用形-融合 +702,動詞,一般,*,*,五段-タ行,仮定形-一般 +703,動詞,一般,*,*,五段-タ行,仮定形-融合 +704,動詞,一般,*,*,五段-タ行,命令形 +705,動詞,一般,*,*,五段-タ行,意志推量形 +706,動詞,一般,*,*,五段-タ行,未然形-一般 +707,動詞,一般,*,*,五段-タ行,終止形-一般 +708,動詞,一般,*,*,五段-タ行,連体形-一般 +709,動詞,一般,*,*,五段-タ行,連用形-一般 
+710,動詞,一般,*,*,五段-タ行,連用形-促音便 +711,動詞,一般,*,*,五段-タ行,連用形-融合 +712,動詞,一般,*,*,五段-ナ行,仮定形-一般 +713,動詞,一般,*,*,五段-ナ行,仮定形-融合 +714,動詞,一般,*,*,五段-ナ行,命令形 +715,動詞,一般,*,*,五段-ナ行,意志推量形 +716,動詞,一般,*,*,五段-ナ行,未然形-一般 +717,動詞,一般,*,*,五段-ナ行,終止形-一般 +718,動詞,一般,*,*,五段-ナ行,連体形-一般 +719,動詞,一般,*,*,五段-ナ行,連用形-一般 +720,動詞,一般,*,*,五段-ナ行,連用形-撥音便 +721,動詞,一般,*,*,五段-ナ行,連用形-融合 +722,動詞,一般,*,*,五段-バ行,仮定形-一般 +723,動詞,一般,*,*,五段-バ行,仮定形-融合 +724,動詞,一般,*,*,五段-バ行,命令形 +725,動詞,一般,*,*,五段-バ行,意志推量形 +726,動詞,一般,*,*,五段-バ行,未然形-一般 +727,動詞,一般,*,*,五段-バ行,終止形-一般 +728,動詞,一般,*,*,五段-バ行,連体形-一般 +729,動詞,一般,*,*,五段-バ行,連用形-一般 +730,動詞,一般,*,*,五段-バ行,連用形-撥音便 +731,動詞,一般,*,*,五段-バ行,連用形-融合 +732,動詞,一般,*,*,五段-マ行,仮定形-一般 +733,動詞,一般,*,*,五段-マ行,仮定形-融合 +734,動詞,一般,*,*,五段-マ行,命令形 +735,動詞,一般,*,*,五段-マ行,意志推量形 +736,動詞,一般,*,*,五段-マ行,未然形-一般 +737,動詞,一般,*,*,五段-マ行,終止形-一般 +738,動詞,一般,*,*,五段-マ行,連体形-一般 +739,動詞,一般,*,*,五段-マ行,連用形-イ音便 +740,動詞,一般,*,*,五段-マ行,連用形-一般 +741,動詞,一般,*,*,五段-マ行,連用形-撥音便 +742,動詞,一般,*,*,五段-マ行,連用形-融合 +743,動詞,一般,*,*,五段-ラ行,仮定形-一般 +744,動詞,一般,*,*,五段-ラ行,仮定形-融合 +745,動詞,一般,*,*,五段-ラ行,命令形 +746,動詞,一般,*,*,五段-ラ行,意志推量形 +747,動詞,一般,*,*,五段-ラ行,未然形-一般 +748,動詞,一般,*,*,五段-ラ行,未然形-撥音便 +749,動詞,一般,*,*,五段-ラ行,終止形-一般 +750,動詞,一般,*,*,五段-ラ行,終止形-撥音便 +751,動詞,一般,*,*,五段-ラ行,連体形-一般 +752,動詞,一般,*,*,五段-ラ行,連体形-一般+送り仮名省略 +753,動詞,一般,*,*,五段-ラ行,連体形-撥音便 +754,動詞,一般,*,*,五段-ラ行,連用形-イ音便 +755,動詞,一般,*,*,五段-ラ行,連用形-一般 +756,動詞,一般,*,*,五段-ラ行,連用形-促音便 +757,動詞,一般,*,*,五段-ラ行,連用形-撥音便 +758,動詞,一般,*,*,五段-ラ行,連用形-融合 +759,動詞,一般,*,*,五段-ワア行,仮定形-一般 +760,動詞,一般,*,*,五段-ワア行,仮定形-融合 +761,動詞,一般,*,*,五段-ワア行,命令形 +762,動詞,一般,*,*,五段-ワア行,意志推量形 +763,動詞,一般,*,*,五段-ワア行,未然形-一般 +764,動詞,一般,*,*,五段-ワア行,終止形-一般 +765,動詞,一般,*,*,五段-ワア行,連体形-一般 +766,動詞,一般,*,*,五段-ワア行,連用形-ウ音便 +767,動詞,一般,*,*,五段-ワア行,連用形-一般 +768,動詞,一般,*,*,五段-ワア行,連用形-促音便 +769,動詞,一般,*,*,文語サ行変格,ク語法 +770,動詞,一般,*,*,文語サ行変格,命令形 +771,動詞,一般,*,*,文語サ行変格,已然形-一般 +772,動詞,一般,*,*,文語サ行変格,未然形-一般 +773,動詞,一般,*,*,文語サ行変格,終止形-一般 +774,動詞,一般,*,*,文語サ行変格,連体形-一般 +775,動詞,一般,*,*,文語サ行変格,連用形-一般 +776,動詞,一般,*,*,文語ナ行変格,命令形 +777,動詞,一般,*,*,文語ナ行変格,已然形-一般 +778,動詞,一般,*,*,文語ナ行変格,意志推量形 +779,動詞,一般,*,*,文語ナ行変格,未然形-一般 +780,動詞,一般,*,*,文語ナ行変格,終止形-一般 
+781,動詞,一般,*,*,文語ナ行変格,連体形-一般 +782,動詞,一般,*,*,文語ナ行変格,連用形-一般 +783,動詞,一般,*,*,文語ナ行変格,連用形-撥音便 +784,動詞,一般,*,*,文語ラ行変格,命令形 +785,動詞,一般,*,*,文語ラ行変格,已然形-一般 +786,動詞,一般,*,*,文語ラ行変格,意志推量形 +787,動詞,一般,*,*,文語ラ行変格,未然形-一般 +788,動詞,一般,*,*,文語ラ行変格,終止形-一般 +789,動詞,一般,*,*,文語ラ行変格,終止形-撥音便 +790,動詞,一般,*,*,文語ラ行変格,連体形-一般 +791,動詞,一般,*,*,文語ラ行変格,連体形-撥音便 +792,動詞,一般,*,*,文語ラ行変格,連用形-一般 +793,動詞,一般,*,*,文語ラ行変格,連用形-促音便 +794,動詞,一般,*,*,文語上一段-ナ行,命令形 +795,動詞,一般,*,*,文語上一段-ナ行,已然形-一般 +796,動詞,一般,*,*,文語上一段-ナ行,未然形-一般 +797,動詞,一般,*,*,文語上一段-ナ行,終止形-一般 +798,動詞,一般,*,*,文語上一段-ナ行,連体形-一般 +799,動詞,一般,*,*,文語上一段-ナ行,連用形-一般 +800,動詞,一般,*,*,文語上一段-マ行,命令形 +801,動詞,一般,*,*,文語上一段-マ行,已然形-一般 +802,動詞,一般,*,*,文語上一段-マ行,未然形-一般 +803,動詞,一般,*,*,文語上一段-マ行,終止形-一般 +804,動詞,一般,*,*,文語上一段-マ行,連体形-一般 +805,動詞,一般,*,*,文語上一段-マ行,連用形-一般 +806,動詞,一般,*,*,文語上一段-ワ行,命令形 +807,動詞,一般,*,*,文語上一段-ワ行,已然形-一般 +808,動詞,一般,*,*,文語上一段-ワ行,未然形-一般 +809,動詞,一般,*,*,文語上一段-ワ行,終止形-一般 +810,動詞,一般,*,*,文語上一段-ワ行,連体形-一般 +811,動詞,一般,*,*,文語上一段-ワ行,連用形-一般 +812,動詞,一般,*,*,文語上二段-タ行,命令形 +813,動詞,一般,*,*,文語上二段-タ行,已然形-一般 +814,動詞,一般,*,*,文語上二段-タ行,未然形-一般 +815,動詞,一般,*,*,文語上二段-タ行,終止形-一般 +816,動詞,一般,*,*,文語上二段-タ行,連体形-一般 +817,動詞,一般,*,*,文語上二段-タ行,連用形-一般 +818,動詞,一般,*,*,文語上二段-ダ行,命令形 +819,動詞,一般,*,*,文語上二段-ダ行,已然形-一般 +820,動詞,一般,*,*,文語上二段-ダ行,未然形-一般 +821,動詞,一般,*,*,文語上二段-ダ行,終止形-一般 +822,動詞,一般,*,*,文語上二段-ダ行,連体形-一般 +823,動詞,一般,*,*,文語上二段-ダ行,連用形-一般 +824,動詞,一般,*,*,文語上二段-ハ行,命令形 +825,動詞,一般,*,*,文語上二段-ハ行,已然形-一般 +826,動詞,一般,*,*,文語上二段-ハ行,未然形-一般 +827,動詞,一般,*,*,文語上二段-ハ行,終止形-一般 +828,動詞,一般,*,*,文語上二段-ハ行,連体形-一般 +829,動詞,一般,*,*,文語上二段-ハ行,連用形-イ音便+送り仮名省略 +830,動詞,一般,*,*,文語上二段-ハ行,連用形-一般 +831,動詞,一般,*,*,文語上二段-バ行,命令形 +832,動詞,一般,*,*,文語上二段-バ行,已然形-一般 +833,動詞,一般,*,*,文語上二段-バ行,未然形-一般 +834,動詞,一般,*,*,文語上二段-バ行,終止形-一般 +835,動詞,一般,*,*,文語上二段-バ行,連体形-一般 +836,動詞,一般,*,*,文語上二段-バ行,連用形-一般 +837,動詞,一般,*,*,文語上二段-ヤ行,命令形 +838,動詞,一般,*,*,文語上二段-ヤ行,已然形-一般 +839,動詞,一般,*,*,文語上二段-ヤ行,未然形-一般 +840,動詞,一般,*,*,文語上二段-ヤ行,終止形-一般 +841,動詞,一般,*,*,文語上二段-ヤ行,連体形-一般 +842,動詞,一般,*,*,文語上二段-ヤ行,連用形-一般 +843,動詞,一般,*,*,文語上二段-ラ行,命令形 +844,動詞,一般,*,*,文語上二段-ラ行,已然形-一般 +845,動詞,一般,*,*,文語上二段-ラ行,未然形-一般 
+846,動詞,一般,*,*,文語上二段-ラ行,終止形-一般 +847,動詞,一般,*,*,文語上二段-ラ行,連体形-一般 +848,動詞,一般,*,*,文語上二段-ラ行,連用形-一般 +849,動詞,一般,*,*,文語下二段-ア行,命令形 +850,動詞,一般,*,*,文語下二段-ア行,已然形-一般 +851,動詞,一般,*,*,文語下二段-ア行,未然形-一般 +852,動詞,一般,*,*,文語下二段-ア行,終止形-一般 +853,動詞,一般,*,*,文語下二段-ア行,連体形-一般 +854,動詞,一般,*,*,文語下二段-ア行,連用形-一般 +855,動詞,一般,*,*,文語下二段-カ行,命令形 +856,動詞,一般,*,*,文語下二段-カ行,已然形-一般 +857,動詞,一般,*,*,文語下二段-カ行,未然形-一般 +858,動詞,一般,*,*,文語下二段-カ行,終止形-一般 +859,動詞,一般,*,*,文語下二段-カ行,連体形-一般 +860,動詞,一般,*,*,文語下二段-カ行,連用形-一般 +861,動詞,一般,*,*,文語下二段-ガ行,命令形 +862,動詞,一般,*,*,文語下二段-ガ行,已然形-一般 +863,動詞,一般,*,*,文語下二段-ガ行,未然形-一般 +864,動詞,一般,*,*,文語下二段-ガ行,終止形-一般 +865,動詞,一般,*,*,文語下二段-ガ行,連体形-一般 +866,動詞,一般,*,*,文語下二段-ガ行,連用形-一般 +867,動詞,一般,*,*,文語下二段-サ行,命令形 +868,動詞,一般,*,*,文語下二段-サ行,已然形-一般 +869,動詞,一般,*,*,文語下二段-サ行,未然形-一般 +870,動詞,一般,*,*,文語下二段-サ行,終止形-一般 +871,動詞,一般,*,*,文語下二段-サ行,連体形-一般 +872,動詞,一般,*,*,文語下二段-サ行,連用形-一般 +873,動詞,一般,*,*,文語下二段-ザ行,命令形 +874,動詞,一般,*,*,文語下二段-ザ行,已然形-一般 +875,動詞,一般,*,*,文語下二段-ザ行,未然形-一般 +876,動詞,一般,*,*,文語下二段-ザ行,終止形-一般 +877,動詞,一般,*,*,文語下二段-ザ行,連体形-一般 +878,動詞,一般,*,*,文語下二段-ザ行,連用形-一般 +879,動詞,一般,*,*,文語下二段-タ行,命令形 +880,動詞,一般,*,*,文語下二段-タ行,已然形-一般 +881,動詞,一般,*,*,文語下二段-タ行,未然形-一般 +882,動詞,一般,*,*,文語下二段-タ行,終止形-一般 +883,動詞,一般,*,*,文語下二段-タ行,連体形-一般 +884,動詞,一般,*,*,文語下二段-タ行,連用形-一般 +885,動詞,一般,*,*,文語下二段-ダ行,命令形 +886,動詞,一般,*,*,文語下二段-ダ行,已然形-一般 +887,動詞,一般,*,*,文語下二段-ダ行,未然形-一般 +888,動詞,一般,*,*,文語下二段-ダ行,終止形-一般 +889,動詞,一般,*,*,文語下二段-ダ行,連体形-一般 +890,動詞,一般,*,*,文語下二段-ダ行,連用形-一般 +891,動詞,一般,*,*,文語下二段-ナ行,命令形 +892,動詞,一般,*,*,文語下二段-ナ行,已然形-一般 +893,動詞,一般,*,*,文語下二段-ナ行,未然形-一般 +894,動詞,一般,*,*,文語下二段-ナ行,終止形-一般 +895,動詞,一般,*,*,文語下二段-ナ行,連体形-一般 +896,動詞,一般,*,*,文語下二段-ナ行,連用形-一般 +897,動詞,一般,*,*,文語下二段-ハ行,命令形 +898,動詞,一般,*,*,文語下二段-ハ行,已然形-一般 +899,動詞,一般,*,*,文語下二段-ハ行,未然形-一般 +900,動詞,一般,*,*,文語下二段-ハ行,終止形-一般 +901,動詞,一般,*,*,文語下二段-ハ行,連体形-一般 +902,動詞,一般,*,*,文語下二段-ハ行,連用形-一般 +903,動詞,一般,*,*,文語下二段-バ行,命令形 +904,動詞,一般,*,*,文語下二段-バ行,已然形-一般 +905,動詞,一般,*,*,文語下二段-バ行,未然形-一般 +906,動詞,一般,*,*,文語下二段-バ行,終止形-一般 +907,動詞,一般,*,*,文語下二段-バ行,連体形-一般 +908,動詞,一般,*,*,文語下二段-バ行,連用形-一般 +909,動詞,一般,*,*,文語下二段-マ行,命令形 +910,動詞,一般,*,*,文語下二段-マ行,已然形-一般 
+911,動詞,一般,*,*,文語下二段-マ行,未然形-一般 +912,動詞,一般,*,*,文語下二段-マ行,終止形-一般 +913,動詞,一般,*,*,文語下二段-マ行,連体形-一般 +914,動詞,一般,*,*,文語下二段-マ行,連用形-一般 +915,動詞,一般,*,*,文語下二段-ヤ行,命令形 +916,動詞,一般,*,*,文語下二段-ヤ行,已然形-一般 +917,動詞,一般,*,*,文語下二段-ヤ行,未然形-一般 +918,動詞,一般,*,*,文語下二段-ヤ行,終止形-一般 +919,動詞,一般,*,*,文語下二段-ヤ行,連体形-一般 +920,動詞,一般,*,*,文語下二段-ヤ行,連用形-一般 +921,動詞,一般,*,*,文語下二段-ラ行,命令形 +922,動詞,一般,*,*,文語下二段-ラ行,已然形-一般 +923,動詞,一般,*,*,文語下二段-ラ行,未然形-一般 +924,動詞,一般,*,*,文語下二段-ラ行,終止形-一般 +925,動詞,一般,*,*,文語下二段-ラ行,連体形-一般 +926,動詞,一般,*,*,文語下二段-ラ行,連用形-一般 +927,動詞,一般,*,*,文語下二段-ワ行,命令形 +928,動詞,一般,*,*,文語下二段-ワ行,已然形-一般 +929,動詞,一般,*,*,文語下二段-ワ行,未然形-一般 +930,動詞,一般,*,*,文語下二段-ワ行,終止形-一般 +931,動詞,一般,*,*,文語下二段-ワ行,連体形-一般 +932,動詞,一般,*,*,文語下二段-ワ行,連用形-一般 +933,動詞,一般,*,*,文語四段-カ行,命令形 +934,動詞,一般,*,*,文語四段-カ行,已然形-一般 +935,動詞,一般,*,*,文語四段-カ行,意志推量形 +936,動詞,一般,*,*,文語四段-カ行,未然形-一般 +937,動詞,一般,*,*,文語四段-カ行,終止形-一般 +938,動詞,一般,*,*,文語四段-カ行,連体形-一般 +939,動詞,一般,*,*,文語四段-カ行,連用形-イ音便 +940,動詞,一般,*,*,文語四段-カ行,連用形-一般 +941,動詞,一般,*,*,文語四段-ガ行,命令形 +942,動詞,一般,*,*,文語四段-ガ行,已然形-一般 +943,動詞,一般,*,*,文語四段-ガ行,意志推量形 +944,動詞,一般,*,*,文語四段-ガ行,未然形-一般 +945,動詞,一般,*,*,文語四段-ガ行,終止形-一般 +946,動詞,一般,*,*,文語四段-ガ行,連体形-一般 +947,動詞,一般,*,*,文語四段-ガ行,連用形-イ音便 +948,動詞,一般,*,*,文語四段-ガ行,連用形-一般 +949,動詞,一般,*,*,文語四段-サ行,命令形 +950,動詞,一般,*,*,文語四段-サ行,已然形-一般 +951,動詞,一般,*,*,文語四段-サ行,意志推量形 +952,動詞,一般,*,*,文語四段-サ行,未然形-一般 +953,動詞,一般,*,*,文語四段-サ行,終止形-一般 +954,動詞,一般,*,*,文語四段-サ行,連体形-一般 +955,動詞,一般,*,*,文語四段-サ行,連用形-イ音便 +956,動詞,一般,*,*,文語四段-サ行,連用形-キ接続 +957,動詞,一般,*,*,文語四段-サ行,連用形-一般 +958,動詞,一般,*,*,文語四段-タ行,命令形 +959,動詞,一般,*,*,文語四段-タ行,已然形-一般 +960,動詞,一般,*,*,文語四段-タ行,意志推量形 +961,動詞,一般,*,*,文語四段-タ行,未然形-一般 +962,動詞,一般,*,*,文語四段-タ行,終止形-一般 +963,動詞,一般,*,*,文語四段-タ行,連体形-一般 +964,動詞,一般,*,*,文語四段-タ行,連用形-一般 +965,動詞,一般,*,*,文語四段-タ行,連用形-促音便 +966,動詞,一般,*,*,文語四段-ハ行,ク語法 +967,動詞,一般,*,*,文語四段-ハ行,命令形 +968,動詞,一般,*,*,文語四段-ハ行,已然形-一般 +969,動詞,一般,*,*,文語四段-ハ行,意志推量形 +970,動詞,一般,*,*,文語四段-ハ行,未然形-一般 +971,動詞,一般,*,*,文語四段-ハ行,終止形-ウ音便 +972,動詞,一般,*,*,文語四段-ハ行,終止形-一般 +973,動詞,一般,*,*,文語四段-ハ行,連体形-ウ音便 +974,動詞,一般,*,*,文語四段-ハ行,連体形-一般 +975,動詞,一般,*,*,文語四段-ハ行,連用形-イ音便 +976,動詞,一般,*,*,文語四段-ハ行,連用形-ウ音便 
+977,動詞,一般,*,*,文語四段-ハ行,連用形-一般 +978,動詞,一般,*,*,文語四段-ハ行,連用形-促音便 +979,動詞,一般,*,*,文語四段-バ行,命令形 +980,動詞,一般,*,*,文語四段-バ行,已然形-一般 +981,動詞,一般,*,*,文語四段-バ行,意志推量形 +982,動詞,一般,*,*,文語四段-バ行,未然形-一般 +983,動詞,一般,*,*,文語四段-バ行,終止形-一般 +984,動詞,一般,*,*,文語四段-バ行,連体形-一般 +985,動詞,一般,*,*,文語四段-バ行,連用形-ウ音便 +986,動詞,一般,*,*,文語四段-バ行,連用形-一般 +987,動詞,一般,*,*,文語四段-バ行,連用形-撥音便 +988,動詞,一般,*,*,文語四段-マ行,命令形 +989,動詞,一般,*,*,文語四段-マ行,已然形-一般 +990,動詞,一般,*,*,文語四段-マ行,意志推量形 +991,動詞,一般,*,*,文語四段-マ行,未然形-一般 +992,動詞,一般,*,*,文語四段-マ行,終止形-一般 +993,動詞,一般,*,*,文語四段-マ行,連体形-一般 +994,動詞,一般,*,*,文語四段-マ行,連用形-ウ音便 +995,動詞,一般,*,*,文語四段-マ行,連用形-一般 +996,動詞,一般,*,*,文語四段-マ行,連用形-撥音便 +997,動詞,一般,*,*,文語四段-ラ行,命令形 +998,動詞,一般,*,*,文語四段-ラ行,已然形-一般 +999,動詞,一般,*,*,文語四段-ラ行,意志推量形 +1000,動詞,一般,*,*,文語四段-ラ行,未然形-一般 +1001,動詞,一般,*,*,文語四段-ラ行,終止形-一般 +1002,動詞,一般,*,*,文語四段-ラ行,連体形-一般 +1003,動詞,一般,*,*,文語四段-ラ行,連用形-一般 +1004,動詞,一般,*,*,文語四段-ラ行,連用形-促音便 +1005,動詞,非自立可能,*,*,カ行変格,仮定形-一般 +1006,動詞,非自立可能,*,*,カ行変格,仮定形-融合 +1007,動詞,非自立可能,*,*,カ行変格,命令形 +1008,動詞,非自立可能,*,*,カ行変格,意志推量形 +1009,動詞,非自立可能,*,*,カ行変格,未然形-一般 +1010,動詞,非自立可能,*,*,カ行変格,終止形-一般 +1011,動詞,非自立可能,*,*,カ行変格,終止形-撥音便 +1012,動詞,非自立可能,*,*,カ行変格,連体形-一般 +1013,動詞,非自立可能,*,*,カ行変格,連体形-撥音便 +1014,動詞,非自立可能,*,*,カ行変格,連体形-省略 +1015,動詞,非自立可能,*,*,カ行変格,連用形-一般 +1016,動詞,非自立可能,*,*,サ行変格,仮定形-一般 +1017,動詞,非自立可能,*,*,サ行変格,仮定形-融合 +1018,動詞,非自立可能,*,*,サ行変格,命令形 +1019,動詞,非自立可能,*,*,サ行変格,意志推量形 +1020,動詞,非自立可能,*,*,サ行変格,未然形-サ +1021,動詞,非自立可能,*,*,サ行変格,未然形-セ +1022,動詞,非自立可能,*,*,サ行変格,未然形-一般 +1023,動詞,非自立可能,*,*,サ行変格,終止形-一般 +1024,動詞,非自立可能,*,*,サ行変格,終止形-撥音便 +1025,動詞,非自立可能,*,*,サ行変格,連体形-一般 +1026,動詞,非自立可能,*,*,サ行変格,連体形-撥音便 +1027,動詞,非自立可能,*,*,サ行変格,連体形-省略 +1028,動詞,非自立可能,*,*,サ行変格,連用形-一般 +1029,動詞,非自立可能,*,*,上一段-ア行,仮定形-一般 +1030,動詞,非自立可能,*,*,上一段-ア行,仮定形-融合 +1031,動詞,非自立可能,*,*,上一段-ア行,命令形 +1032,動詞,非自立可能,*,*,上一段-ア行,意志推量形 +1033,動詞,非自立可能,*,*,上一段-ア行,未然形-一般 +1034,動詞,非自立可能,*,*,上一段-ア行,終止形-一般 +1035,動詞,非自立可能,*,*,上一段-ア行,終止形-撥音便 +1036,動詞,非自立可能,*,*,上一段-ア行,連体形-一般 +1037,動詞,非自立可能,*,*,上一段-ア行,連体形-撥音便 +1038,動詞,非自立可能,*,*,上一段-ア行,連体形-省略 +1039,動詞,非自立可能,*,*,上一段-ア行,連用形-一般 +1040,動詞,非自立可能,*,*,上一段-カ行,仮定形-一般 
+1041,動詞,非自立可能,*,*,上一段-カ行,仮定形-融合 +1042,動詞,非自立可能,*,*,上一段-カ行,命令形 +1043,動詞,非自立可能,*,*,上一段-カ行,意志推量形 +1044,動詞,非自立可能,*,*,上一段-カ行,未然形-一般 +1045,動詞,非自立可能,*,*,上一段-カ行,終止形-一般 +1046,動詞,非自立可能,*,*,上一段-カ行,終止形-撥音便 +1047,動詞,非自立可能,*,*,上一段-カ行,連体形-一般 +1048,動詞,非自立可能,*,*,上一段-カ行,連体形-撥音便 +1049,動詞,非自立可能,*,*,上一段-カ行,連用形-一般 +1050,動詞,非自立可能,*,*,上一段-ガ行,仮定形-一般 +1051,動詞,非自立可能,*,*,上一段-ガ行,仮定形-融合 +1052,動詞,非自立可能,*,*,上一段-ガ行,命令形 +1053,動詞,非自立可能,*,*,上一段-ガ行,意志推量形 +1054,動詞,非自立可能,*,*,上一段-ガ行,未然形-一般 +1055,動詞,非自立可能,*,*,上一段-ガ行,終止形-一般 +1056,動詞,非自立可能,*,*,上一段-ガ行,終止形-撥音便 +1057,動詞,非自立可能,*,*,上一段-ガ行,連体形-一般 +1058,動詞,非自立可能,*,*,上一段-ガ行,連体形-撥音便 +1059,動詞,非自立可能,*,*,上一段-ガ行,連用形-一般 +1060,動詞,非自立可能,*,*,上一段-マ行,仮定形-一般 +1061,動詞,非自立可能,*,*,上一段-マ行,仮定形-融合 +1062,動詞,非自立可能,*,*,上一段-マ行,命令形 +1063,動詞,非自立可能,*,*,上一段-マ行,意志推量形 +1064,動詞,非自立可能,*,*,上一段-マ行,未然形-一般 +1065,動詞,非自立可能,*,*,上一段-マ行,終止形-一般 +1066,動詞,非自立可能,*,*,上一段-マ行,終止形-撥音便 +1067,動詞,非自立可能,*,*,上一段-マ行,連体形-一般 +1068,動詞,非自立可能,*,*,上一段-マ行,連体形-撥音便 +1069,動詞,非自立可能,*,*,上一段-マ行,連用形-一般 +1070,動詞,非自立可能,*,*,下一段-ア行,仮定形-一般 +1071,動詞,非自立可能,*,*,下一段-ア行,仮定形-融合 +1072,動詞,非自立可能,*,*,下一段-ア行,命令形 +1073,動詞,非自立可能,*,*,下一段-ア行,意志推量形 +1074,動詞,非自立可能,*,*,下一段-ア行,未然形-一般 +1075,動詞,非自立可能,*,*,下一段-ア行,終止形-一般 +1076,動詞,非自立可能,*,*,下一段-ア行,終止形-撥音便 +1077,動詞,非自立可能,*,*,下一段-ア行,連体形-一般 +1078,動詞,非自立可能,*,*,下一段-ア行,連体形-撥音便 +1079,動詞,非自立可能,*,*,下一段-ア行,連用形-一般 +1080,動詞,非自立可能,*,*,下一段-カ行,仮定形-一般 +1081,動詞,非自立可能,*,*,下一段-カ行,仮定形-融合 +1082,動詞,非自立可能,*,*,下一段-カ行,命令形 +1083,動詞,非自立可能,*,*,下一段-カ行,意志推量形 +1084,動詞,非自立可能,*,*,下一段-カ行,未然形-一般 +1085,動詞,非自立可能,*,*,下一段-カ行,終止形-一般 +1086,動詞,非自立可能,*,*,下一段-カ行,終止形-促音便 +1087,動詞,非自立可能,*,*,下一段-カ行,終止形-撥音便 +1088,動詞,非自立可能,*,*,下一段-カ行,連体形-一般 +1089,動詞,非自立可能,*,*,下一段-カ行,連体形-撥音便 +1090,動詞,非自立可能,*,*,下一段-カ行,連用形-一般 +1091,動詞,非自立可能,*,*,下一段-ガ行,仮定形-一般 +1092,動詞,非自立可能,*,*,下一段-ガ行,仮定形-融合 +1093,動詞,非自立可能,*,*,下一段-ガ行,命令形 +1094,動詞,非自立可能,*,*,下一段-ガ行,意志推量形 +1095,動詞,非自立可能,*,*,下一段-ガ行,未然形-一般 +1096,動詞,非自立可能,*,*,下一段-ガ行,終止形-一般 +1097,動詞,非自立可能,*,*,下一段-ガ行,終止形-撥音便 +1098,動詞,非自立可能,*,*,下一段-ガ行,連体形-一般 +1099,動詞,非自立可能,*,*,下一段-ガ行,連体形-撥音便 +1100,動詞,非自立可能,*,*,下一段-ガ行,連用形-一般 
+1101,動詞,非自立可能,*,*,下一段-サ行,仮定形-一般 +1102,動詞,非自立可能,*,*,下一段-サ行,仮定形-融合 +1103,動詞,非自立可能,*,*,下一段-サ行,命令形 +1104,動詞,非自立可能,*,*,下一段-サ行,意志推量形 +1105,動詞,非自立可能,*,*,下一段-サ行,未然形-一般 +1106,動詞,非自立可能,*,*,下一段-サ行,終止形-一般 +1107,動詞,非自立可能,*,*,下一段-サ行,終止形-撥音便 +1108,動詞,非自立可能,*,*,下一段-サ行,連体形-一般 +1109,動詞,非自立可能,*,*,下一段-サ行,連体形-撥音便 +1110,動詞,非自立可能,*,*,下一段-サ行,連用形-一般 +1111,動詞,非自立可能,*,*,下一段-タ行,仮定形-一般 +1112,動詞,非自立可能,*,*,下一段-タ行,仮定形-融合 +1113,動詞,非自立可能,*,*,下一段-タ行,命令形 +1114,動詞,非自立可能,*,*,下一段-タ行,意志推量形 +1115,動詞,非自立可能,*,*,下一段-タ行,未然形-一般 +1116,動詞,非自立可能,*,*,下一段-タ行,終止形-一般 +1117,動詞,非自立可能,*,*,下一段-タ行,終止形-撥音便 +1118,動詞,非自立可能,*,*,下一段-タ行,連体形-一般 +1119,動詞,非自立可能,*,*,下一段-タ行,連体形-撥音便 +1120,動詞,非自立可能,*,*,下一段-タ行,連用形-一般 +1121,動詞,非自立可能,*,*,下一段-マ行,仮定形-一般 +1122,動詞,非自立可能,*,*,下一段-マ行,仮定形-融合 +1123,動詞,非自立可能,*,*,下一段-マ行,命令形 +1124,動詞,非自立可能,*,*,下一段-マ行,意志推量形 +1125,動詞,非自立可能,*,*,下一段-マ行,未然形-一般 +1126,動詞,非自立可能,*,*,下一段-マ行,終止形-一般 +1127,動詞,非自立可能,*,*,下一段-マ行,終止形-撥音便 +1128,動詞,非自立可能,*,*,下一段-マ行,連体形-一般 +1129,動詞,非自立可能,*,*,下一段-マ行,連体形-撥音便 +1130,動詞,非自立可能,*,*,下一段-マ行,連用形-一般 +1131,動詞,非自立可能,*,*,下一段-ラ行,仮定形-一般 +1132,動詞,非自立可能,*,*,下一段-ラ行,仮定形-融合 +1133,動詞,非自立可能,*,*,下一段-ラ行,命令形 +1134,動詞,非自立可能,*,*,下一段-ラ行,意志推量形 +1135,動詞,非自立可能,*,*,下一段-ラ行,未然形-一般 +1136,動詞,非自立可能,*,*,下一段-ラ行,未然形-撥音便 +1137,動詞,非自立可能,*,*,下一段-ラ行,終止形-一般 +1138,動詞,非自立可能,*,*,下一段-ラ行,終止形-促音便 +1139,動詞,非自立可能,*,*,下一段-ラ行,終止形-撥音便 +1140,動詞,非自立可能,*,*,下一段-ラ行,連体形-一般 +1141,動詞,非自立可能,*,*,下一段-ラ行,連体形-撥音便 +1142,動詞,非自立可能,*,*,下一段-ラ行,連用形-一般 +1143,動詞,非自立可能,*,*,下一段-ラ行,連用形-撥音便 +1144,動詞,非自立可能,*,*,五段-カ行,仮定形-一般 +1145,動詞,非自立可能,*,*,五段-カ行,仮定形-融合 +1146,動詞,非自立可能,*,*,五段-カ行,命令形 +1147,動詞,非自立可能,*,*,五段-カ行,意志推量形 +1148,動詞,非自立可能,*,*,五段-カ行,未然形-一般 +1149,動詞,非自立可能,*,*,五段-カ行,終止形-一般 +1150,動詞,非自立可能,*,*,五段-カ行,連体形-一般 +1151,動詞,非自立可能,*,*,五段-カ行,連用形-イ音便 +1152,動詞,非自立可能,*,*,五段-カ行,連用形-一般 +1153,動詞,非自立可能,*,*,五段-カ行,連用形-促音便 +1154,動詞,非自立可能,*,*,五段-カ行,連用形-省略 +1155,動詞,非自立可能,*,*,五段-カ行,連用形-融合 +1156,動詞,非自立可能,*,*,五段-サ行,仮定形-一般 +1157,動詞,非自立可能,*,*,五段-サ行,仮定形-融合 +1158,動詞,非自立可能,*,*,五段-サ行,命令形 +1159,動詞,非自立可能,*,*,五段-サ行,意志推量形 +1160,動詞,非自立可能,*,*,五段-サ行,未然形-一般 +1161,動詞,非自立可能,*,*,五段-サ行,終止形-一般 
+1162,動詞,非自立可能,*,*,五段-サ行,連体形-一般 +1163,動詞,非自立可能,*,*,五段-サ行,連用形-一般 +1164,動詞,非自立可能,*,*,五段-サ行,連用形-融合 +1165,動詞,非自立可能,*,*,五段-マ行,仮定形-一般 +1166,動詞,非自立可能,*,*,五段-マ行,仮定形-融合 +1167,動詞,非自立可能,*,*,五段-マ行,命令形 +1168,動詞,非自立可能,*,*,五段-マ行,意志推量形 +1169,動詞,非自立可能,*,*,五段-マ行,未然形-一般 +1170,動詞,非自立可能,*,*,五段-マ行,終止形-一般 +1171,動詞,非自立可能,*,*,五段-マ行,連体形-一般 +1172,動詞,非自立可能,*,*,五段-マ行,連用形-一般 +1173,動詞,非自立可能,*,*,五段-マ行,連用形-撥音便 +1174,動詞,非自立可能,*,*,五段-マ行,連用形-融合 +1175,動詞,非自立可能,*,*,五段-ラ行,仮定形-一般 +1176,動詞,非自立可能,*,*,五段-ラ行,仮定形-融合 +1177,動詞,非自立可能,*,*,五段-ラ行,命令形 +1178,動詞,非自立可能,*,*,五段-ラ行,意志推量形 +1179,動詞,非自立可能,*,*,五段-ラ行,未然形-一般 +1180,動詞,非自立可能,*,*,五段-ラ行,未然形-撥音便 +1181,動詞,非自立可能,*,*,五段-ラ行,終止形-一般 +1182,動詞,非自立可能,*,*,五段-ラ行,終止形-撥音便 +1183,動詞,非自立可能,*,*,五段-ラ行,終止形-融合 +1184,動詞,非自立可能,*,*,五段-ラ行,連体形-一般 +1185,動詞,非自立可能,*,*,五段-ラ行,連体形-撥音便 +1186,動詞,非自立可能,*,*,五段-ラ行,連体形-省略 +1187,動詞,非自立可能,*,*,五段-ラ行,連用形-イ音便 +1188,動詞,非自立可能,*,*,五段-ラ行,連用形-一般 +1189,動詞,非自立可能,*,*,五段-ラ行,連用形-促音便 +1190,動詞,非自立可能,*,*,五段-ラ行,連用形-撥音便 +1191,動詞,非自立可能,*,*,五段-ラ行,連用形-省略 +1192,動詞,非自立可能,*,*,五段-ラ行,連用形-融合 +1193,動詞,非自立可能,*,*,五段-ワア行,仮定形-一般 +1194,動詞,非自立可能,*,*,五段-ワア行,命令形 +1195,動詞,非自立可能,*,*,五段-ワア行,意志推量形 +1196,動詞,非自立可能,*,*,五段-ワア行,未然形-一般 +1197,動詞,非自立可能,*,*,五段-ワア行,終止形-一般 +1198,動詞,非自立可能,*,*,五段-ワア行,連体形-一般 +1199,動詞,非自立可能,*,*,五段-ワア行,連用形-ウ音便 +1200,動詞,非自立可能,*,*,五段-ワア行,連用形-一般 +1201,動詞,非自立可能,*,*,五段-ワア行,連用形-促音便 +1202,動詞,非自立可能,*,*,文語カ行変格,命令形 +1203,動詞,非自立可能,*,*,文語カ行変格,已然形-一般 +1204,動詞,非自立可能,*,*,文語カ行変格,未然形-一般 +1205,動詞,非自立可能,*,*,文語カ行変格,終止形-一般 +1206,動詞,非自立可能,*,*,文語カ行変格,連体形-一般 +1207,動詞,非自立可能,*,*,文語カ行変格,連用形-一般 +1208,動詞,非自立可能,*,*,文語サ行変格,ク語法 +1209,動詞,非自立可能,*,*,文語サ行変格,命令形 +1210,動詞,非自立可能,*,*,文語サ行変格,已然形-一般 +1211,動詞,非自立可能,*,*,文語サ行変格,未然形-一般 +1212,動詞,非自立可能,*,*,文語サ行変格,終止形-一般 +1213,動詞,非自立可能,*,*,文語サ行変格,連体形-一般 +1214,動詞,非自立可能,*,*,文語サ行変格,連用形-一般 +1215,動詞,非自立可能,*,*,文語ラ行変格,命令形 +1216,動詞,非自立可能,*,*,文語ラ行変格,已然形-一般 +1217,動詞,非自立可能,*,*,文語ラ行変格,意志推量形 +1218,動詞,非自立可能,*,*,文語ラ行変格,未然形-一般 +1219,動詞,非自立可能,*,*,文語ラ行変格,終止形-一般 +1220,動詞,非自立可能,*,*,文語ラ行変格,終止形-撥音便 +1221,動詞,非自立可能,*,*,文語ラ行変格,連体形-一般 +1222,動詞,非自立可能,*,*,文語ラ行変格,連体形-撥音便 
+1223,動詞,非自立可能,*,*,文語ラ行変格,連用形-一般 +1224,動詞,非自立可能,*,*,文語ラ行変格,連用形-促音便 +1225,動詞,非自立可能,*,*,文語上一段-マ行,命令形 +1226,動詞,非自立可能,*,*,文語上一段-マ行,已然形-一般 +1227,動詞,非自立可能,*,*,文語上一段-マ行,未然形-一般 +1228,動詞,非自立可能,*,*,文語上一段-マ行,終止形-一般 +1229,動詞,非自立可能,*,*,文語上一段-マ行,連体形-一般 +1230,動詞,非自立可能,*,*,文語上一段-マ行,連用形-一般 +1231,動詞,非自立可能,*,*,文語上一段-ワ行,命令形 +1232,動詞,非自立可能,*,*,文語上一段-ワ行,已然形-一般 +1233,動詞,非自立可能,*,*,文語上一段-ワ行,未然形-一般 +1234,動詞,非自立可能,*,*,文語上一段-ワ行,終止形-一般 +1235,動詞,非自立可能,*,*,文語上一段-ワ行,連体形-一般 +1236,動詞,非自立可能,*,*,文語上一段-ワ行,連用形-一般 +1237,動詞,非自立可能,*,*,文語下二段-ア行,命令形 +1238,動詞,非自立可能,*,*,文語下二段-ア行,已然形-一般 +1239,動詞,非自立可能,*,*,文語下二段-ア行,未然形-一般 +1240,動詞,非自立可能,*,*,文語下二段-ア行,終止形-一般 +1241,動詞,非自立可能,*,*,文語下二段-ア行,連体形-一般 +1242,動詞,非自立可能,*,*,文語下二段-ア行,連用形-一般 +1243,動詞,非自立可能,*,*,文語下二段-カ行,命令形 +1244,動詞,非自立可能,*,*,文語下二段-カ行,已然形-一般 +1245,動詞,非自立可能,*,*,文語下二段-カ行,未然形-一般 +1246,動詞,非自立可能,*,*,文語下二段-カ行,終止形-一般 +1247,動詞,非自立可能,*,*,文語下二段-カ行,連体形-一般 +1248,動詞,非自立可能,*,*,文語下二段-カ行,連用形-一般 +1249,動詞,非自立可能,*,*,文語下二段-ガ行,命令形 +1250,動詞,非自立可能,*,*,文語下二段-ガ行,已然形-一般 +1251,動詞,非自立可能,*,*,文語下二段-ガ行,未然形-一般 +1252,動詞,非自立可能,*,*,文語下二段-ガ行,終止形-一般 +1253,動詞,非自立可能,*,*,文語下二段-ガ行,連体形-一般 +1254,動詞,非自立可能,*,*,文語下二段-ガ行,連用形-一般 +1255,動詞,非自立可能,*,*,文語下二段-サ行,命令形 +1256,動詞,非自立可能,*,*,文語下二段-サ行,已然形-一般 +1257,動詞,非自立可能,*,*,文語下二段-サ行,未然形-一般 +1258,動詞,非自立可能,*,*,文語下二段-サ行,終止形-一般 +1259,動詞,非自立可能,*,*,文語下二段-サ行,連体形-一般 +1260,動詞,非自立可能,*,*,文語下二段-サ行,連用形-一般 +1261,動詞,非自立可能,*,*,文語下二段-タ行,命令形 +1262,動詞,非自立可能,*,*,文語下二段-タ行,已然形-一般 +1263,動詞,非自立可能,*,*,文語下二段-タ行,未然形-一般 +1264,動詞,非自立可能,*,*,文語下二段-タ行,終止形-一般 +1265,動詞,非自立可能,*,*,文語下二段-タ行,連体形-一般 +1266,動詞,非自立可能,*,*,文語下二段-タ行,連用形-一般 +1267,動詞,非自立可能,*,*,文語下二段-ハ行,命令形 +1268,動詞,非自立可能,*,*,文語下二段-ハ行,已然形-一般 +1269,動詞,非自立可能,*,*,文語下二段-ハ行,未然形-一般 +1270,動詞,非自立可能,*,*,文語下二段-ハ行,終止形-一般 +1271,動詞,非自立可能,*,*,文語下二段-ハ行,連体形-一般 +1272,動詞,非自立可能,*,*,文語下二段-ハ行,連用形-一般 +1273,動詞,非自立可能,*,*,文語下二段-バ行,命令形 +1274,動詞,非自立可能,*,*,文語下二段-バ行,已然形-一般 +1275,動詞,非自立可能,*,*,文語下二段-バ行,未然形-一般 +1276,動詞,非自立可能,*,*,文語下二段-バ行,終止形-一般 +1277,動詞,非自立可能,*,*,文語下二段-バ行,連体形-一般 +1278,動詞,非自立可能,*,*,文語下二段-バ行,連用形-一般 +1279,動詞,非自立可能,*,*,文語下二段-マ行,命令形 +1280,動詞,非自立可能,*,*,文語下二段-マ行,已然形-一般 
+1281,動詞,非自立可能,*,*,文語下二段-マ行,未然形-一般 +1282,動詞,非自立可能,*,*,文語下二段-マ行,終止形-一般 +1283,動詞,非自立可能,*,*,文語下二段-マ行,連体形-一般 +1284,動詞,非自立可能,*,*,文語下二段-マ行,連用形-一般 +1285,動詞,非自立可能,*,*,文語下二段-ヤ行,命令形 +1286,動詞,非自立可能,*,*,文語下二段-ヤ行,已然形-一般 +1287,動詞,非自立可能,*,*,文語下二段-ヤ行,未然形-一般 +1288,動詞,非自立可能,*,*,文語下二段-ヤ行,終止形-一般 +1289,動詞,非自立可能,*,*,文語下二段-ヤ行,連体形-一般 +1290,動詞,非自立可能,*,*,文語下二段-ヤ行,連用形-一般 +1291,動詞,非自立可能,*,*,文語下二段-ラ行,命令形 +1292,動詞,非自立可能,*,*,文語下二段-ラ行,已然形-一般 +1293,動詞,非自立可能,*,*,文語下二段-ラ行,未然形-一般 +1294,動詞,非自立可能,*,*,文語下二段-ラ行,終止形-一般 +1295,動詞,非自立可能,*,*,文語下二段-ラ行,連体形-一般 +1296,動詞,非自立可能,*,*,文語下二段-ラ行,連用形-一般 +1297,動詞,非自立可能,*,*,文語四段-カ行,命令形 +1298,動詞,非自立可能,*,*,文語四段-カ行,已然形-一般 +1299,動詞,非自立可能,*,*,文語四段-カ行,意志推量形 +1300,動詞,非自立可能,*,*,文語四段-カ行,未然形-一般 +1301,動詞,非自立可能,*,*,文語四段-カ行,終止形-一般 +1302,動詞,非自立可能,*,*,文語四段-カ行,連体形-一般 +1303,動詞,非自立可能,*,*,文語四段-カ行,連用形-イ音便 +1304,動詞,非自立可能,*,*,文語四段-カ行,連用形-一般 +1305,動詞,非自立可能,*,*,文語四段-サ行,命令形 +1306,動詞,非自立可能,*,*,文語四段-サ行,已然形-一般 +1307,動詞,非自立可能,*,*,文語四段-サ行,意志推量形 +1308,動詞,非自立可能,*,*,文語四段-サ行,未然形-一般 +1309,動詞,非自立可能,*,*,文語四段-サ行,終止形-一般 +1310,動詞,非自立可能,*,*,文語四段-サ行,連体形-一般 +1311,動詞,非自立可能,*,*,文語四段-サ行,連用形-イ音便 +1312,動詞,非自立可能,*,*,文語四段-サ行,連用形-キ接続 +1313,動詞,非自立可能,*,*,文語四段-サ行,連用形-一般 +1314,動詞,非自立可能,*,*,文語四段-ハ行,ク語法 +1315,動詞,非自立可能,*,*,文語四段-ハ行,命令形 +1316,動詞,非自立可能,*,*,文語四段-ハ行,已然形-一般 +1317,動詞,非自立可能,*,*,文語四段-ハ行,意志推量形 +1318,動詞,非自立可能,*,*,文語四段-ハ行,未然形-一般 +1319,動詞,非自立可能,*,*,文語四段-ハ行,終止形-ウ音便 +1320,動詞,非自立可能,*,*,文語四段-ハ行,終止形-一般 +1321,動詞,非自立可能,*,*,文語四段-ハ行,連体形-ウ音便 +1322,動詞,非自立可能,*,*,文語四段-ハ行,連体形-一般 +1323,動詞,非自立可能,*,*,文語四段-ハ行,連用形-イ音便 +1324,動詞,非自立可能,*,*,文語四段-ハ行,連用形-ウ音便 +1325,動詞,非自立可能,*,*,文語四段-ハ行,連用形-一般 +1326,動詞,非自立可能,*,*,文語四段-ハ行,連用形-促音便 +1327,動詞,非自立可能,*,*,文語四段-マ行,命令形 +1328,動詞,非自立可能,*,*,文語四段-マ行,已然形-一般 +1329,動詞,非自立可能,*,*,文語四段-マ行,意志推量形 +1330,動詞,非自立可能,*,*,文語四段-マ行,未然形-一般 +1331,動詞,非自立可能,*,*,文語四段-マ行,終止形-一般 +1332,動詞,非自立可能,*,*,文語四段-マ行,連体形-一般 +1333,動詞,非自立可能,*,*,文語四段-マ行,連用形-ウ音便 +1334,動詞,非自立可能,*,*,文語四段-マ行,連用形-一般 +1335,動詞,非自立可能,*,*,文語四段-マ行,連用形-撥音便 +1336,動詞,非自立可能,*,*,文語四段-ラ行,命令形 +1337,動詞,非自立可能,*,*,文語四段-ラ行,已然形-一般 +1338,動詞,非自立可能,*,*,文語四段-ラ行,意志推量形 
+1339,動詞,非自立可能,*,*,文語四段-ラ行,未然形-一般 +1340,動詞,非自立可能,*,*,文語四段-ラ行,終止形-一般 +1341,動詞,非自立可能,*,*,文語四段-ラ行,連体形-一般 +1342,動詞,非自立可能,*,*,文語四段-ラ行,連用形-一般 +1343,動詞,非自立可能,*,*,文語四段-ラ行,連用形-促音便 +1344,名詞,助動詞語幹,*,*,*,* +1345,名詞,固有名詞,一般,*,*,* +1346,名詞,固有名詞,人名,一般,*,* +1347,名詞,固有名詞,人名,名,*,* +1348,名詞,固有名詞,人名,姓,*,* +1349,名詞,固有名詞,地名,一般,*,* +1350,名詞,固有名詞,地名,国,*,* +1351,名詞,数詞,*,*,*,* +1352,名詞,普通名詞,サ変可能,*,*,* +1353,名詞,普通名詞,サ変形状詞可能,*,*,* +1354,名詞,普通名詞,一般,*,*,* +1355,名詞,普通名詞,副詞可能,*,*,* +1356,名詞,普通名詞,助数詞可能,*,*,* +1357,名詞,普通名詞,形状詞可能,*,*,* +1358,形容詞,一般,*,*,形容詞,仮定形-一般 +1359,形容詞,一般,*,*,形容詞,仮定形-融合 +1360,形容詞,一般,*,*,形容詞,意志推量形 +1361,形容詞,一般,*,*,形容詞,終止形-一般 +1362,形容詞,一般,*,*,形容詞,終止形-促音便 +1363,形容詞,一般,*,*,形容詞,語幹-サ +1364,形容詞,一般,*,*,形容詞,語幹-一般 +1365,形容詞,一般,*,*,形容詞,連体形-一般 +1366,形容詞,一般,*,*,形容詞,連用形-ウ音便 +1367,形容詞,一般,*,*,形容詞,連用形-一般 +1368,形容詞,一般,*,*,形容詞,連用形-促音便 +1369,形容詞,一般,*,*,形容詞,連用形-省略 +1370,形容詞,一般,*,*,文語形容詞-ク,命令形 +1371,形容詞,一般,*,*,文語形容詞-ク,已然形-一般 +1372,形容詞,一般,*,*,文語形容詞-ク,已然形-補助 +1373,形容詞,一般,*,*,文語形容詞-ク,意志推量形 +1374,形容詞,一般,*,*,文語形容詞-ク,未然形-一般 +1375,形容詞,一般,*,*,文語形容詞-ク,未然形-補助 +1376,形容詞,一般,*,*,文語形容詞-ク,終止形-一般 +1377,形容詞,一般,*,*,文語形容詞-ク,終止形-補助 +1378,形容詞,一般,*,*,文語形容詞-ク,連体形-イ音便 +1379,形容詞,一般,*,*,文語形容詞-ク,連体形-一般 +1380,形容詞,一般,*,*,文語形容詞-ク,連体形-撥音便 +1381,形容詞,一般,*,*,文語形容詞-ク,連体形-補助 +1382,形容詞,一般,*,*,文語形容詞-ク,連用形-ウ音便 +1383,形容詞,一般,*,*,文語形容詞-ク,連用形-一般 +1384,形容詞,一般,*,*,文語形容詞-ク,連用形-補助 +1385,形容詞,一般,*,*,文語形容詞-シク,命令形 +1386,形容詞,一般,*,*,文語形容詞-シク,已然形-一般 +1387,形容詞,一般,*,*,文語形容詞-シク,意志推量形 +1388,形容詞,一般,*,*,文語形容詞-シク,未然形-一般 +1389,形容詞,一般,*,*,文語形容詞-シク,未然形-補助 +1390,形容詞,一般,*,*,文語形容詞-シク,終止形-一般 +1391,形容詞,一般,*,*,文語形容詞-シク,語幹-一般 +1392,形容詞,一般,*,*,文語形容詞-シク,連体形-イ音便 +1393,形容詞,一般,*,*,文語形容詞-シク,連体形-一般 +1394,形容詞,一般,*,*,文語形容詞-シク,連体形-撥音便 +1395,形容詞,一般,*,*,文語形容詞-シク,連体形-補助 +1396,形容詞,一般,*,*,文語形容詞-シク,連用形-ウ音便 +1397,形容詞,一般,*,*,文語形容詞-シク,連用形-一般 +1398,形容詞,一般,*,*,文語形容詞-シク,連用形-補助 +1399,形容詞,非自立可能,*,*,形容詞,仮定形-一般 +1400,形容詞,非自立可能,*,*,形容詞,仮定形-融合 +1401,形容詞,非自立可能,*,*,形容詞,意志推量形 +1402,形容詞,非自立可能,*,*,形容詞,終止形-一般 +1403,形容詞,非自立可能,*,*,形容詞,終止形-促音便 +1404,形容詞,非自立可能,*,*,形容詞,語幹-サ +1405,形容詞,非自立可能,*,*,形容詞,語幹-一般 
+1406,形容詞,非自立可能,*,*,形容詞,連体形-一般 +1407,形容詞,非自立可能,*,*,形容詞,連用形-ウ音便 +1408,形容詞,非自立可能,*,*,形容詞,連用形-一般 +1409,形容詞,非自立可能,*,*,形容詞,連用形-促音便 +1410,形容詞,非自立可能,*,*,形容詞,連用形-融合 +1411,形容詞,非自立可能,*,*,文語形容詞-ク,命令形 +1412,形容詞,非自立可能,*,*,文語形容詞-ク,已然形-一般 +1413,形容詞,非自立可能,*,*,文語形容詞-ク,已然形-補助 +1414,形容詞,非自立可能,*,*,文語形容詞-ク,意志推量形 +1415,形容詞,非自立可能,*,*,文語形容詞-ク,未然形-一般 +1416,形容詞,非自立可能,*,*,文語形容詞-ク,未然形-補助 +1417,形容詞,非自立可能,*,*,文語形容詞-ク,終止形-一般 +1418,形容詞,非自立可能,*,*,文語形容詞-ク,連体形-イ音便 +1419,形容詞,非自立可能,*,*,文語形容詞-ク,連体形-一般 +1420,形容詞,非自立可能,*,*,文語形容詞-ク,連体形-撥音便 +1421,形容詞,非自立可能,*,*,文語形容詞-ク,連体形-補助 +1422,形容詞,非自立可能,*,*,文語形容詞-ク,連用形-ウ音便 +1423,形容詞,非自立可能,*,*,文語形容詞-ク,連用形-一般 +1424,形容詞,非自立可能,*,*,文語形容詞-ク,連用形-補助 +1425,形状詞,タリ,*,*,*,* +1426,形状詞,一般,*,*,*,* +1427,形状詞,助動詞語幹,*,*,*,* +1428,感動詞,フィラー,*,*,*,* +1429,感動詞,一般,*,*,*,* +1430,接尾辞,動詞的,*,*,上一段-マ行,仮定形-一般 +1431,接尾辞,動詞的,*,*,上一段-マ行,仮定形-融合 +1432,接尾辞,動詞的,*,*,上一段-マ行,命令形 +1433,接尾辞,動詞的,*,*,上一段-マ行,意志推量形 +1434,接尾辞,動詞的,*,*,上一段-マ行,未然形-一般 +1435,接尾辞,動詞的,*,*,上一段-マ行,終止形-一般 +1436,接尾辞,動詞的,*,*,上一段-マ行,終止形-撥音便 +1437,接尾辞,動詞的,*,*,上一段-マ行,連体形-一般 +1438,接尾辞,動詞的,*,*,上一段-マ行,連体形-撥音便 +1439,接尾辞,動詞的,*,*,上一段-マ行,連用形-一般 +1440,接尾辞,動詞的,*,*,下一段-ナ行,仮定形-一般 +1441,接尾辞,動詞的,*,*,下一段-ナ行,仮定形-融合 +1442,接尾辞,動詞的,*,*,下一段-ナ行,命令形 +1443,接尾辞,動詞的,*,*,下一段-ナ行,意志推量形 +1444,接尾辞,動詞的,*,*,下一段-ナ行,未然形-一般 +1445,接尾辞,動詞的,*,*,下一段-ナ行,終止形-一般 +1446,接尾辞,動詞的,*,*,下一段-ナ行,終止形-撥音便 +1447,接尾辞,動詞的,*,*,下一段-ナ行,連体形-一般 +1448,接尾辞,動詞的,*,*,下一段-ナ行,連体形-撥音便 +1449,接尾辞,動詞的,*,*,下一段-ナ行,連用形-一般 +1450,接尾辞,動詞的,*,*,下一段-ラ行,仮定形-一般 +1451,接尾辞,動詞的,*,*,下一段-ラ行,仮定形-融合 +1452,接尾辞,動詞的,*,*,下一段-ラ行,命令形 +1453,接尾辞,動詞的,*,*,下一段-ラ行,意志推量形 +1454,接尾辞,動詞的,*,*,下一段-ラ行,未然形-一般 +1455,接尾辞,動詞的,*,*,下一段-ラ行,終止形-一般 +1456,接尾辞,動詞的,*,*,下一段-ラ行,終止形-撥音便 +1457,接尾辞,動詞的,*,*,下一段-ラ行,連体形-一般 +1458,接尾辞,動詞的,*,*,下一段-ラ行,連体形-撥音便 +1459,接尾辞,動詞的,*,*,下一段-ラ行,連用形-一般 +1460,接尾辞,動詞的,*,*,五段-カ行,仮定形-一般 +1461,接尾辞,動詞的,*,*,五段-カ行,仮定形-融合 +1462,接尾辞,動詞的,*,*,五段-カ行,命令形 +1463,接尾辞,動詞的,*,*,五段-カ行,意志推量形 +1464,接尾辞,動詞的,*,*,五段-カ行,未然形-一般 +1465,接尾辞,動詞的,*,*,五段-カ行,終止形-一般 +1466,接尾辞,動詞的,*,*,五段-カ行,連体形-一般 +1467,接尾辞,動詞的,*,*,五段-カ行,連用形-イ音便 +1468,接尾辞,動詞的,*,*,五段-カ行,連用形-一般 
+1469,接尾辞,動詞的,*,*,五段-カ行,連用形-融合 +1470,接尾辞,動詞的,*,*,五段-サ行,仮定形-一般 +1471,接尾辞,動詞的,*,*,五段-サ行,仮定形-融合 +1472,接尾辞,動詞的,*,*,五段-サ行,命令形 +1473,接尾辞,動詞的,*,*,五段-サ行,意志推量形 +1474,接尾辞,動詞的,*,*,五段-サ行,未然形-一般 +1475,接尾辞,動詞的,*,*,五段-サ行,終止形-一般 +1476,接尾辞,動詞的,*,*,五段-サ行,連体形-一般 +1477,接尾辞,動詞的,*,*,五段-サ行,連用形-一般 +1478,接尾辞,動詞的,*,*,五段-サ行,連用形-融合 +1479,接尾辞,動詞的,*,*,五段-マ行,仮定形-一般 +1480,接尾辞,動詞的,*,*,五段-マ行,仮定形-融合 +1481,接尾辞,動詞的,*,*,五段-マ行,命令形 +1482,接尾辞,動詞的,*,*,五段-マ行,意志推量形 +1483,接尾辞,動詞的,*,*,五段-マ行,未然形-一般 +1484,接尾辞,動詞的,*,*,五段-マ行,終止形-一般 +1485,接尾辞,動詞的,*,*,五段-マ行,連体形-一般 +1486,接尾辞,動詞的,*,*,五段-マ行,連用形-一般 +1487,接尾辞,動詞的,*,*,五段-マ行,連用形-撥音便 +1488,接尾辞,動詞的,*,*,五段-マ行,連用形-融合 +1489,接尾辞,動詞的,*,*,五段-ラ行,仮定形-一般 +1490,接尾辞,動詞的,*,*,五段-ラ行,仮定形-融合 +1491,接尾辞,動詞的,*,*,五段-ラ行,命令形 +1492,接尾辞,動詞的,*,*,五段-ラ行,意志推量形 +1493,接尾辞,動詞的,*,*,五段-ラ行,未然形-一般 +1494,接尾辞,動詞的,*,*,五段-ラ行,未然形-撥音便 +1495,接尾辞,動詞的,*,*,五段-ラ行,終止形-一般 +1496,接尾辞,動詞的,*,*,五段-ラ行,終止形-撥音便 +1497,接尾辞,動詞的,*,*,五段-ラ行,連体形-一般 +1498,接尾辞,動詞的,*,*,五段-ラ行,連体形-撥音便 +1499,接尾辞,動詞的,*,*,五段-ラ行,連用形-一般 +1500,接尾辞,動詞的,*,*,五段-ラ行,連用形-促音便 +1501,接尾辞,動詞的,*,*,五段-ラ行,連用形-撥音便 +1502,接尾辞,動詞的,*,*,五段-ラ行,連用形-融合 +1503,接尾辞,名詞的,サ変可能,*,*,* +1504,接尾辞,名詞的,一般,*,*,* +1505,接尾辞,名詞的,副詞可能,*,*,* +1506,接尾辞,名詞的,助数詞,*,*,* +1507,接尾辞,形容詞的,*,*,形容詞,仮定形-一般 +1508,接尾辞,形容詞的,*,*,形容詞,仮定形-融合 +1509,接尾辞,形容詞的,*,*,形容詞,意志推量形 +1510,接尾辞,形容詞的,*,*,形容詞,終止形-一般 +1511,接尾辞,形容詞的,*,*,形容詞,語幹-一般 +1512,接尾辞,形容詞的,*,*,形容詞,連体形-一般 +1513,接尾辞,形容詞的,*,*,形容詞,連用形-ウ音便 +1514,接尾辞,形容詞的,*,*,形容詞,連用形-一般 +1515,接尾辞,形容詞的,*,*,形容詞,連用形-促音便 +1516,接尾辞,形容詞的,*,*,文語形容詞-ク,命令形 +1517,接尾辞,形容詞的,*,*,文語形容詞-ク,已然形-一般 +1518,接尾辞,形容詞的,*,*,文語形容詞-ク,已然形-補助 +1519,接尾辞,形容詞的,*,*,文語形容詞-ク,意志推量形 +1520,接尾辞,形容詞的,*,*,文語形容詞-ク,未然形-一般 +1521,接尾辞,形容詞的,*,*,文語形容詞-ク,未然形-補助 +1522,接尾辞,形容詞的,*,*,文語形容詞-ク,終止形-一般 +1523,接尾辞,形容詞的,*,*,文語形容詞-ク,連体形-イ音便 +1524,接尾辞,形容詞的,*,*,文語形容詞-ク,連体形-一般 +1525,接尾辞,形容詞的,*,*,文語形容詞-ク,連体形-撥音便 +1526,接尾辞,形容詞的,*,*,文語形容詞-ク,連体形-補助 +1527,接尾辞,形容詞的,*,*,文語形容詞-ク,連用形-ウ音便 +1528,接尾辞,形容詞的,*,*,文語形容詞-ク,連用形-一般 +1529,接尾辞,形容詞的,*,*,文語形容詞-ク,連用形-補助 +1530,接尾辞,形容詞的,*,*,文語形容詞-シク,命令形 +1531,接尾辞,形容詞的,*,*,文語形容詞-シク,已然形-一般 +1532,接尾辞,形容詞的,*,*,文語形容詞-シク,意志推量形 
+1533,接尾辞,形容詞的,*,*,文語形容詞-シク,未然形-一般 +1534,接尾辞,形容詞的,*,*,文語形容詞-シク,未然形-補助 +1535,接尾辞,形容詞的,*,*,文語形容詞-シク,終止形-一般 +1536,接尾辞,形容詞的,*,*,文語形容詞-シク,語幹-一般 +1537,接尾辞,形容詞的,*,*,文語形容詞-シク,連体形-イ音便 +1538,接尾辞,形容詞的,*,*,文語形容詞-シク,連体形-一般 +1539,接尾辞,形容詞的,*,*,文語形容詞-シク,連体形-撥音便 +1540,接尾辞,形容詞的,*,*,文語形容詞-シク,連体形-補助 +1541,接尾辞,形容詞的,*,*,文語形容詞-シク,連用形-ウ音便 +1542,接尾辞,形容詞的,*,*,文語形容詞-シク,連用形-一般 +1543,接尾辞,形容詞的,*,*,文語形容詞-シク,連用形-補助 +1544,接尾辞,形状詞的,*,*,*,* +1545,接続詞,*,*,*,*,* +1546,接頭辞,*,*,*,*,* +1547,空白,*,*,*,*,* +1548,補助記号,一般,*,*,*,* +1549,補助記号,句点,*,*,*,* +1550,補助記号,括弧閉,*,*,*,* +1551,補助記号,括弧開,*,*,*,* +1552,補助記号,読点,*,*,*,* +1553,補助記号,AA,一般,*,*,* +1554,補助記号,AA,顔文字,*,*,* +1555,記号,一般,*,*,*,* +1556,記号,文字,*,*,*,* +1557,連体詞,*,*,*,*,* From 91ddc90e16decfe414cdecbd3edae9441d042bcc Mon Sep 17 00:00:00 2001 From: mh-northlander Date: Mon, 2 Sep 2024 09:38:47 +0900 Subject: [PATCH 80/94] add getUserData for morpheme --- .../com/worksap/nlp/sudachi/Morpheme.java | 10 +++++++++ .../com/worksap/nlp/sudachi/MorphemeImpl.java | 6 +++++ .../worksap/nlp/sudachi/MorphemeImplTest.kt | 22 +++++++++++++++++++ 3 files changed, 38 insertions(+) diff --git a/src/main/java/com/worksap/nlp/sudachi/Morpheme.java b/src/main/java/com/worksap/nlp/sudachi/Morpheme.java index e9ac0d3c..37292e2f 100644 --- a/src/main/java/com/worksap/nlp/sudachi/Morpheme.java +++ b/src/main/java/com/worksap/nlp/sudachi/Morpheme.java @@ -147,4 +147,14 @@ public interface Morpheme { * @return the array of synonym group IDs */ public int[] getSynonymGroupIds(); + + /** + * Returns the user customized data of the morpheme. + * + * If the morpheme is in the system dictionary, is an oov, or user data is not + * set, returns an empty string. 
+ * + * @return the user data String + */ + public String getUserData(); } diff --git a/src/main/java/com/worksap/nlp/sudachi/MorphemeImpl.java b/src/main/java/com/worksap/nlp/sudachi/MorphemeImpl.java index 93f49ee1..1bb64ee5 100644 --- a/src/main/java/com/worksap/nlp/sudachi/MorphemeImpl.java +++ b/src/main/java/com/worksap/nlp/sudachi/MorphemeImpl.java @@ -99,6 +99,12 @@ public int[] getSynonymGroupIds() { return wi.getSynonymGroupIds(); } + @Override + public String getUserData() { + WordInfo wi = getWordInfo(); + return wi.getUserData(); + } + private LatticeNodeImpl node() { LatticeNodeImpl n = node; if (n == null) { diff --git a/src/test/java/com/worksap/nlp/sudachi/MorphemeImplTest.kt b/src/test/java/com/worksap/nlp/sudachi/MorphemeImplTest.kt index 5b5bf740..4b744239 100644 --- a/src/test/java/com/worksap/nlp/sudachi/MorphemeImplTest.kt +++ b/src/test/java/com/worksap/nlp/sudachi/MorphemeImplTest.kt @@ -18,6 +18,7 @@ package com.worksap.nlp.sudachi import kotlin.test.Test import kotlin.test.assertEquals +import kotlin.test.assertTrue class MorphemeImplTest { @Test @@ -30,4 +31,25 @@ class MorphemeImplTest { "MorphemeImpl{begin=0, end=1, surface=す, pos=4/名詞,普通名詞,一般,*,*,*, wid=(15,4)}", sudachi[0].toString()) } + + @Test + fun userdata() { + // system + val sdic = TestDictionary.user0() + val tokyo = sdic.create().tokenize("東京") + assertTrue(tokyo[0].getUserData().isEmpty()) + + // oov + val oovs = sdic.create().tokenize("すだち") + assertTrue(oovs[0].getUserData().isEmpty()) + + // user with data + val udic = TestDictionary.user1() + val sudachi = udic.create().tokenize("すだち") + assertEquals("徳島県産", sudachi[0].getUserData()) + + // user without data + val piraru = udic.create().tokenize("ぴらる") + assertTrue(piraru[0].getUserData().isEmpty()) + } } From c9e98db2704ed6919c9d021314f59c8b7695f1d3 Mon Sep 17 00:00:00 2001 From: mh-northlander Date: Mon, 2 Sep 2024 10:01:14 +0900 Subject: [PATCH 81/94] test empty pos id/parts column --- 
.../dictionary/build/RawLexiconReaderTest.kt | 39 +++++++++++++++++++ 1 file changed, 39 insertions(+) diff --git a/src/test/java/com/worksap/nlp/sudachi/dictionary/build/RawLexiconReaderTest.kt b/src/test/java/com/worksap/nlp/sudachi/dictionary/build/RawLexiconReaderTest.kt index 17c352f7..1d792d6b 100644 --- a/src/test/java/com/worksap/nlp/sudachi/dictionary/build/RawLexiconReaderTest.kt +++ b/src/test/java/com/worksap/nlp/sudachi/dictionary/build/RawLexiconReaderTest.kt @@ -158,6 +158,31 @@ class RawLexiconReaderTest { assertNull(reader.nextEntry()) } + @Test + fun posIdAndEmptyParts() { + val text = + """Surface,LeftId,RightId,Cost,pos1,pos2,pos3,pos4,pos5,pos6,pos_id,reading_form,normalized_form,DictionaryForm,splita,splitb,wordstructure +東京都,6,8,5320,,,,,,,0,トウキョウト,,,,,""" + val posTable = POSTable() + posTable.getId(POS("a", "a", "a", "a", "a", "0")) + + val reader = RawLexiconReader(csvtext(text), posTable, false) + assertNotNull(reader.nextEntry()).let { e -> assertEquals(0, e.posId) } + assertNull(reader.nextEntry()) + } + + @Test + fun posPartsAndEmptyPosId() { + val text = + """Surface,LeftId,RightId,Cost,pos_id,pos1,pos2,pos3,pos4,pos5,pos6,reading_form,normalized_form,DictionaryForm,splita,splitb,wordstructure +東京都,6,8,5320,0,名詞,固有名詞,地名,一般,*,*,トウキョウト,,,,,""" + val posTable = POSTable() + + val reader = RawLexiconReader(csvtext(text), posTable, false) + assertNotNull(reader.nextEntry()).let { e -> assertEquals(0, e.posId) } + assertNull(reader.nextEntry()) + } + @Test fun failPosIdAndPartsNotMatch() { val text = @@ -186,6 +211,20 @@ class RawLexiconReaderTest { } } + @Test + fun failPosColumnEmpty() { + val text = + """Surface,LeftId,RightId,Cost,pos1,pos2,pos3,pos4,pos5,pos6,pos_id,reading_form,normalized_form,DictionaryForm,splita,splitb,wordstructure +東京都,6,8,5320,,,,,,,,トウキョウト,,,,,""" + val posTable = POSTable() + posTable.getId(POS("a", "a", "a", "a", "a", "0")) + + assertFails { + val reader = RawLexiconReader(csvtext(text), posTable, false) + 
reader.nextEntry() + } + } + @Test fun failTooLongValue() { val oversizeWord = "a".repeat(StringPtr.MAX_LENGTH + 1) From 6698ed9d4c1a174c3bf9b7042b6b2e63e4d4fccc Mon Sep 17 00:00:00 2001 From: mh-northlander Date: Mon, 2 Sep 2024 10:28:40 +0900 Subject: [PATCH 82/94] copy entry data for phantom entry --- .../sudachi/dictionary/build/RawLexicon.java | 12 +++-- .../dictionary/build/RawWordEntry.java | 44 ++++++++++++++++--- 2 files changed, 42 insertions(+), 14 deletions(-) diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/RawLexicon.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/RawLexicon.java index fe7185e6..59dda59c 100644 --- a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/RawLexicon.java +++ b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/RawLexicon.java @@ -207,15 +207,13 @@ private int addPhantomEntries(RawWordEntry entry, List list, Entry if (lookup.byHeadword(ref.getHeadword()) != null) { return 0; } - RawWordEntry copy = RawWordEntry.makeEmpty(); - copy.headword = ref.getHeadword(); - copy.reading = copy.headword; - copy.posId = entry.posId; + + RawWordEntry phantom = RawWordEntry.makePhantom(entry, ref.getHeadword()); RawWordEntry last = list.get(list.size() - 1); - copy.pointer = RawLexicon + phantom.pointer = RawLexicon .pointer((long) WordInfoList.wordId2offset(last.pointer) + last.computeExpectedSize()); - list.add(copy); - lookup.add(copy, isUser); + list.add(phantom); + lookup.add(phantom, isUser); nPhantomEntries += 1; return 1; } else { diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/RawWordEntry.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/RawWordEntry.java index 58a28041..c329611a 100644 --- a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/RawWordEntry.java +++ b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/RawWordEntry.java @@ -31,20 +31,20 @@ public class RawWordEntry implements EntryLookup.Entry { int pointer; // wordid, compressed offset 
of this entry in the lexicon.WordEntries String headword; + short leftId; + short rightId; + short cost; + short posId; String reading; WordRef normalizedForm; WordRef dictionaryForm; + String mode; List aUnitSplit; List bUnitSplit; List cUnitSplit; List wordStructure; Ints synonymGroups; String userData; - String mode; - short leftId; - short rightId; - short cost; - short posId; int sourceLine; String sourceName; @@ -126,23 +126,53 @@ public void publishStrings(StringStorage strings) { } } + /** + * Create empty RawWordEntry. + */ public static RawWordEntry makeEmpty() { RawWordEntry entry = new RawWordEntry(); entry.headword = ""; + entry.leftId = -1; + entry.rightId = -1; + entry.cost = Short.MAX_VALUE; + entry.posId = 0; entry.reading = ""; + //// null wordRef refers to self // entry.normalizedForm // entry.dictionaryForm + entry.mode = "A"; entry.aUnitSplit = new ArrayList<>(); entry.bUnitSplit = new ArrayList<>(); entry.cUnitSplit = new ArrayList<>(); entry.wordStructure = new ArrayList<>(); entry.synonymGroups = Ints.wrap(Ints.EMPTY_ARRAY); entry.userData = ""; - entry.mode = "A"; + return entry; + } + + /** + * Create phantom entry, that is referred for the normalized form of the base + * entry. 
+ */ + public static RawWordEntry makePhantom(RawWordEntry base, String surface) { + RawWordEntry entry = new RawWordEntry(); + entry.headword = surface; + // phantom entry should not be used in the analysis entry.leftId = -1; entry.rightId = -1; entry.cost = Short.MAX_VALUE; - entry.posId = 0; + entry.posId = base.posId; + entry.reading = base.reading; + // phantom.normalized should be phantom itself + // entry.normalizedForm = base.normalizedForm; + entry.dictionaryForm = base.dictionaryForm; + entry.mode = base.mode; + entry.aUnitSplit = base.aUnitSplit; + entry.bUnitSplit = base.bUnitSplit; + entry.cUnitSplit = base.cUnitSplit; + entry.wordStructure = base.wordStructure; + entry.synonymGroups = base.synonymGroups; + entry.userData = base.userData; return entry; } } From 8f1007ac85d771da219edda37f7f679fc9465508 Mon Sep 17 00:00:00 2001 From: mh-northlander Date: Tue, 3 Sep 2024 11:13:31 +0900 Subject: [PATCH 83/94] try to satisfy sonarcloud --- .../dictionary/DictionaryGrammarPrinter.java | 2 +- .../sudachi/dictionary/DictionaryPrinter.java | 4 +- .../worksap/nlp/sudachi/dictionary/Ints.java | 6 ++ .../dictionary/build/ConnectionMatrix.java | 7 +- .../sudachi/dictionary/build/DicBuilder.java | 4 +- .../sudachi/dictionary/build/POSTable.java | 34 +++--- .../dictionary/build/RawLexiconReader.java | 101 +++++++++--------- .../dictionary/build/RawWordEntry.java | 3 +- .../dictionary/build/StringLayout.java | 2 +- .../sudachi/dictionary/build/POSTableTest.kt | 2 +- 10 files changed, 87 insertions(+), 78 deletions(-) diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/DictionaryGrammarPrinter.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/DictionaryGrammarPrinter.java index 3abad3f7..98ba9c48 100644 --- a/src/main/java/com/worksap/nlp/sudachi/dictionary/DictionaryGrammarPrinter.java +++ b/src/main/java/com/worksap/nlp/sudachi/dictionary/DictionaryGrammarPrinter.java @@ -38,7 +38,7 @@ static void printUsage() { } static void printHeader(PrintStream 
output) { - List columnNames = Arrays.asList(POSTable.POSCSVReader.Column.values()).stream().map(c -> c.name()) + List columnNames = Arrays.asList(POSTable.POSCSVReader.Column.values()).stream().map(Enum::name) .collect(Collectors.toList()); output.println(String.join(",", columnNames)); } diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/DictionaryPrinter.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/DictionaryPrinter.java index 8a7fad4f..99f3ceab 100644 --- a/src/main/java/com/worksap/nlp/sudachi/dictionary/DictionaryPrinter.java +++ b/src/main/java/com/worksap/nlp/sudachi/dictionary/DictionaryPrinter.java @@ -47,13 +47,13 @@ public class DictionaryPrinter { public enum POSMode { PARTS, ID, BOTH; - public static POSMode DEFAULT = PARTS; + public static final POSMode DEFAULT = PARTS; } public enum WordRefMode { TRIPLE_PARTS, TRIPLE_ID; - public static WordRefMode DEFAULT = TRIPLE_PARTS; + public static final WordRefMode DEFAULT = TRIPLE_PARTS; } DictionaryPrinter(PrintStream output, BinaryDictionary dic, BinaryDictionary base) { diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/Ints.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/Ints.java index a1c3f1fd..1d737231 100644 --- a/src/main/java/com/worksap/nlp/sudachi/dictionary/Ints.java +++ b/src/main/java/com/worksap/nlp/sudachi/dictionary/Ints.java @@ -18,6 +18,7 @@ import java.nio.ByteBuffer; import java.util.Arrays; +import java.util.Objects; import java.util.StringJoiner; /** @@ -138,6 +139,11 @@ public boolean equals(Object other) { return true; } + @Override + public int hashCode() { + return Objects.hash(this.length, Arrays.copyOfRange(this.data, 0, this.length)); + } + /** * Make sure the internal buffer has enough capacity for the specified size. 
* This also increases length and they should be filled using {@code set} or diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/ConnectionMatrix.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/ConnectionMatrix.java index 795fe708..6e10c66d 100644 --- a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/ConnectionMatrix.java +++ b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/ConnectionMatrix.java @@ -99,11 +99,8 @@ public long readEntries(InputStream data) throws IOException { long numLines = 0; - while (true) { - String line = reader.readLine(); - if (line == null) { - break; - } + String line; + while ((line = reader.readLine()) != null) { if (OPT_WHITESPACE.matcher(line).matches()) { continue; } diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/DicBuilder.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/DicBuilder.java index 235a4503..b06f0e94 100644 --- a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/DicBuilder.java +++ b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/DicBuilder.java @@ -226,7 +226,7 @@ public System signature(String signature) { /** Read POS list from the csv file. 
*/ public System posTable(String name, IOSupplier input, long size) throws IOException { - if (!pos.allowNewPos) { + if (!pos.isNewPosAllowed()) { throw new IllegalArgumentException("POS list already loaded (only single POS file is allowed)."); } @@ -235,7 +235,7 @@ public System posTable(String name, IOSupplier input, long size) th try (InputStream is = input.get()) { InputStream stream = new ProgressInputStream(is, size, progress); nRead = pos.readEntries(stream); - pos.allowNewPos = false; + pos.setAllowNewPos(false); } progress.endBlock(nRead, nanoTime()); return this; diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/POSTable.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/POSTable.java index 5bcc32dd..ea2f7b1d 100644 --- a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/POSTable.java +++ b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/POSTable.java @@ -35,7 +35,7 @@ public class POSTable { private final List table = new ArrayList<>(); private final HashMap lookup = new HashMap<>(); - public boolean allowNewPos = true; + private boolean allowNewPos = true; // number of pos loaded from the system dictionary. private short builtin = 0; @@ -78,6 +78,14 @@ public int ownedLength() { return table.size() - builtin; } + public void setAllowNewPos(boolean value) { + this.allowNewPos = value; + } + + public boolean isNewPosAllowed() { + return this.allowNewPos; + } + /** * Add pos at the index `id` of the table. This may creates null entry in the * table. 
@@ -90,7 +98,7 @@ public int ownedLength() { */ private short addPosAt(POS pos, short id) { if (!allowNewPos) { - throw new IllegalArgumentException(String.format("new POS is not allowed", pos)); + throw new IllegalArgumentException("new POS is not allowed: " + pos); } if (id >= MAX_POS_NUMBER) { throw new IllegalArgumentException("id " + id + " exceeds the maximum POS number"); @@ -181,10 +189,9 @@ public int readEntries(InputStream data) throws IOException { /** * Data class for pos read from csv. */ - static class POSWithId { - public POS pos; - public short id = -1; - public int sourceLine; + private static class POSWithId { + public final POS pos; + public final short id; POSWithId(POS pos, short id) { this.pos = pos; @@ -192,7 +199,7 @@ static class POSWithId { } POSWithId(POS pos) { - this.pos = pos; + this(pos, (short) -1); } } @@ -205,9 +212,10 @@ public static class POSCSVReader { private CSVParser parser; private int[] columnMapping; private List cachedRow; - public boolean hasIdColumn = true; + private boolean hasIdColumn = true; - public Column[] PART_COLUMNS = { Column.POS1, Column.POS2, Column.POS3, Column.POS4, Column.POS5, Column.POS6 }; + public static final Column[] PART_COLUMNS = { Column.POS1, Column.POS2, Column.POS3, Column.POS4, Column.POS5, + Column.POS6 }; public enum Column { POS_ID(false), POS1(true), POS2(true), POS3(true), POS4(true), POS5(true), POS6(true); @@ -262,7 +270,7 @@ private void resolveColumnLayout() throws IOException { for (int colIdx = 0; colIdx < row.size(); colIdx++) { String elem = row.get(colIdx); parsed = Column.fromString(elem); - if (!remaining.contains(parsed)) { + if (parsed == null || !remaining.contains(parsed)) { throw new InputFileException(parser.getName(), 0, elem, new IllegalArgumentException("Invalid column name")); } @@ -314,7 +322,7 @@ private POSWithId convertRow(List data) { * * returned pos-id is -1 when pos-id column is missing. 
*/ - POSWithId nextPos() throws IOException { + private POSWithId nextPos() throws IOException { List row = cachedRow; if (row == null) { row = parser.getNextRow(); @@ -324,9 +332,7 @@ POSWithId nextPos() throws IOException { if (row == null) { return null; } - POSWithId pos = convertRow(row); - pos.sourceLine = parser.getRowCount(); - return pos; + return convertRow(row); } } diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/RawLexiconReader.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/RawLexiconReader.java index 69ca9797..4212eb5a 100644 --- a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/RawLexiconReader.java +++ b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/RawLexiconReader.java @@ -25,6 +25,7 @@ import java.util.List; import java.util.StringJoiner; import java.util.regex.Pattern; +import java.util.stream.Collectors; /** * Reader for the lexicon csv file. @@ -40,11 +41,25 @@ public enum Column { true), DICTIONARY_FORM(true), MODE(false), SPLIT_A(true), SPLIT_B(true), WORD_STRUCTURE( true), SYNONYM_GROUPS(false), SPLIT_C(false), USER_DATA(false), POS_ID(false); + private static final List POS_PARTS = Arrays.asList(POS1, POS2, POS3, POS4, POS5, POS6); private final boolean required; Column(boolean required) { this.required = required; } + + /** + * Parse string as Column, ignoring "_" and cases. 
+ */ + private static Column fromString(String str) { + String processed = str.replace("_", ""); + for (Column col : Column.values()) { + if (col.name().replace("_", "").equalsIgnoreCase(processed)) { + return col; + } + } + return null; + } } private static final Pattern INTEGER_REGEX = Pattern.compile("^-?\\d+$"); @@ -94,41 +109,40 @@ private void resolveColumnLayout() throws IOException { Arrays.fill(mapping, -1); for (int fieldId = 0; fieldId < row.size(); ++fieldId) { - String field = row.get(fieldId).replace("_", ""); - boolean columnFound = false; - for (int colId = 0; colId < remaining.size(); ++colId) { - Column col = remaining.get(colId); - if (col.name().replace("_", "").equalsIgnoreCase(field)) { - mapping[col.ordinal()] = fieldId; - remaining.remove(colId); - columnFound = true; - break; - } - } - if (!columnFound) { + String field = row.get(fieldId); + Column parsed = Column.fromString(field); + if (parsed == null || !remaining.contains(parsed)) { throw new InputFileException(parser.getName(), 0, field, new IllegalArgumentException("Invalid column name")); } + mapping[parsed.ordinal()] = fieldId; + remaining.remove(remaining.indexOf(parsed)); } - for (Column column : remaining) { - if (column.required) { - StringJoiner joiner = new StringJoiner(", ", "required columns [", "] were not present in the header"); - remaining.stream().filter(c -> c.required).forEach(c -> joiner.add(c.name())); - throw new InputFileException(parser.getName(), 0, "", new IllegalArgumentException(joiner.toString())); - } + List missings = remaining.stream().filter(c -> c.required).collect(Collectors.toList()); + if (!missings.isEmpty()) { + StringJoiner joiner = new StringJoiner(", ", "required columns [", "] were not present in the header"); + missings.stream().forEach(c -> joiner.add(c.name())); + throw new InputFileException(parser.getName(), 0, "", new IllegalArgumentException(joiner.toString())); } - boolean posIdExists = mapping[Column.POS_ID.ordinal()] >= 0; - long 
numPosColumnsFound = Arrays - .asList(Column.POS1, Column.POS2, Column.POS3, Column.POS4, Column.POS5, Column.POS6).stream() - .filter(c -> mapping[c.ordinal()] >= 0).count(); + verifyPosColumns(); + } + + private void verifyPosColumns() { + if (mapping == null) { + return; + } + + long numPosColumnsFound = Column.POS_PARTS.stream().filter(c -> mapping[c.ordinal()] >= 0).count(); if (numPosColumnsFound != 0 && numPosColumnsFound != POS.DEPTH) { - throw new InputFileException(parser.getName(), 0, "POS", + throw new InputFileException(parser.getName(), 0, "POS_PARTS", new IllegalArgumentException("Pos1 ~ Pos6 columns must appear as a set.")); } - boolean posStrExists = numPosColumnsFound == POS.DEPTH; - if (!posIdExists && !posStrExists) { + + boolean posIdExists = mapping[Column.POS_ID.ordinal()] >= 0; + boolean posPartsExists = numPosColumnsFound == POS.DEPTH; + if (!posIdExists && !posPartsExists) { throw new InputFileException(parser.getName(), 0, "POS", new IllegalArgumentException("Both or either PosId column or Pos1~Pos6 columns are required.")); } @@ -245,46 +259,33 @@ private WordRef getWordRef(List data, Column column, WordRef.Parser refP /** parse POS columns. 
*/ private short getPos(List data) { - boolean idColumnExists = false; - boolean strColumnExists = true; - if (!isLegacyColumnLayout()) { - idColumnExists = mapping[Column.POS_ID.ordinal()] >= 0; - // existance of POS1-6 is checked in column layout resolution - strColumnExists = mapping[Column.POS1.ordinal()] >= 0; - } - - short posId = -1; short posStrId = -1; - - if (strColumnExists && (!idColumnExists || !get(data, Column.POS1, false).isEmpty())) { - // if both id/parts exist, allow empty (-1) + if (!get(data, Column.POS1, false).isEmpty()) { POS pos = new POS( // comment for line break get(data, Column.POS1, true), get(data, Column.POS2, true), get(data, Column.POS3, true), get(data, Column.POS4, true), get(data, Column.POS5, true), get(data, Column.POS6, true)); posStrId = posTable.getId(pos); } - if (idColumnExists && (!strColumnExists || !get(data, Column.POS_ID, false).isEmpty())) { - // if both id/parts exist, allow empty (-1) + + short posId = -1; + if (!get(data, Column.POS_ID, false).isEmpty()) { posId = getShort(data, Column.POS_ID); if (posId >= posTable.size()) { - throw new InputFileException(parser.getName(), parser.getRowCount(), "POS", + throw new InputFileException(parser.getName(), parser.getRowCount(), "POS_ID", new IllegalArgumentException( String.format("POS for id %d is not present in the table.", posId))); } } - if (idColumnExists && strColumnExists) { - if (posId < 0 && posStrId < 0) { - throw new InputFileException(parser.getName(), parser.getRowCount(), "POS", - new IllegalArgumentException("Both PosId and Pos1-6 are empty.")); - } - if (posId >= 0 && posStrId >= 0 && posId != posStrId) { - throw new InputFileException(parser.getName(), parser.getRowCount(), "POS", - new IllegalArgumentException( - String.format("PosId (%d) and id from Pos1-6 (%d) does not match.", posId, posStrId))); - } + if (posId < 0 && posStrId < 0) { + throw new InputFileException(parser.getName(), parser.getRowCount(), "POS", + new IllegalArgumentException("Both 
PosId and Pos1-6 are missing.")); + } + if (posId >= 0 && posStrId >= 0 && posId != posStrId) { + throw new InputFileException(parser.getName(), parser.getRowCount(), "POS", new IllegalArgumentException( + String.format("PosId (%d) and id from Pos1-6 (%d) does not match.", posId, posStrId))); } return posId >= 0 ? posId : posStrId; diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/RawWordEntry.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/RawWordEntry.java index c329611a..7540c5aa 100644 --- a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/RawWordEntry.java +++ b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/RawWordEntry.java @@ -163,8 +163,7 @@ public static RawWordEntry makePhantom(RawWordEntry base, String surface) { entry.cost = Short.MAX_VALUE; entry.posId = base.posId; entry.reading = base.reading; - // phantom.normalized should be phantom itself - // entry.normalizedForm = base.normalizedForm; + // normalizedForm = null: refer to itself entry.dictionaryForm = base.dictionaryForm; entry.mode = base.mode; entry.aUnitSplit = base.aUnitSplit; diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/StringLayout.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/StringLayout.java index e289af0a..5edfe4e4 100644 --- a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/StringLayout.java +++ b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/StringLayout.java @@ -249,7 +249,7 @@ public boolean equals(Object obj) { if (this == obj) { return true; } - if (obj == null || !(obj instanceof FreeSpace)) { + if (!(obj instanceof FreeSpace)) { return false; } FreeSpace other = (FreeSpace) obj; diff --git a/src/test/java/com/worksap/nlp/sudachi/dictionary/build/POSTableTest.kt b/src/test/java/com/worksap/nlp/sudachi/dictionary/build/POSTableTest.kt index 3cc2446d..01896d77 100644 --- a/src/test/java/com/worksap/nlp/sudachi/dictionary/build/POSTableTest.kt +++ 
b/src/test/java/com/worksap/nlp/sudachi/dictionary/build/POSTableTest.kt @@ -112,7 +112,7 @@ class POSTableTest { @Test fun inhibitNewPos() { val posTable = POSTable() - posTable.allowNewPos = false + posTable.setAllowNewPos(false) val newPos = POS("a", "a", "a", "a", "a", "a") assertFails { posTable.getId(newPos) } From b63e087e7ee9bfbffd60ccb3d7ae4807319eafc1 Mon Sep 17 00:00:00 2001 From: mh-northlander Date: Wed, 11 Sep 2024 10:01:23 +0900 Subject: [PATCH 84/94] add --output option to dict-printer --- .../nlp/sudachi/SudachiCommandLine.java | 6 ++--- .../sudachi/dictionary/DictionaryPrinter.java | 25 ++++++++++++++++--- 2 files changed, 25 insertions(+), 6 deletions(-) diff --git a/src/main/java/com/worksap/nlp/sudachi/SudachiCommandLine.java b/src/main/java/com/worksap/nlp/sudachi/SudachiCommandLine.java index 2912ad5f..3cdacb5d 100644 --- a/src/main/java/com/worksap/nlp/sudachi/SudachiCommandLine.java +++ b/src/main/java/com/worksap/nlp/sudachi/SudachiCommandLine.java @@ -43,16 +43,16 @@ public class SudachiCommandLine { static Logger logger = Logger.getLogger(SudachiCommandLine.class.getName()); - static class FileOrStdoutPrintStream extends PrintStream { + public static class FileOrStdoutPrintStream extends PrintStream { private boolean isFile; - FileOrStdoutPrintStream() { + public FileOrStdoutPrintStream() { super(System.out, true); isFile = false; } - FileOrStdoutPrintStream(String fileName) throws FileNotFoundException, UnsupportedEncodingException { + public FileOrStdoutPrintStream(String fileName) throws FileNotFoundException, UnsupportedEncodingException { super(new FileOutputStream(fileName), false, "UTF-8"); isFile = true; } diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/DictionaryPrinter.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/DictionaryPrinter.java index 99f3ceab..2864401a 100644 --- a/src/main/java/com/worksap/nlp/sudachi/dictionary/DictionaryPrinter.java +++ 
b/src/main/java/com/worksap/nlp/sudachi/dictionary/DictionaryPrinter.java @@ -16,6 +16,7 @@ package com.worksap.nlp.sudachi.dictionary; +import com.worksap.nlp.sudachi.SudachiCommandLine.FileOrStdoutPrintStream; import com.worksap.nlp.sudachi.WordId; import com.worksap.nlp.sudachi.dictionary.build.Progress; import com.worksap.nlp.sudachi.dictionary.build.RawLexiconReader; @@ -44,12 +45,24 @@ public class DictionaryPrinter { private POSMode posMode = POSMode.DEFAULT; private WordRefMode wordRefMode = WordRefMode.DEFAULT; + /** + * POS print mode + * + * PARTS: print 6 pos parts (column: POS1-6). ID: print pos-id (column: POS_ID). + * BOTH: print both parts and id. + */ public enum POSMode { PARTS, ID, BOTH; public static final POSMode DEFAULT = PARTS; } + /** + * WordRef print mode + * + * TRIPLE_PARTS: print as (surface, pos1, .., pos6, reading) tuple. TRIPLE_ID: + * print as (surface, pos-id, reading) tuple. + */ public enum WordRefMode { TRIPLE_PARTS, TRIPLE_ID; @@ -100,7 +113,8 @@ void setProgress(Progress progress) { static void printUsage() { Console console = System.console(); - console.printf("usage: PrintDictionary [-s file] [--posMode mode] [--wordRefMode mode] file\n"); + console.printf("usage: PrintDictionary [-o file] [-s file] [--posMode mode] [--wordRefMode mode] file\n"); + console.printf("\t-o file\toutput file.\n"); console.printf("\t-s file\tsystem dictionary. 
required to print user dictionary.\n"); console.printf("\t--posMode [PARTS, ID, BOTH]\tprint specified POS column (default PARTS).\n"); console.printf( @@ -341,6 +355,7 @@ static String splitToString(int[] split) { * if IO */ public static void main(String[] args) throws IOException { + String outputPath = null; String systemDictPath = null; POSMode posMode = POSMode.PARTS; WordRefMode wordRefMode = WordRefMode.TRIPLE_PARTS; @@ -350,6 +365,8 @@ public static void main(String[] args) throws IOException { if (args[i].equals("-h")) { printUsage(); return; + } else if (args[i].equals("-o") && i + 1 < args.length) { + outputPath = args[++i]; } else if (args[i].equals("-s") && i + 1 < args.length) { systemDictPath = args[++i]; } else if (args[i].equals("--posMode") && i + 1 < args.length) { @@ -367,12 +384,14 @@ public static void main(String[] args) throws IOException { String dictPath = args[i]; BinaryDictionary systemDict = null; - try (BinaryDictionary dict = new BinaryDictionary(dictPath)) { + try (BinaryDictionary dict = new BinaryDictionary(dictPath); + PrintStream output = outputPath == null ? 
new FileOrStdoutPrintStream() + : new FileOrStdoutPrintStream(outputPath);) { if (systemDictPath != null) { systemDict = BinaryDictionary.loadSystem(systemDictPath); } - DictionaryPrinter printer = new DictionaryPrinter(System.out, dict, systemDict, posMode, wordRefMode); + DictionaryPrinter printer = new DictionaryPrinter(output, dict, systemDict, posMode, wordRefMode); printer.printDictionary(); } finally { if (systemDict != null) { From c9d2ff6d434fae89da9e1b948e79dbe3d5bf15ca Mon Sep 17 00:00:00 2001 From: mh-northlander Date: Wed, 11 Sep 2024 10:01:51 +0900 Subject: [PATCH 85/94] add lexicon migration guide and script --- docs/migrate_legacy_dictionary_v1.md | 107 ++++++++++++++++++++++ scripts/migrate_legacy_user_lexicon_v1.sh | 39 ++++++++ 2 files changed, 146 insertions(+) create mode 100644 docs/migrate_legacy_dictionary_v1.md create mode 100755 scripts/migrate_legacy_user_lexicon_v1.sh diff --git a/docs/migrate_legacy_dictionary_v1.md b/docs/migrate_legacy_dictionary_v1.md new file mode 100644 index 00000000..389dfe24 --- /dev/null +++ b/docs/migrate_legacy_dictionary_v1.md @@ -0,0 +1,107 @@ +# 旧形式のユーザー辞書からの移行方法 (V1) + +旧形式(レガシー形式)の Sudachi バイナリ辞書は現在の Sudachi では利用できません。 +辞書ソースである lexicon CSV ファイルについては旧形式のものも利用可能ですが、記述方法が変更されています。 + +本文書では旧形式の Sudachi 辞書から新形式 (V1) に移行するための手順を記述します。 + +## 辞書ソースファイル (lexicon CSV) + +辞書ソースである lexicon CSV ファイルは、標準の記述方法が変更されました。 +現在は旧形式のものも利用可能ですが、新形式への移行を推奨します。 + +形式の詳細については [user_dict.md](./user_dict.md) を参照してください。 +本文書では移行に必要な部分のみを扱います。 + +### 移行のための差分の概要 + +旧形式から新形式への移行にあたっては、以下の編集が必要となります。 + +#### ヘッダー + +lexicon の 1 行目をヘッダー行とし、記述する項目の種類と順序を指定します。 +カラム名については大文字小文字および "\_" の有無は無視して処理されます。 + +旧形式に対応するヘッダー行は以下になります。 + +```csv +SURFACE,LEFT_ID,RIGHT_ID,COST,WRITING,POS1,POS2,POS3,POS4,POS5,POS6,READING_FORM,NORMALIZED_FORM,DICTIONARY_FORM,MODE,SPLIT_A,SPLIT_B,WORD_STRUCTURE +``` + +#### 空項目 + +旧形式では項目の値が空の場合、"\*" を指定していましたが、新形式では空文字列とする必要があります。 +なお品詞項目の "\*" は空値とは異なるため空にはできません。 + +#### 語参照 + 
+辞書形や分割情報の項目では他の語への参照を記述する場合があります。
+新形式における語参照は、参照先の語の「見出し表記、品詞、読み」の組でのみ記述が可能です。 + +旧形式での行番号による参照は使用できません。 + +### 移行用スクリプト + +移行用スクリプト [`migrate_legacy_user_lexicon_v1.sh`](../scripts/migrate_legacy_user_lexicon_v1.sh) が利用できます。 + +実行には、参照するシステムバイナリ辞書を指定する必要があります。 +別途[配布ページ](http://sudachi.s3-website-ap-northeast-1.amazonaws.com/sudachidict/)から取得してください。 + +lexicon 内でシステム辞書内の語を参照している場合、以前と別のシステム辞書を用いると、参照先がずれる可能性があります。 +このスクリプトの使用においては、対象 lexicon ファイルの作成時に参照したバージョンのシステム辞書を指定してください。 +新形式へ変換した lexicon ファイルは任意のバージョンのシステム辞書と共に使用可能になります。 + +例: + +```bash +cd /path/to/sudachi +./scripts/migrate_legacy_user_lexicon_v1.sh old_lexicon.csv /path/to/system.dic > new_lexicon.csv +``` + +## バイナリ辞書 + +旧形式の Sudachi バイナリ辞書は現在の Sudachi では読み込むことができません。 +新形式のバイナリ辞書として再ビルドする必要があります。 + +### 1. lexicon CSV からの移行 + +対象のバイナリ辞書に対応する辞書ソースである lexicon CSV ファイルが存在する場合は、それを元に新形式のバイナリ辞書をビルドすることができます。 +上記の辞書ソースファイルの移行方法に従い、新形式の lexicon ファイルに変換したのち、新形式でのビルドを行ってください。 + +旧形式の lexicon ファイルからでもビルド可能ですが、非推奨です。 +旧形式の lexicon ファイル内でシステム辞書内の語を参照している場合、異なるシステム辞書を用いると、参照先がずれる可能性があります。 + +ユーザー辞書のビルドでは、参照するシステムバイナリ辞書を指定する必要があります。 +別途[配布ページ](http://sudachi.s3-website-ap-northeast-1.amazonaws.com/sudachidict/)から取得してください。 + +例: `sudachi/system.dic` を参照し、`user_lexicon.csv` からバイナリユーザー辞書 `new_user.dic` をビルドする + +```bash +unzip -d "./sudachi" "./build/distributions/sudachi-executable-1.0.0.zip" +java -Dfile.encoding=UTF-8 \ + -cp ./sudachi/sudachi-1.0.0.jar \ + com.worksap.nlp.sudachi.dictionary.UserDictionaryBuilder \ + -s ./sudachi/system.dic -o new_user.dic user_lexicon.csv +``` + +### 2. 
バイナリ辞書からの移行 + +バイナリ辞書のみが存在する場合、現在の Sudachi では移行ができません。 +過去バージョンの Sudachi にて、DictionaryPrinter を用いて辞書ソースファイルへの変換を行ってください。 +これは旧形式での出力となるため、さらに上記の辞書ソースファイルの移行も必要となります。 + +ユーザー辞書のプリントでは、参照するシステムバイナリ辞書を指定する必要があります。 +別途[配布ページ](http://sudachi.s3-website-ap-northeast-1.amazonaws.com/sudachidict/)から取得してください。 + +ユーザー辞書内でシステム辞書内の語を参照している場合、ビルド時と異なるシステム辞書を用いると、参照先がずれる可能性があります。 +対象バイナリ辞書のビルドの際に参照したシステム辞書を指定するようにしてください。 + +例: `sudachi/system.dic` を参照し、バイナリユーザー辞書 `user.dic` の語を `user_lexicon.csv` に出力する + +```bash +unzip -d "./sudachi" "./build/distributions/sudachi-executable-0.8.0.zip" +java -Dfile.encoding=UTF-8 \ + -cp ./sudachi/sudachi-0.8.0.jar \ + com.worksap.nlp.sudachi.dictionary.DictionaryPrinter \ + -s ./sudachi/system.dic -o user_lexicon.csv user.dic +``` diff --git a/scripts/migrate_legacy_user_lexicon_v1.sh b/scripts/migrate_legacy_user_lexicon_v1.sh new file mode 100755 index 00000000..708f9a19 --- /dev/null +++ b/scripts/migrate_legacy_user_lexicon_v1.sh @@ -0,0 +1,39 @@ +#!/bin/bash - +# convert legacy user lexicon csv file into V1 format +set -eux + +# Constants +DIR=$(dirname "$(readlink -f "$0")") +SUDACHI_VERSION=$(${DIR}/../gradlew properties --console=plain -q | grep "^version:" | awk '{printf $2}') + +# args +LEXICON_FILE=${1} +SYSTEM_DICT=${2} + +# Build Sudachi +${DIR}/../gradlew build -q + +BUILD_DIR="$DIR/../build/distributions" +JAR_DIR="$BUILD_DIR/sudachi" +if [ -e "$JAR_DIR" ]; then + rm -r "$JAR_DIR" +fi +unzip -q -d "$JAR_DIR" "$BUILD_DIR/sudachi-executable-$SUDACHI_VERSION.zip" + +# Build and Print +DATA_DIR=$JAR_DIR/dictdata +mkdir -p "$DATA_DIR" + +USER_DICT="${DATA_DIR}/migrating.dic" + +java -Dfile.encoding=UTF-8 \ + -cp "$JAR_DIR/sudachi-${SUDACHI_VERSION}.jar" \ + com.worksap.nlp.sudachi.dictionary.UserDictionaryBuilder \ + -o "$USER_DICT" -s "$SYSTEM_DICT" "$LEXICON_FILE" + +java -Dfile.encoding=UTF-8 \ + -cp "$JAR_DIR/sudachi-${SUDACHI_VERSION}.jar" \ + com.worksap.nlp.sudachi.dictionary.DictionaryPrinter \ + --posMode PARTS \ +
--wordRefMode TRIPLE_PARTS \ + -s "$SYSTEM_DICT" "$USER_DICT" \ From 44570bdc2e072b429e868ca043adcc536eff96a7 Mon Sep 17 00:00:00 2001 From: mh-northlander Date: Wed, 11 Sep 2024 11:41:44 +0900 Subject: [PATCH 86/94] disallow lineno wordref in new dictionary format --- .../sudachi/dictionary/build/RawLexicon.java | 2 +- .../dictionary/build/RawLexiconReader.java | 6 +- .../dictionary/DictionaryBuilderTest.kt | 16 +++-- .../dictionary/build/RawLexiconReaderTest.kt | 71 ++++++++++--------- .../sudachi/dictionary/build/headers-all.csv | 6 +- .../dictionary/build/headers-minimum.csv | 2 +- .../nlp/sudachi/dictionary/build/wordref.csv | 4 +- src/test/resources/dict/lex.csv | 10 +-- 8 files changed, 63 insertions(+), 54 deletions(-) diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/RawLexicon.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/RawLexicon.java index 59dda59c..11fb1c5a 100644 --- a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/RawLexicon.java +++ b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/RawLexicon.java @@ -105,7 +105,7 @@ public void read(String name, InputStream data, POSTable posTable, short numLeft public void read(String name, Reader data, POSTable posTable, short numLeft, short numRight) throws IOException { CSVParser parser = new CSVParser(data); parser.setName(name); - RawLexiconReader reader = new RawLexiconReader(parser, posTable, isUser); + RawLexiconReader reader = new RawLexiconReader(parser, posTable); RawWordEntry entry; while ((entry = reader.nextEntry()) != null) { diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/RawLexiconReader.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/RawLexiconReader.java index 4212eb5a..9214c069 100644 --- a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/RawLexiconReader.java +++ b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/RawLexiconReader.java @@ -73,7 +73,7 @@ private static Column fromString(String str) { 
private final WordRef.Parser dictRefParser; // for dictionary form private final WordRef.Parser splitParser; // for splits - public RawLexiconReader(CSVParser parser, POSTable pos, boolean user) throws IOException { + public RawLexiconReader(CSVParser parser, POSTable pos) throws IOException { this.parser = parser; this.posTable = pos; resolveColumnLayout(); @@ -83,8 +83,8 @@ public RawLexiconReader(CSVParser parser, POSTable pos, boolean user) throws IOE splitParser = WordRef.parser(pos, true, false, false); } else { normRefParser = WordRef.parser(pos, false, true, false); - dictRefParser = WordRef.parser(pos, !user, false, false); - splitParser = WordRef.parser(pos, !user, false, false); + dictRefParser = WordRef.parser(pos, false, false, false); + splitParser = WordRef.parser(pos, false, false, false); } } diff --git a/src/test/java/com/worksap/nlp/sudachi/dictionary/DictionaryBuilderTest.kt b/src/test/java/com/worksap/nlp/sudachi/dictionary/DictionaryBuilderTest.kt index 27621281..6325a6e5 100644 --- a/src/test/java/com/worksap/nlp/sudachi/dictionary/DictionaryBuilderTest.kt +++ b/src/test/java/com/worksap/nlp/sudachi/dictionary/DictionaryBuilderTest.kt @@ -44,10 +44,10 @@ class DictionaryBuilderTest { inputFile .toFile() .writeText( - """東京都,0,0,100,東京都,名詞,固有名詞,地名,一般,*,*,ヒガシキョウト,東京都,*,B,"東,名詞,普通名詞,一般,*,*,*,ヒガシ/2",*,1/2,1/2 -東,-1,-1,200,東,名詞,普通名詞,一般,*,*,*,ヒガシ,ひがし,*,A,*,*,*,* -京都,0,0,300,京都,名詞,固有名詞,地名,一般,*,*,キョウト,京都,*,A,*,*,*,*""") - val wordIds = listOf(4, 11, 15, 19) // 3 + phantom entry (ひがし) + """東京都,0,0,100,東京都,名詞,固有名詞,地名,一般,*,*,ヒガシキョウト,東京都,,B,"東,名詞,普通名詞,一般,*,*,*,ヒガシ/京都,名詞,固有名詞,地名,一般,*,*,キョウト",,"東,名詞,普通名詞,一般,*,*,*,ヒガシ/京都,名詞,固有名詞,地名,一般,*,*,キョウト",1/2 +東,-1,-1,200,東,名詞,普通名詞,一般,*,*,*,ヒガシ,ひがし,,A,,,, +京都,0,0,300,京都,名詞,固有名詞,地名,一般,*,*,キョウト,京都,,A,,,,""") + val wordIds = listOf(4, 10, 14, 18) // 3 + phantom entry (ひがし) DictionaryBuilder.main( arrayOf( @@ -124,15 +124,17 @@ class DictionaryBuilderTest { val inputFile = tempDir.resolve("lex.csv") 
matrixFile.toFile().writeText("1 1\n0 0 200\n") - posFile.toFile().writeText("名詞,普通名詞,一般,*,*,*\n名詞,固有名詞,地名,一般,*,*\n") + posFile + .toFile() + .writeText("pos1,pos2,pos3,pos4,pos5,pos6\n名詞,普通名詞,一般,*,*,*\n名詞,固有名詞,地名,一般,*,*\n") inputFile .toFile() .writeText( """Surface,leftId,rightId,cost,writing,posId,readingform,normalizedform,dictionaryform,mode,splitA,splitB,wordstructure,synonymgroups -東京都,0,0,100,東京都,1,ヒガシキョウト,東京都,,B,"東,名詞,普通名詞,一般,*,*,*,ヒガシ/2",,1/2,1/2 +東京都,0,0,100,東京都,1,ヒガシキョウト,東京都,,B,"東,名詞,普通名詞,一般,*,*,*,ヒガシ/京都,1,キョウト",,"東,名詞,普通名詞,一般,*,*,*,ヒガシ/京都,1,キョウト",1/2 東,-1,-1,200,東,0,ヒガシ,ひがし,,A,,,, 京都,0,0,300,京都,1,キョウト,京都,,A,,,,""") - val wordIds = listOf(4, 11, 15, 19) // 3 + phantom entry (ひがし) + val wordIds = listOf(4, 10, 14, 18) // 3 + phantom entry (ひがし) DictionaryBuilder.main( arrayOf( diff --git a/src/test/java/com/worksap/nlp/sudachi/dictionary/build/RawLexiconReaderTest.kt b/src/test/java/com/worksap/nlp/sudachi/dictionary/build/RawLexiconReaderTest.kt index 1d792d6b..071dfe60 100644 --- a/src/test/java/com/worksap/nlp/sudachi/dictionary/build/RawLexiconReaderTest.kt +++ b/src/test/java/com/worksap/nlp/sudachi/dictionary/build/RawLexiconReaderTest.kt @@ -43,7 +43,7 @@ class RawLexiconReaderTest { @Test fun legacyCsvWithMinimumFields() { - val reader = RawLexiconReader(csvfile("legacy-minimum.csv"), POSTable(), false) + val reader = RawLexiconReader(csvfile("legacy-minimum.csv"), POSTable()) assertNotNull(reader.nextEntry()).let { e -> assertEquals("東京都", e.headword) assertEquals("トウキョウト", e.reading) @@ -57,7 +57,7 @@ class RawLexiconReaderTest { @Test fun legacyCsvWithAllFields() { - val reader = RawLexiconReader(csvfile("legacy-full.csv"), POSTable(), false) + val reader = RawLexiconReader(csvfile("legacy-full.csv"), POSTable()) assertNotNull(reader.nextEntry()).let { e -> assertEquals("東京都", e.headword) assertEquals("トウキョウト", e.reading) @@ -71,13 +71,16 @@ class RawLexiconReaderTest { @Test fun headerCsvMinimumFields() { - val reader = 
RawLexiconReader(csvfile("headers-minimum.csv"), POSTable(), false) + val reader = RawLexiconReader(csvfile("headers-minimum.csv"), POSTable()) assertNotNull(reader.nextEntry()).let { e -> assertEquals("東京都", e.headword) assertEquals("トウキョウト", e.reading) - assertEquals(listOf(WordRef.LineNo(5, false), WordRef.LineNo(9, false)), e.aUnitSplit) - assertEquals(listOf(WordRef.LineNo(5, false), WordRef.LineNo(10, false)), e.bUnitSplit) - assertEquals(listOf(WordRef.LineNo(5, false), WordRef.LineNo(11, false)), e.wordStructure) + assertEquals( + listOf(WordRef.Triple("東京", 0, "トウキョウ"), WordRef.Triple("都", 1, "ト")), e.aUnitSplit) + assertEquals( + listOf(WordRef.Triple("東京", 0, "トウキョウ"), WordRef.Triple("都", 2, "ト")), e.bUnitSplit) + assertEquals( + listOf(WordRef.Triple("東京", 0, "トウキョウ"), WordRef.Triple("都", 3, "ト")), e.wordStructure) } assertNotNull(reader.nextEntry()) assertNull(reader.nextEntry()) @@ -85,14 +88,18 @@ class RawLexiconReaderTest { @Test fun headerCsvAllFields() { - val reader = RawLexiconReader(csvfile("headers-all.csv"), POSTable(), false) + val reader = RawLexiconReader(csvfile("headers-all.csv"), POSTable()) assertNotNull(reader.nextEntry()).let { e -> assertEquals("東京都", e.headword) assertEquals("トウキョウト", e.reading) - assertEquals(listOf(WordRef.LineNo(5, false), WordRef.LineNo(9, false)), e.aUnitSplit) - assertEquals(listOf(WordRef.LineNo(5, false), WordRef.LineNo(10, false)), e.bUnitSplit) - assertEquals(listOf(WordRef.LineNo(5, false), WordRef.LineNo(11, false)), e.cUnitSplit) - assertEquals(listOf(WordRef.LineNo(6, false), WordRef.LineNo(7, false)), e.wordStructure) + assertEquals( + listOf(WordRef.Triple("東京", 0, "トウキョウ"), WordRef.Triple("都", 1, "ト")), e.aUnitSplit) + assertEquals( + listOf(WordRef.Triple("東京", 0, "トウキョウ"), WordRef.Triple("都", 2, "ト")), e.bUnitSplit) + assertEquals( + listOf(WordRef.Triple("東京", 0, "トウキョウ"), WordRef.Triple("都", 3, "ト")), e.cUnitSplit) + assertEquals( + listOf(WordRef.Triple("東京", 0, "トウキョウ"), WordRef.Triple("都", 
4, "ト")), e.wordStructure) assertEquals(Ints.wrap(intArrayOf(8, 9)), e.synonymGroups) assertEquals("10", e.userData) } @@ -115,7 +122,7 @@ class RawLexiconReaderTest { skipVals.removeAt(i) val text = skipCols.joinToString(",") + "\n" + skipVals.joinToString(",") - assertFails { RawLexiconReader(csvtext(text), POSTable(), false) } + assertFails { RawLexiconReader(csvtext(text), POSTable()) } } } @@ -127,7 +134,7 @@ class RawLexiconReaderTest { val posTable = POSTable() posTable.getId(POS("a", "a", "a", "a", "a", "0")) - val reader = RawLexiconReader(csvtext(text), posTable, false) + val reader = RawLexiconReader(csvtext(text), posTable) assertNotNull(reader.nextEntry()).let { e -> assertEquals(0, e.posId) } assertNull(reader.nextEntry()) } @@ -141,7 +148,7 @@ class RawLexiconReaderTest { posTable.getId(POS("a", "a", "a", "a", "a", "0")) assertFails { - val reader = RawLexiconReader(csvtext(text), posTable, false) + val reader = RawLexiconReader(csvtext(text), posTable) reader.nextEntry() } } @@ -153,7 +160,7 @@ class RawLexiconReaderTest { 東京都,6,8,5320,名詞,固有名詞,地名,一般,*,*,0,トウキョウト,,,,,""" val posTable = POSTable() - val reader = RawLexiconReader(csvtext(text), posTable, false) + val reader = RawLexiconReader(csvtext(text), posTable) assertNotNull(reader.nextEntry()).let { e -> assertEquals(0, e.posId) } assertNull(reader.nextEntry()) } @@ -166,7 +173,7 @@ class RawLexiconReaderTest { val posTable = POSTable() posTable.getId(POS("a", "a", "a", "a", "a", "0")) - val reader = RawLexiconReader(csvtext(text), posTable, false) + val reader = RawLexiconReader(csvtext(text), posTable) assertNotNull(reader.nextEntry()).let { e -> assertEquals(0, e.posId) } assertNull(reader.nextEntry()) } @@ -178,7 +185,7 @@ class RawLexiconReaderTest { 東京都,6,8,5320,0,名詞,固有名詞,地名,一般,*,*,トウキョウト,,,,,""" val posTable = POSTable() - val reader = RawLexiconReader(csvtext(text), posTable, false) + val reader = RawLexiconReader(csvtext(text), posTable) assertNotNull(reader.nextEntry()).let { e -> 
assertEquals(0, e.posId) } assertNull(reader.nextEntry()) } @@ -192,7 +199,7 @@ class RawLexiconReaderTest { posTable.getId(POS("a", "a", "a", "a", "a", "0")) assertFails { - val reader = RawLexiconReader(csvtext(text), posTable, false) + val reader = RawLexiconReader(csvtext(text), posTable) reader.nextEntry() } } @@ -206,7 +213,7 @@ class RawLexiconReaderTest { posTable.getId(POS("a", "a", "a", "a", "a", "0")) assertFails { - val reader = RawLexiconReader(csvtext(text), posTable, false) + val reader = RawLexiconReader(csvtext(text), posTable) reader.nextEntry() } } @@ -220,7 +227,7 @@ class RawLexiconReaderTest { posTable.getId(POS("a", "a", "a", "a", "a", "0")) assertFails { - val reader = RawLexiconReader(csvtext(text), posTable, false) + val reader = RawLexiconReader(csvtext(text), posTable) reader.nextEntry() } } @@ -232,21 +239,21 @@ class RawLexiconReaderTest { val text = """Surface,LeftId,RightId,Cost,pos1,pos2,pos3,pos4,pos5,pos6,reading_form,normalized_form,DictionaryForm,splita,splitb,splitC,wordstructure ${oversizeWord},6,8,5320,名詞,固有名詞,地名,一般,*,*,トウキョウト,,,1,,,""" - val reader = RawLexiconReader(csvtext(text), POSTable(), false) + val reader = RawLexiconReader(csvtext(text), POSTable()) assertFails { reader.nextEntry() } } run { val text = """Surface,LeftId,RightId,Cost,pos1,pos2,pos3,pos4,pos5,pos6,reading_form,normalized_form,DictionaryForm,splita,splitb,splitC,wordstructure 東京都,6,8,5320,名詞,固有名詞,地名,一般,*,*,${oversizeWord},,,1,,,""" - val reader = RawLexiconReader(csvtext(text), POSTable(), false) + val reader = RawLexiconReader(csvtext(text), POSTable()) assertFails { reader.nextEntry() } } run { val text = """Surface,LeftId,RightId,Cost,pos1,pos2,pos3,pos4,pos5,pos6,reading_form,normalized_form,DictionaryForm,splita,splitb,splitC,wordstructure 東京都,6,8,5320,名詞,固有名詞,地名,一般,*,*,トウキョウト,${oversizeWord},,1,,,""" - val reader = RawLexiconReader(csvtext(text), POSTable(), false) + val reader = RawLexiconReader(csvtext(text), POSTable()) assertFails { 
reader.nextEntry() } } } @@ -256,7 +263,7 @@ ${oversizeWord},6,8,5320,名詞,固有名詞,地名,一般,*,*,トウキョウ val text = """Surface,LeftId,RightId,Cost,pos1,pos2,pos3,pos4,pos5,pos6,reading_form,normalized_form,DictionaryForm,splita,splitb,wordstructure ,6,8,5320,名詞,固有名詞,地名,一般,*,*,トウキョウト,,,,,""" - val reader = RawLexiconReader(csvtext(text), POSTable(), false) + val reader = RawLexiconReader(csvtext(text), POSTable()) assertFails { reader.nextEntry() } } @@ -267,28 +274,28 @@ ${oversizeWord},6,8,5320,名詞,固有名詞,地名,一般,*,*,トウキョウ val text = """Surface,LeftId,RightId,Cost,pos1,pos2,pos3,pos4,pos5,pos6,reading_form,normalized_form,DictionaryForm,splita,splitb,splitC,wordstructure 東京都,6,8,5320,名詞,固有名詞,地名,一般,*,*,トウキョウト,,,1,,,""" - val reader = RawLexiconReader(csvtext(text), POSTable(), false) + val reader = RawLexiconReader(csvtext(text), POSTable()) assertFails { reader.nextEntry() } } run { val text = """Surface,LeftId,RightId,Cost,pos1,pos2,pos3,pos4,pos5,pos6,reading_form,normalized_form,DictionaryForm,splita,splitb,splitC,wordstructure 東京都,6,8,5320,名詞,固有名詞,地名,一般,*,*,トウキョウト,,,,1,,""" - val reader = RawLexiconReader(csvtext(text), POSTable(), false) + val reader = RawLexiconReader(csvtext(text), POSTable()) assertFails { reader.nextEntry() } } run { val text = """Surface,LeftId,RightId,Cost,pos1,pos2,pos3,pos4,pos5,pos6,reading_form,normalized_form,DictionaryForm,splita,splitb,splitC,wordstructure 東京都,6,8,5320,名詞,固有名詞,地名,一般,*,*,トウキョウト,,,,,1,""" - val reader = RawLexiconReader(csvtext(text), POSTable(), false) + val reader = RawLexiconReader(csvtext(text), POSTable()) assertFails { reader.nextEntry() } } run { val text = """Surface,LeftId,RightId,Cost,pos1,pos2,pos3,pos4,pos5,pos6,reading_form,normalized_form,DictionaryForm,splita,splitb,splitC,wordstructure 東京都,6,8,5320,名詞,固有名詞,地名,一般,*,*,トウキョウト,,,,,,1""" - val reader = RawLexiconReader(csvtext(text), POSTable(), false) + val reader = RawLexiconReader(csvtext(text), POSTable()) assertFails { reader.nextEntry() } } } @@ -302,28 +309,28 @@ 
${oversizeWord},6,8,5320,名詞,固有名詞,地名,一般,*,*,トウキョウ var text = """Surface,LeftId,RightId,Cost,pos1,pos2,pos3,pos4,pos5,pos6,reading_form,normalized_form,DictionaryForm,splita,splitb,splitC,wordstructure 東京都,6,8,5320,名詞,固有名詞,地名,一般,*,*,トウキョウト,,,${oversizeSplit},,,""" - var reader = RawLexiconReader(csvtext(text), POSTable(), false) + var reader = RawLexiconReader(csvtext(text), POSTable()) assertFails { reader.nextEntry() } } run { val text = """Surface,LeftId,RightId,Cost,pos1,pos2,pos3,pos4,pos5,pos6,reading_form,normalized_form,DictionaryForm,splita,splitb,splitC,wordstructure 東京都,6,8,5320,名詞,固有名詞,地名,一般,*,*,トウキョウト,,,,${oversizeSplit},,""" - val reader = RawLexiconReader(csvtext(text), POSTable(), false) + val reader = RawLexiconReader(csvtext(text), POSTable()) assertFails { reader.nextEntry() } } run { val text = """Surface,LeftId,RightId,Cost,pos1,pos2,pos3,pos4,pos5,pos6,reading_form,normalized_form,DictionaryForm,splita,splitb,splitC,wordstructure 東京都,6,8,5320,名詞,固有名詞,地名,一般,*,*,トウキョウト,,,,,${oversizeSplit},""" - val reader = RawLexiconReader(csvtext(text), POSTable(), false) + val reader = RawLexiconReader(csvtext(text), POSTable()) assertFails { reader.nextEntry() } } run { val text = """Surface,LeftId,RightId,Cost,pos1,pos2,pos3,pos4,pos5,pos6,reading_form,normalized_form,DictionaryForm,splita,splitb,splitC,wordstructure 東京都,6,8,5320,名詞,固有名詞,地名,一般,*,*,トウキョウト,,,,,,${oversizeSplit}""" - val reader = RawLexiconReader(csvtext(text), POSTable(), false) + val reader = RawLexiconReader(csvtext(text), POSTable()) assertFails { reader.nextEntry() } } } diff --git a/src/test/resources/com/worksap/nlp/sudachi/dictionary/build/headers-all.csv b/src/test/resources/com/worksap/nlp/sudachi/dictionary/build/headers-all.csv index 9f827c21..3524de33 100644 --- a/src/test/resources/com/worksap/nlp/sudachi/dictionary/build/headers-all.csv +++ b/src/test/resources/com/worksap/nlp/sudachi/dictionary/build/headers-all.csv @@ -1,3 +1,3 @@ 
-Surface,LeftId,RightId,Cost,pos1,pos2,pos3,pos4,pos5,pos6,reading_form,normalized_form,DictionaryForm,mode,splita,splitb,splitc,wordstructure,synonymgroups,userdata,pos_id -東京都,6,8,5320,名詞,固有名詞,地名,一般,*,*,トウキョウト,,,B,5/9,5/10,5/11,6/7,8/9,10, -行く,4,4,5105,動詞,非自立可能,*,*,五段-カ行,終止形-一般,イク,,,A,,,,,,, +Surface,LeftId,RightId,Cost,pos_id,pos1,pos2,pos3,pos4,pos5,pos6,reading_form,normalized_form,DictionaryForm,mode,splita,splitb,splitc,wordstructure,synonymgroups,userdata +東京都,6,8,5320,,名詞,固有名詞,地名,一般,*,*,トウキョウト,,,B,"東京,0,トウキョウ/都,1,ト","東京,0,トウキョウ/都,2,ト","東京,0,トウキョウ/都,3,ト","東京,0,トウキョウ/都,4,ト",8/9,10 +行く,4,4,5105,,動詞,非自立可能,*,*,五段-カ行,終止形-一般,イク,,,A,,,,,, diff --git a/src/test/resources/com/worksap/nlp/sudachi/dictionary/build/headers-minimum.csv b/src/test/resources/com/worksap/nlp/sudachi/dictionary/build/headers-minimum.csv index f2fc1b17..95fca9d9 100644 --- a/src/test/resources/com/worksap/nlp/sudachi/dictionary/build/headers-minimum.csv +++ b/src/test/resources/com/worksap/nlp/sudachi/dictionary/build/headers-minimum.csv @@ -1,3 +1,3 @@ Surface,LeftId,RightId,Cost,pos1,pos2,pos3,pos4,pos5,pos6,reading_form,normalized_form,DictionaryForm,splita,splitb,wordStructure -東京都,6,8,5320,名詞,固有名詞,地名,一般,*,*,トウキョウト,,,5/9,5/10,5/11 +東京都,6,8,5320,名詞,固有名詞,地名,一般,*,*,トウキョウト,,,"東京,0,トウキョウ/都,1,ト","東京,0,トウキョウ/都,2,ト","東京,0,トウキョウ/都,3,ト" 行く,4,4,5105,動詞,非自立可能,*,*,五段-カ行,終止形-一般,イク,,,,, diff --git a/src/test/resources/com/worksap/nlp/sudachi/dictionary/build/wordref.csv b/src/test/resources/com/worksap/nlp/sudachi/dictionary/build/wordref.csv index 5c4e81d2..edc97876 100644 --- a/src/test/resources/com/worksap/nlp/sudachi/dictionary/build/wordref.csv +++ b/src/test/resources/com/worksap/nlp/sudachi/dictionary/build/wordref.csv @@ -2,8 +2,8 @@ Surface,LeftId,RightId,Cost,pos_id,POS1,POS2,POS3,POS4,POS5,POS6,Reading_Form,No 京都,0,0,5293,0,名詞,固有名詞,地名,一般,*,*,キョウト,,,A,,,,, 東,1,1,4675,1,名詞,普通名詞,一般,*,*,*,ヒガシ,,,A,,,,, 東京,0,0,2816,0,名詞,固有名詞,地名,一般,*,*,トウキョウ,,,A,,,,, 
-トウキョウ,0,0,2816,0,名詞,固有名詞,地名,一般,*,*,トウキョウ,2,2,A,,,,, +トウキョウ,0,0,2816,0,名詞,固有名詞,地名,一般,*,*,トウキョウ,"東京,0,トウキョウ","東京,0,トウキョウ",A,,,,, トウキョウ,1,1,3000,2,名詞,固有名詞,一般,*,*,*,トウキョウ,,,A,,,,, 都,2,2,2914,1,名詞,普通名詞,一般,*,*,*,ト,都,,A,,,,, -東京都,0,2,5320,0,名詞,固有名詞,地名,一般,*,*,トウキョウト,,,B,2/5,,,, +東京都,0,2,5320,0,名詞,固有名詞,地名,一般,*,*,トウキョウト,,,B,"東京,0,トウキョウ/都,1,ト",,,, 東トウキョウ,0,1,5320,2,名詞,固有名詞,一般,*,*,*,ヒガシトウキョウ,,,C,,,"東,名詞,普通名詞,一般,*,*,*,ヒガシ/トウキョウ,2,トウキョウ",, diff --git a/src/test/resources/dict/lex.csv b/src/test/resources/dict/lex.csv index c1abba0d..e23d2e7e 100644 --- a/src/test/resources/dict/lex.csv +++ b/src/test/resources/dict/lex.csv @@ -5,9 +5,9 @@ Surface,LeftId,RightId,Cost,POS1,POS2,POS3,POS4,POS5,POS6,Reading_Form,Normalize 京都,6,6,5293,名詞,固有名詞,地名,一般,*,*,キョウト,京都,,,,,,1/5 東,7,7,4675,名詞,普通名詞,一般,*,*,*,ヒガシ,東,,,,,, 東京,6,6,2816,名詞,固有名詞,地名,一般,*,*,トウキョウ,東京,,,,,, -東京都,6,8,5320,名詞,固有名詞,地名,一般,*,*,トウキョウト,東京都,,5/9,,,5/9, +東京都,6,8,5320,名詞,固有名詞,地名,一般,*,*,トウキョウト,東京都,,"東京,名詞,固有名詞,地名,一般,*,*,トウキョウ/都,名詞,普通名詞,一般,*,*,*,ト",,,"東京,名詞,固有名詞,地名,一般,*,*,トウキョウ/都,名詞,普通名詞,一般,*,*,*,ト", 行く,4,4,5105,動詞,非自立可能,*,*,五段-カ行,終止形-一般,イク,行く,,,,,, -行っ,5,5,5122,動詞,非自立可能,*,*,五段-カ行,連用形-促音便,イッ,行く,7,,,,, +行っ,5,5,5122,動詞,非自立可能,*,*,五段-カ行,連用形-促音便,イッ,行く,"行く,動詞,非自立可能,*,*,五段-カ行,終止形-一般,イク",,,,, 都,8,8,2914,名詞,普通名詞,一般,*,*,*,ト,都,,,,,, アイ,7,7,4675,名詞,普通名詞,一般,*,*,*,アイ,アイ,,,,,, アイウ,7,7,4675,名詞,普通名詞,一般,*,*,*,アイウ,アイウ,,,,,, @@ -34,9 +34,9 @@ Surface,LeftId,RightId,Cost,POS1,POS2,POS3,POS4,POS5,POS6,Reading_Form,Normalize 九,9,9,2478,名詞,数詞,*,*,*,*,キュウ,九,,,,,, 六三四,6,6,0,名詞,固有名詞,地名,一般,*,*,ムサシ,六三四,,,,,, いく,4,4,5105,動詞,非自立可能,*,*,五段-カ行,終止形-一般,イク,行く,,,,,, -いっ,5,5,5122,動詞,非自立可能,*,*,五段-カ行,連用形-促音便,イッ,行く,34,,,,, +いっ,5,5,5122,動詞,非自立可能,*,*,五段-カ行,連用形-促音便,イッ,"行く,動詞,非自立可能,*,*,五段-カ行,終止形-一般,イク","いく,動詞,非自立可能,*,*,五段-カ行,終止形-一般,イク",,,,, 
012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789,9,9,-9000,名詞,数詞,*,*,*,*,ゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウ,012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789,,,,,, 特a,8,8,2914,名詞,普通名詞,一般,*,*,*,トクエー,特a,,,,,, 隠し,-1,-1,0,名詞,普通名詞,一般,*,*,*,カクシ,隠し,,,,,, -な。な,8,8,2914,名詞,普通名詞,一般,*,*,*,ナナ,な。な,,11,11,,, -東東京都,6,8,6320,名詞,固有名詞,地名,一般,*,*,トウトウキョウト,東東京都,,,,4/4/3,, +な。な,8,8,2914,名詞,普通名詞,一般,*,*,*,ナナ,な。な,,"アイウ,名詞,普通名詞,一般,*,*,*,アイウ","アイウ,名詞,普通名詞,一般,*,*,*,アイウ",,, +東東京都,6,8,6320,名詞,固有名詞,地名,一般,*,*,ヒガシヒガシキョウト,東東京都,,,,"東,名詞,普通名詞,一般,*,*,*,ヒガシ/東,名詞,普通名詞,一般,*,*,*,ヒガシ/京都,名詞,固有名詞,地名,一般,*,*,キョウト",, From cf199af9a14eebac301c98090d2a1f3f143461bd Mon Sep 17 00:00:00 2001 From: mh-northlander Date: Thu, 17 Oct 2024 18:07:49 +0900 Subject: [PATCH 87/94] fix user_dict doc and migration guide --- docs/migrate_legacy_dictionary_v1.md | 37 +++---- docs/user_dict.md | 146 +++++++++++++++------------ 2 files changed, 102 insertions(+), 81 deletions(-) diff --git 
a/docs/migrate_legacy_dictionary_v1.md b/docs/migrate_legacy_dictionary_v1.md index 389dfe24..4c07dbf8 100644 --- a/docs/migrate_legacy_dictionary_v1.md +++ b/docs/migrate_legacy_dictionary_v1.md @@ -1,7 +1,7 @@ -# 旧形式のユーザー辞書からの移行方法 (V1) +# 旧形式のユーザー辞書からの移行手順 (V1) 旧形式(レガシー形式)の Sudachi バイナリ辞書は現在の Sudachi では利用できません。 -辞書ソースである lexicon CSV ファイルについては旧形式のものも利用可能ですが、記述方法が変更されています。 +辞書ソースである lexicon CSV ファイルについても、記述方法が変更されています(旧形式のものも利用可能です)。 本文書では旧型式の Sudachi 辞書から新形式 (V1) に移行するための手順を記述します。 @@ -19,8 +19,8 @@ #### ヘッダー -lexicon の 1 行目をヘッダー行とし、記述する項目の種類と順序を指定します。 -カラム名については大文字小文字および "\_" の有無は無視して処理されます。 +lexicon の 1 行目にヘッダー行を追加し、記述する項目の種類と順序を指定します。 +項目名については大文字小文字および "\_" の有無は無視して処理されます。 旧形式に対応するヘッダー行は以下になります。 @@ -30,8 +30,10 @@ SURFACE,LEFT_ID,RIGHT_ID,COST,WRITING,POS1,POS2,POS3,POS4,POS5,POS6,READING_FORM #### 空項目 -旧形式では項目の値が空の場合、"\*" を指定していましたが、新形式では空文字列とする必要があります。 -なお品詞項目の "\*" は空値とは異なるため空にはできません。 +旧形式では項目の値がない場合、"\*" を指定していましたが、新形式では空文字列とする必要があります。 +辞書形 ID、A/B/C 単位分割情報、第 17 項目(現 Word_Structure)について、"\*" を空文字列に置き換えます。 + +なお品詞項目の "\*" は空値とは異なるためそのままとしてください。 #### 語参照 @@ -44,18 +46,19 @@ SURFACE,LEFT_ID,RIGHT_ID,COST,WRITING,POS1,POS2,POS3,POS4,POS5,POS6,READING_FORM 移行用スクリプト [`migrate_legacy_user_lexicon_v1.sh`](../scripts/migrate_legacy_user_lexicon_v1.sh) が利用できます。 -実行には、参照するシステムバイナリ辞書を指定する必要があります。 +スクリプトの実行には、参照するシステムバイナリ辞書を指定する必要があります。 別途[配布ページ](http://sudachi.s3-website-ap-northeast-1.amazonaws.com/sudachidict/)から取得してください。 -lexicon 内でシステム辞書内の語を参照している場合、以前と別のシステム辞書を用いると、参照先がずれる可能性があります。 +旧形式のソースファイル内でシステム辞書内の語を参照している場合、以前と異なるシステム辞書を用いると、参照先がずれる可能性があります。 このスクリプトの使用においては、対象 lexicon ファイルの作成時に参照したバージョンのシステム辞書を指定してください。 + 新形式へ変換した lexicon ファイルは任意のバージョンのシステム辞書と共に使用可能になります。 -例: +例: `dict/system.dic` を参照し、`old_lexicon.csv` を変換したものを `new_lexicon.csv` に出力する ```bash cd /path/to/sudachi -./scripts/migrate_legacy_user_lexicon_v1.sh old_lexicon.csv /path/to/system.dic > new_lexicon.csv +./scripts/migrate_legacy_user_lexicon_v1.sh old_lexicon.csv ./dict/system.dic > new_lexicon.csv 
``` ## バイナリ辞書 @@ -66,27 +69,27 @@ cd /path/to/sudachi ### 1. lexicon CSV からの移行 対象のバイナリ辞書に対応する辞書ソースである lexicon CSV ファイルが存在する場合は、それを元に新形式のバイナリ辞書をビルドすることができます。 -上記の辞書ソースファイルの移行方法に従い、新形式の lexicon ファイルに変換したのち、新形式でのビルドを行ってください。 +上記の辞書ソースファイルの移行方法に従い、新形式のソースファイルに変換したのち、新形式でのビルドを行ってください。 旧形式の lexicon ファイルからでもビルド可能ですが、非推奨です。 -旧形式の lexicon ファイル内でシステム辞書内の語を参照している場合、異なるシステム辞書を用いると、参照先がずれる可能性があります。 +旧形式のソースファイル内でシステム辞書内の語を参照している場合、以前と異なるシステム辞書を用いると、参照先がずれる可能性があります。 ユーザー辞書のビルドでは、参照するシステムバイナリ辞書を指定する必要があります。 別途[配布ページ](http://sudachi.s3-website-ap-northeast-1.amazonaws.com/sudachidict/)から取得してください。 -例: `sudachi/system.dic` を参照し、`user_lexicon.csv` からバイナリユーザー辞書 `new_user.dic` をビルドする +例: `dict/system.dic` を参照し、`user_lexicon.csv` からバイナリユーザー辞書 `new_user.dic` をビルドする ```bash unzip -d "./sudachi" "./build/distributions/sudachi-executable-1.0.0.zip" java -Dfile.encoding=UTF-8 \ -cp ./sudachi/sudachi-1.0.0.jar \ com.worksap.nlp.sudachi.dictionary.UserDictionaryBuilder \ - -s ./sudachi/system.dic -o new_user.dic user_lexicon.csv + -s ./dict/system.dic -o new_user.dic user_lexicon.csv ``` ### 2. 
バイナリ辞書からの移行 -バイナリ辞書のみが存在する場合、現在の Sudachi では移行ができません。 +バイナリ辞書のみが存在する場合、現在の Sudachi バージョンでは移行ができません。 過去バージョンの Sudachi にて、DictionaryPrinter を用いて辞書ソースファイルへの変換を行ってください。 これは旧形式での出力となるため、さらに上記の辞書ソースファイルの移行も必要となります。 @@ -96,12 +99,12 @@ java -Dfile.encoding=UTF-8 \ ユーザー辞書内でシステム辞書内の語を参照している場合、ビルド時と異なるシステム辞書を用いると、参照先がずれる可能性があります。 対象バイナリ辞書のビルドの際に参照したシステム辞書を指定するようにしてください。 -例: `sudachi/system.dic` を参照し、バイナリユーザー辞書 `user.dic` の語を `user_lexicon.csv` に出力する +例: `dict/system.dic` を参照し、バイナリユーザー辞書 `user.dic` の語を `user_lexicon.csv` に出力する ```bash unzip -d "./sudachi" "./build/distributions/sudachi-executable-0.8.0.zip" java -Dfile.encoding=UTF-8 \ -cp ./sudachi/sudachi-0.8.0.jar \ com.worksap.nlp.sudachi.dictionary.DictionaryPrinter \ - -s ./sudachi/system.dic -o user_lexicon.csv user.dic \ + -s ./dict/system.dic -o user_lexicon.csv user.dic \ ``` diff --git a/docs/user_dict.md b/docs/user_dict.md index 9010cb53..c8df86a3 100644 --- a/docs/user_dict.md +++ b/docs/user_dict.md @@ -4,56 +4,60 @@ Sudachi ではユーザー辞書をもちいて、システム辞書で不足し ## ユーザー辞書ソースのフォーマット -ユーザー辞書の作成は、登録したい見出しを記述したユーザー辞書ソースを用いて行います。 -ユーザー辞書ソースのフォーマットは以下の通りです。 -なお、このファイルは、CSV 形式 (RFC 4180) で保存します。文字コードは、UTF-8 を使用します。 - -- 0 Surface: 見出し表記 -- 1 Left_Id: 左連接 ID -- 2 Right_Id: 右連接 ID -- 3 Cost: コスト -- 4 Writing: 見出し (解析結果表示用) -- 5 POS1: 品詞 1 -- 6 POS2: 品詞 2 -- 7 POS3: 品詞 3 -- 8 POS4: 品詞 4 -- 9 POS5: 品詞 (活用型) -- 10 POS6: 品詞 (活用形) -- 11 Reading_Form: 読み -- 12 Normalized_Form: 正規化表記 -- 13 Dictionary_Form: 辞書形 -- 14 Mode: 分割タイプ -- 15 Split_A: A 単位分割情報 -- 16 Split_B: B 単位分割情報 -- 17 Word_Structure: 語構成 -- 18 Synonym_Groups: 同義語グループ ID 情報 -- 19 Split_C: C 単位分割情報 -- 20 User_Data: ユーザーデータ -- 21 POS_Id: 品詞 ID +ユーザー辞書の作成は、登録したい語を記述したユーザー辞書ソースを用いて行います。 +このファイルは、CSV 形式 (RFC 4180) で保存します。文字コードは、UTF-8 を使用します。 +ユーザー辞書ソースに記載できる項目は以下の通りです。 + +- [0 Surface: 見出し表記](#0-surface-見出し表記) +- [1 Left_Id: 左連接 ID](#1-left_id-左連接-id) +- [2 Right_Id: 右連接 ID](#2-right_id-右連接-id) +- [3 Cost: コスト](#3-cost-コスト) +- [4 Writing: 見出し (解析結果表示用)](#4-writing-見出し-解析結果表示用) +- [5 
POS1: 品詞 1](#5-pos1-品詞-1) +- [6 POS2: 品詞 2](#6-pos2-品詞-2) +- [7 POS3: 品詞 3](#7-pos3-品詞-3) +- [8 POS4: 品詞 4](#8-pos4-品詞-4) +- [9 POS5: 品詞 (活用型)](#9-pos5-品詞-活用型) +- [10 POS6: 品詞 (活用形)](#10-pos6-品詞-活用形) +- [11 Reading_Form: 読み](#11-reading_form-読み) +- [12 Normalized_Form: 正規化形情報](#12-normalized_form-正規化形情報) +- [13 Dictionary_Form: 辞書形情報](#13-dictionary_form-辞書形情報) +- [14 Mode: 分割タイプ](#14-mode-分割タイプ) +- [15 Split_A: A 単位分割情報](#15-split_a-a-単位分割情報) +- [16 Split_B: B 単位分割情報](#16-split_b-b-単位分割情報) +- [17 Word_Structure: 語構成情報](#17-wordstructure-語構成情報) +- [18 Synonym_Groups: 同義語グループ ID 情報](#18-synonym_groups-同義語グループ-id) +- [19 Split_C: C 単位分割情報](#19-split_c-c-単位分割情報) +- [20 User_Data: ユーザーデータ](#20-user_data-ユーザーデータ) +- [21 POS_Id: 品詞 ID](#21-pos_id-品詞-id) 各項目については以下に説明します。 ### ヘッダー行 ファイルの一行目はヘッダーを記述します。 -ヘッダー行に記述されたカラムの順序でファイル全体がパースされます。 -カラム名の記述では、"\_" の有無および大文字・小文字の違いは無視されます。 +ヘッダー行に記述された項目の順序でファイル全体がパースされます。 +項目名の記述では、"\_" の有無および大文字・小文字の違いは無視されます。 ヘッダー行が検出されなかった場合は、[旧辞書フォーマット](./user_dict_legacy.md)に従って処理されます。 -ただし、この場合でも上記のカラム順に従えばすべてのカラムを記述できます。 -旧フォーマットにないカラムについてはこのドキュメントを参照してください。 +ただし、この場合でも上記の順序に従えばすべての項目を記述できます。 +旧フォーマットにない項目についてはこのドキュメントを参照してください。 -カラムのうち、Writing、Mode、Synonym_Groups、Split_C、User_Data は非必須項目です。 -また、POS_Id と POS1 - POS6 の組はいずれか一方のみが必須となります。 +項目のうち、Writing、Mode、Synonym_Groups、Split_C、User_Data は省略可能です。 +また、POS1 - POS6 の組と POS_Id の少なくとも一方の記載が必須です。 +その他の項目は必須です。 ### 語参照 一部の項目では、辞書内の他の語への参照を記述することがあります。 以下ではこれを語参照と呼称します。 -語参照は対象語の「見出し表記、品詞 1-4、品詞 (活用型)、品詞 (活用形)、読み」もしくは「見出し表記、品詞 ID、読み」を "," (カンマ) で区切った文字列で記述します。 +語参照は対象語の「[見出し表記](#0-surface-見出し表記), 品詞, [読み](#11-reading_form-読み)」を "," (カンマ) で区切った文字列で記述します。 +品詞は [POS1-POS6 の6つ組](#5-pos1-品詞-1) もしくは [POS_Id](#21-pos_id-品詞-id) のどちらかを使用します。 語参照を記述するときはその項目のフィールド全体を " (ダブルクォーテーション) で囲む必要があります。 -語参照の対象語は別途記述されている必要があります。対象語がシステム辞書内にあるかユーザー辞書内にあるかは自動的に判別します。 + +語参照の対象語は別途記述されている必要があります。 +対象語がシステム辞書内にあるかユーザー辞書内にあるかは自動的に判別します(システム辞書のものが優先されます)。 ### 0 Surface: 見出し表記 @@ -118,88 +122,98 @@ Sudachi 
では、以下の文字正規化を行っています。挙動の詳細 ### 4 Writing: 見出し (解析結果表示用) この項目は使用されません。 +記載を省略することができます。 -空文字列とするか、見出し表記と同じものを記述してください。 +項目を省略しない場合は、空欄とするか、見出し表記と同じものを記述してください。 ### 5 POS1: 品詞 1 +品詞の6つ組のうち第1要素を記述します。 システム辞書の品詞、あるいはユーザー定義の任意の品詞を記述できます。 + システム辞書で使用する品詞は、[pos.csv](../src/main/resources/pos.csv) を参照してください。 +これは ID をのぞいて unidic-mecab 2.1.2 の 品詞体系を参照しています。 + +項目 POS1 から POS6 は6つ組で記述します。 +これらを記述する場合は、項目 POS_Id を省略、もしくは空欄とすることができます。 +両方を記述する場合は、両者の内容が一致している必要があります。 ### 6 POS2: 品詞 2 -システム辞書の品詞、あるいはユーザー定義の任意の品詞を記述できます。 -システム辞書で使用する品詞は、[pos.csv](../src/main/resources/pos.csv) を参照してください。 +品詞の6つ組のうち第2要素を記述します。 +POS1: 品詞 1 を参照してください。 ### 7 POS3: 品詞 3 -システム辞書の品詞、あるいはユーザー定義の任意の品詞を記述できます。 -システム辞書で使用する品詞は、[pos.csv](../src/main/resources/pos.csv) を参照してください。 +品詞の6つ組のうち第3要素を記述します。 +POS1: 品詞 1 を参照してください。 ### 8 POS4: 品詞 4 -システム辞書の品詞、あるいはユーザー定義の任意の品詞を記述できます。 -システム辞書で使用する品詞は、[pos.csv](../src/main/resources/pos.csv) を参照してください。 +品詞の6つ組のうち第4要素を記述します。 +POS1: 品詞 1 を参照してください。 ### 9 POS5: 品詞 (活用型) -システム辞書の品詞、あるいはユーザー定義の任意の品詞を記述できます。 -システム辞書で使用する品詞は、[pos.csv](../src/main/resources/pos.csv) を参照してください。 +品詞の6つ組のうち第5要素を記述します。 +POS1: 品詞 1 を参照してください。 ### 10 POS6: 品詞 (活用形) -システム辞書の品詞、あるいはユーザー定義の任意の品詞を記述できます。 -システム辞書で使用する品詞は、[pos.csv](../src/main/resources/pos.csv) を参照してください。 +品詞の6つ組のうち第6要素を記述します。 +POS1: 品詞 1 を参照してください。 ### 21 POS_Id: 品詞 ID -システム辞書の品詞、あるいはユーザー定義の任意の品詞の ID を記述できます。 -品詞 ID の値は辞書内のものに依存します。 -ユーザ定義の品詞での利用は推奨しません。 - +システム辞書の品詞、あるいはユーザー定義の任意の品詞を記述できます。 システム辞書で使用する品詞は、[pos.csv](../src/main/resources/pos.csv) を参照してください。 これは ID をのぞいて unidic-mecab 2.1.2 の 品詞体系を参照しています。 この項目を記述する場合、POS1 - POS6 の 6 項目を省略、もしくは空欄とすることができます。 両方を記述する場合は、両者の内容が一致している必要があります。 +ユーザ定義の品詞を記述する場合は、この項目は空欄とすることを推奨します。 + ### 11 Reading_Form: 読み 見出し表記の読みを記述します。 -全角カタカナで記述します。 -省略することもできます。(その場合は、何も記述しない) -### 12 Normalized_Form: 正規化形 ID +全角カタカナもしくは記号での記述を推奨します。 +空欄としても構いません。 -表記にぶれのある語に対して、その語の正規化形を指定するための情報です。 +### 12 Normalized_Form: 正規化形情報 -対象となる語への語参照もしくは見出し表記を記述します。 +表記にぶれのある語に対して、その語の正規化形を指定するための情報です。 
-見出し表記での記述では、対象となる語が記述されていない場合でも文字列のみを正規化形として登録します。 +対象となる語への[語参照](#語参照)もしくはその語の[見出し表記](#0-surface-見出し表記)を記述します。 -「見出し表記=正規化表記」の場合は、空文字列とすることができます。 +見出し表記のみで記述された場合、対象となる語が記述されていない場合でも文字列のみを正規化形として登録します。 +「見出し表記=正規化表記」である場合は、空欄とすることができます。 -### 13 Dictionary_Form: 辞書形 ID +### 13 Dictionary_Form: 辞書形情報 活用のある語に対して、その語の辞書形(終止形表記)を指定するための情報です。 -対象となる語への語参照を記述します。 +対象となる語への[語参照](#語参照)を記述します。 -活用のない語については、このフィールドは空文字列としてください。 +活用のない語については、この項目は空欄にしてください。 -### 14 分割タイプ +### 14 Mode: 分割タイプ この項目は使用されません。 +記載を省略することができます。 -語の分割単位タイプ (A / B / C) を記述します。後述の分割情報を記述しない場合は "\*" (半角アスタリスク) もしくは空文字列でもかまいません。 +項目を省略しない場合は、語の分割単位タイプ (A / B / C) を記述します。 +後述の分割情報を記述しない場合は空文字列でもかまいません。 ### 15 Split_A: A 単位分割情報 分割単位タイプ B または C の語について、A 単位に分割するための情報です。 -構成語への語参照を "/" (半角スラッシュ) で区切って記述します。 +構成語への[語参照](#語参照)を "/" (半角スラッシュ) で区切って記述します。 +分割を行わない語については、空欄としてください。 -なお構成語としてのみ利用される語は連接 ID に-1 を記述すると、単独の語として出現しなくなります。 +なお構成語としてのみ利用される語は、[連接 ID](#1-left_id-左連接-id) に -1 を記述すると、単独の語として出現しなくなります。 ### 16 Split_B: B 単位分割情報 @@ -210,10 +224,11 @@ Sudachi では、以下の文字正規化を行っています。挙動の詳細 ### 19 Split_C: C 単位分割情報 分割単位タイプ C よりも長い語句を C 単位への分割情報と共に登録する際に使用します。 +記載を省略することができます。 フォーマットは A 単位分割情報と同じです。 -この項目が登録された語句は、解析後に自動的に C 単位(A/B が指定されている場合はそちら)に分割されます。 +この項目が登録された語句は、解析後に自動的に指定の分割単位に分割されます。 ### 17 WordStructure: 語構成情報 @@ -224,12 +239,15 @@ Sudachi では、以下の文字正規化を行っています。挙動の詳細 ### 18 Synonym_Groups: 同義語グループ ID Sudachi 同義語辞書における同義語グループ ID 情報です。 +記載を省略することができます。 対象となる同義語グループ ID を "/" (半角スラッシュ) で区切って記述します。 +グループが存在しない場合は、空欄としてください。 ### 20: User_Data: ユーザーデータ 語に対して任意の文字列を登録します。 +記載を省略することができます。 表記の長さは、32767 文字まで登録できます。 From 99d98113415e2bee76b58b5bf98db2dbdc119624 Mon Sep 17 00:00:00 2001 From: mh-northlander Date: Fri, 15 Nov 2024 13:58:45 +0900 Subject: [PATCH 88/94] use Collectors.joining --- .../nlp/sudachi/dictionary/DictionaryPrinter.java | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/DictionaryPrinter.java 
b/src/main/java/com/worksap/nlp/sudachi/dictionary/DictionaryPrinter.java index eb1ddfe8..4b996e08 100644 --- a/src/main/java/com/worksap/nlp/sudachi/dictionary/DictionaryPrinter.java +++ b/src/main/java/com/worksap/nlp/sudachi/dictionary/DictionaryPrinter.java @@ -250,8 +250,8 @@ String wordRef(int wordId) { parts.add(reading); } - return String.join(String.valueOf(WordRef.Parser.WORDREF_DELIMITER), - parts.stream().map(this::maybeEscapeRefPart).collect(Collectors.toList())); + return parts.stream().map(this::maybeEscapeRefPart) + .collect(Collectors.joining(String.valueOf(WordRef.Parser.WORDREF_DELIMITER))); } /** encode word entry pointed by the wordId as WordRef.Headword. */ @@ -265,12 +265,12 @@ String wordRefHeadword(int wordId, int reference) { } String wordRefList(int[] wordIds) { - return String.join(String.valueOf(RawLexiconReader.LIST_DELIMITER), - Arrays.stream(wordIds).boxed().map(this::wordRef).collect(Collectors.toList())); + return Arrays.stream(wordIds).boxed().map(this::wordRef) + .collect(Collectors.joining(String.valueOf(RawLexiconReader.LIST_DELIMITER))); } String intList(int[] ints) { - return String.join("/", Arrays.stream(ints).boxed().map(Object::toString).collect(Collectors.toList())); + return Arrays.stream(ints).boxed().map(Object::toString).collect(Collectors.joining("/")); } private static boolean hasCh(String value, int ch) { From d54c47a03f427ada34eab0be56a650cb85c2b5ac Mon Sep 17 00:00:00 2001 From: mh-northlander Date: Fri, 15 Nov 2024 14:28:52 +0900 Subject: [PATCH 89/94] fix / revert --- docs/user_dict.md | 20 +++++++++---------- docs/user_dict_legacy.md | 20 +++++++++---------- .../sudachi/dictionary/build/BufWriter.java | 15 +++++++++++--- .../dictionary/build/ProgressInputStream.java | 6 +++--- .../nlp/sudachi/RegexOovProviderTest.kt | 1 + 5 files changed, 36 insertions(+), 26 deletions(-) diff --git a/docs/user_dict.md b/docs/user_dict.md index a8f5d091..33dde870 100644 --- a/docs/user_dict.md +++ b/docs/user_dict.md @@ 
-85,14 +85,14 @@ Sudachi では、以下の文字正規化を行っています。挙動の詳細 普通名詞の登録であれば、以下のいずれかを推奨 -- 5146 名詞,普通名詞,一般,_,_,_,_,\*,漢 -- 5133 名詞,普通名詞,サ変可能,_,_,_,_,\*,漢 +- 5146 名詞,普通名詞,一般,\*,\*,\*,\*,\*,漢 +- 5133 名詞,普通名詞,サ変可能,\*,\*,\*,\*,\*,漢 固有名詞の登録であれば、以下のいずれかを推奨 -- 4786 名詞,固有名詞,一般,_,_,_,_,\*,固 -- 4789 名詞,固有名詞,人名,名,_,_,_,_,固 -- 4790 名詞,固有名詞,人名,姓,_,_,_,_,固 +- 4786 名詞,固有名詞,一般,\*,\*,\*,\*,\*,固 +- 4789 名詞,固有名詞,人名,名,\*,\*,\*,\*,固 +- 4790 名詞,固有名詞,人名,姓,\*,\*,\*,\*,固 ### 2 Right_Id: 右連接 ID @@ -101,14 +101,14 @@ Sudachi では、以下の文字正規化を行っています。挙動の詳細 普通名詞の登録であれば、以下のいずれかを推奨 -- 5146 名詞,普通名詞,一般,_,_,_,_,\*,漢 -- 5133 名詞,普通名詞,サ変可能,_,_,_,_,\*,漢 +- 5146 名詞,普通名詞,一般,\*,\*,\*,\*,\*,漢 +- 5133 名詞,普通名詞,サ変可能,\*,\*,\*,\*,\*,漢 固有名詞の登録であれば、以下のいずれかを推奨 -- 4786 名詞,固有名詞,一般,_,_,_,_,\*,固 -- 4789 名詞,固有名詞,人名,名,_,_,_,_,固 -- 4790 名詞,固有名詞,人名,姓,_,_,_,_,固 +- 4786 名詞,固有名詞,一般,\*,\*,\*,\*,\*,固 +- 4789 名詞,固有名詞,人名,名,\*,\*,\*,\*,固 +- 4790 名詞,固有名詞,人名,姓,\*,\*,\*,\*,固 ### 3 Cost: コスト diff --git a/docs/user_dict_legacy.md b/docs/user_dict_legacy.md index 3d881349..c5b641ab 100644 --- a/docs/user_dict_legacy.md +++ b/docs/user_dict_legacy.md @@ -55,14 +55,14 @@ Sudachi では、以下の文字正規化を行っています。挙動の詳細 普通名詞の登録であれば、以下のいずれかを推奨 -- 5146 名詞,普通名詞,一般,_,_,_,_,\*,漢 -- 5133 名詞,普通名詞,サ変可能,_,_,_,_,\*,漢 +- 5146 名詞,普通名詞,一般,\*,\*,\*,\*,\*,漢 +- 5133 名詞,普通名詞,サ変可能,\*,\*,\*,\*,\*,漢 固有名詞の登録であれば、以下のいずれかを推奨 -- 4786 名詞,固有名詞,一般,_,_,_,_,\*,固 -- 4789 名詞,固有名詞,人名,名,_,_,_,_,固 -- 4790 名詞,固有名詞,人名,姓,_,_,_,_,固 +- 4786 名詞,固有名詞,一般,\*,\*,\*,\*,\*,固 +- 4789 名詞,固有名詞,人名,名,\*,\*,\*,\*,固 +- 4790 名詞,固有名詞,人名,姓,\*,\*,\*,\*,固 ### 2 右連接 ID @@ -71,14 +71,14 @@ Sudachi では、以下の文字正規化を行っています。挙動の詳細 普通名詞の登録であれば、以下のいずれかを推奨 -- 5146 名詞,普通名詞,一般,_,_,_,_,\*,漢 -- 5133 名詞,普通名詞,サ変可能,_,_,_,_,\*,漢 +- 5146 名詞,普通名詞,一般,\*,\*,\*,\*,\*,漢 +- 5133 名詞,普通名詞,サ変可能,\*,\*,\*,\*,\*,漢 固有名詞の登録であれば、以下のいずれかを推奨 -- 4786 名詞,固有名詞,一般,_,_,_,_,\*,固 -- 4789 名詞,固有名詞,人名,名,_,_,_,_,固 -- 4790 名詞,固有名詞,人名,姓,_,_,_,_,固 +- 4786 名詞,固有名詞,一般,\*,\*,\*,\*,\*,固 +- 4789 名詞,固有名詞,人名,名,\*,\*,\*,\*,固 +- 4790 名詞,固有名詞,人名,姓,\*,\*,\*,\*,固 ### 3 コスト diff --git 
a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/BufWriter.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/BufWriter.java index 6443b2fc..9fcb99a6 100644 --- a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/BufWriter.java +++ b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/BufWriter.java @@ -90,15 +90,24 @@ private void putVarintSlow(long val) { putByte((byte) val); } - /** Encode int array of fixed length. */ - public BufWriter putInts(Ints value, int length) { + /** + * Encode int array of fixed length. This does not put the number of values and + * is no-op if the length is 0. The length should be known or kept in some way + * to read them safely. + * + * @param values + * list of ints to put. It must have enough number of values. + * @param length + * number of ints to put. noop if this is less than 1. + */ + public BufWriter putInts(Ints values, int length) { if (length <= 0) { return this; } ByteBuffer buf = buffer; // read field only once int pos = buf.position(); for (int i = 0; i < length; ++i) { - buf.putInt(pos + i * 4, value.get(i)); + buf.putInt(pos + i * 4, values.get(i)); } buf.position(pos + length * 4); return this; diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/ProgressInputStream.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/ProgressInputStream.java index a31f0560..2cf4cc5c 100644 --- a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/ProgressInputStream.java +++ b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/ProgressInputStream.java @@ -68,10 +68,10 @@ public void close() throws IOException { @Override public int read() throws IOException { - int read = inner.read(); - if (read != -1) { + int nread = inner.read(); + if (nread != -1) { position += 1; } - return read; + return nread; } } diff --git a/src/test/java/com/worksap/nlp/sudachi/RegexOovProviderTest.kt b/src/test/java/com/worksap/nlp/sudachi/RegexOovProviderTest.kt index 57c20ddd..f53de5b6 100644 
--- a/src/test/java/com/worksap/nlp/sudachi/RegexOovProviderTest.kt +++ b/src/test/java/com/worksap/nlp/sudachi/RegexOovProviderTest.kt @@ -57,6 +57,7 @@ class RegexOovProviderTest { assertEquals(3, tokens.size) assertEquals("XAG-2F", tokens[2].surface()) assertEquals("xag-2f", tokens[2].normalizedForm()) + assertEquals("xag-2f", tokens[2].readingForm()) } @Test From 77f9474b6adb259bb1345dbee9fe4e6e599503ab Mon Sep 17 00:00:00 2001 From: mh-northlander Date: Mon, 18 Nov 2024 10:33:32 +0900 Subject: [PATCH 90/94] save writing instead of surface --- docs/user_dict.md | 43 ++++++++++--------- .../dictionary/build/CompiledWordEntry.java | 2 +- .../sudachi/dictionary/build/EntryLookup.java | 12 +++--- .../sudachi/dictionary/build/RawLexicon.java | 2 +- .../dictionary/build/RawLexiconReader.java | 10 +++-- .../dictionary/build/RawWordEntry.java | 24 ++++++++--- .../dictionary/build/WordEntryLayout.java | 20 +++------ .../dictionary/build/RawLexiconReaderTest.kt | 40 ++++++++++++++--- .../sudachi/dictionary/build/headers-all.csv | 6 +-- .../sudachi/dictionary/build/legacy-full.csv | 2 +- .../dictionary/build/legacy-minimum.csv | 2 +- 11 files changed, 99 insertions(+), 64 deletions(-) diff --git a/docs/user_dict.md b/docs/user_dict.md index 33dde870..8f9c0078 100644 --- a/docs/user_dict.md +++ b/docs/user_dict.md @@ -8,11 +8,11 @@ Sudachi ではユーザー辞書をもちいて、システム辞書で不足し このファイルは、CSV 形式 (RFC 4180) で保存します。文字コードは、UTF-8 を使用します。 ユーザー辞書ソースに記載できる項目は以下の通りです。 -- [0 Surface: 見出し表記](#0-surface-見出し表記) +- [0 Surface: 解析用見出し](#0-surface-解析用見出し) - [1 Left_Id: 左連接 ID](#1-left_id-左連接-id) - [2 Right_Id: 右連接 ID](#2-right_id-右連接-id) - [3 Cost: コスト](#3-cost-コスト) -- [4 Writing: 見出し (解析結果表示用)](#4-writing-見出し-解析結果表示用) +- [4 Writing: 表記形](#4-writing-表記形) - [5 POS1: 品詞 1](#5-pos1-品詞-1) - [6 POS2: 品詞 2](#6-pos2-品詞-2) - [7 POS3: 品詞 3](#7-pos3-品詞-3) @@ -52,21 +52,24 @@ Sudachi ではユーザー辞書をもちいて、システム辞書で不足し 一部の項目では、辞書内の他の語への参照を記述することがあります。 以下ではこれを語参照と呼称します。 -語参照は対象語の「[見出し表記](#0-surface-見出し表記), 品詞, 
[読み](#11-reading_form-読み)」を "," (カンマ) で区切った文字列で記述します。 +語参照は対象語の「[表記形](#4-writing-表記形), 品詞, [読み](#11-reading_form-読み)」を "," (カンマ) で区切った文字列で記述します。 品詞は [POS1-POS6 の6つ組](#5-pos1-品詞-1) もしくは [POS_Id](#21-pos_id-品詞-id) のどちらかを使用します。 + +例: `東京,名詞,固有名詞,地名,一般,*,*,トウキョウ`, `東京,1349,トウキョウ` + 語参照を記述するときはその項目のフィールド全体を " (ダブルクォーテーション) で囲む必要があります。 語参照の対象語は別途記述されている必要があります。 対象語がシステム辞書内にあるかユーザー辞書内にあるかは自動的に判別します(システム辞書のものが優先されます)。 -### 0 Surface: 見出し表記 +### 0 Surface: 解析用見出し 形態素解析に使用される見出し表記です。 表記の長さは、255 文字まで登録できます。 #### 文字正規化 -見出しは、「Sudachi の文字正規化がおこなわれた後の形」で登録してください。 +解析用見出しは、「Sudachi の文字正規化がおこなわれた後の形」で登録してください。 Sudachi では、文字正規化が行われた**後に**見出しを引きます。そのため、「正規化後に現れない形」で見出しが表記されている場合、その語はどのような場合でもマッチすることがありません。例えば、「ラテン文字の大文字」で見出しを表記した場合、Sudachi 内部では正規化後の「小文字」になったもので見出しを探すため、この大文字のものとマッチすることがありません。 @@ -76,7 +79,7 @@ Sudachi では、以下の文字正規化を行っています。挙動の詳細 - NFKC をつかった Unicode 正規化 - ただし、設定ファイル `rewrite.def` に定義される抑制、置換が優先 -ユーザー辞書の見出しへは、文字正規化は自動的には適用されません。これは、ユーザーが想定しづらい挙動を避けるためです。そのため、ユーザー辞書の作成者が文字正規化を意識して語を表記する必要があります。 +ユーザー辞書の解析用見出しへは、文字正規化は自動的には適用されません。これは、ユーザーが想定しづらい挙動を避けるためです。そのため、ユーザー辞書の作成者が文字正規化を意識して語を表記する必要があります。 ### 1 Left_Id: 左連接 ID @@ -112,19 +115,19 @@ Sudachi では、以下の文字正規化を行っています。挙動の詳細 ### 3 Cost: コスト -形態素解析に使用される見出し表記のコスト値です。 +形態素解析に使用される語のコスト値です。 "-32767 ~ 32767" までの整数値で指定できます。 -値を小さくするほど、登録した見出し表記が解析結果として出やすくなります。 +値を小さくするほど、登録した語が解析結果として出やすくなります。 なお、"-32768" を指定すると、ユーザー辞書読み込み時に自動推定した値を付与します。 名詞類の登録であれば、"5000 ~ 9000" を推奨 -### 4 Writing: 見出し (解析結果表示用) +### 4 Writing: 表記形 -この項目は使用されません。 -記載を省略することができます。 +[文字正規化](#文字正規化)を行う前の語の表記です。 -項目を省略しない場合は、空欄とするか、見出し表記と同じものを記述してください。 +項目の記載を省略することができます。 +項目が省略される、もしくは空欄の場合は、[解析用見出し](#0-surface-解析用見出し) を代用します。 ### 5 POS1: 品詞 1 @@ -176,7 +179,7 @@ POS1: 品詞 1 を参照してください。 ### 11 Reading_Form: 読み -見出し表記の読みを記述します。 +語の読みを記述します。 全角カタカナもしくは記号での記述を推奨します。 空欄としても構いません。 @@ -185,10 +188,10 @@ POS1: 品詞 1 を参照してください。 表記にぶれのある語に対して、その語の正規化形を指定するための情報です。 -対象となる語への[語参照](#語参照)もしくはその語の[見出し表記](#0-surface-見出し表記)を記述します。 
+対象となる語への[語参照](#語参照)もしくはその語の[表記形](#4-writing-表記形)を記述します。 -見出し表記のみで記述された場合、対象となる語が記述されていない場合でも文字列のみを正規化形として登録します。 -「見出し表記=正規化表記」である場合は、空欄とすることができます。 +表記形のみで記述された場合、対象となる語が記述されていない場合でも文字列のみを正規化形として登録します。 +「表記形=正規化形」である場合は、空欄とすることができます。 ### 13 Dictionary_Form: 辞書形情報 @@ -202,7 +205,7 @@ POS1: 品詞 1 を参照してください。 ### 14 Mode: 分割タイプ この項目は使用されません。 -記載を省略することができます。 +項目の記載を省略することができます。 項目を省略しない場合は、語の分割単位タイプ (A / B / C) を記述します。 後述の分割情報を記述しない場合は空文字列でもかまいません。 @@ -225,7 +228,7 @@ POS1: 品詞 1 を参照してください。 ### 19 Split_C: C 単位分割情報 分割単位タイプ C よりも長い語句を C 単位への分割情報と共に登録する際に使用します。 -記載を省略することができます。 +項目の記載を省略することができます。 フォーマットは A 単位分割情報と同じです。 @@ -240,7 +243,7 @@ POS1: 品詞 1 を参照してください。 ### 18 Synonym_Groups: 同義語グループ ID Sudachi 同義語辞書における同義語グループ ID 情報です。 -記載を省略することができます。 +項目の記載を省略することができます。 対象となる同義語グループ ID を "/" (半角スラッシュ) で区切って記述します。 グループが存在しない場合は、空欄としてください。 @@ -248,7 +251,7 @@ Sudachi 同義語辞書における同義語グループ ID 情報です。 ### 20: User_Data: ユーザーデータ 語に対して任意の文字列を登録します。 -記載を省略することができます。 +項目の記載を省略することができます。 表記の長さは、32767 文字まで登録できます。 diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/CompiledWordEntry.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/CompiledWordEntry.java index 1a06954c..8809f7a6 100644 --- a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/CompiledWordEntry.java +++ b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/CompiledWordEntry.java @@ -22,7 +22,7 @@ import com.worksap.nlp.sudachi.dictionary.WordInfo; /** - * WordInfo wrapper for Lookup2.Entry interface. + * WordInfo wrapper for EntryLookup.Entry interface. * * Used to resolve wordref that references entry in the system dictionary * (during user dictinary build). 
diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/EntryLookup.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/EntryLookup.java index 76b2e174..08a81f79 100644 --- a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/EntryLookup.java +++ b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/EntryLookup.java @@ -65,8 +65,8 @@ public String headword() { // entries private final List systemEntries; private final List userEntries; - // mapping to entries that have same surfaces - private final Map> bySurface; + // mapping to entries that have same headwords + private final Map> byHeadword; public EntryLookup(List systemEntries, List userEntries) { this.systemEntries = systemEntries; @@ -82,7 +82,7 @@ public EntryLookup(List systemEntries, List us List sublist = result.computeIfAbsent(e.headword(), x -> new ArrayList<>()); sublist.add(new EntryWithFlag(e, true)); } - bySurface = result; + byHeadword = result; } /** @@ -96,7 +96,7 @@ public EntryLookup(List systemEntries, List us * @return */ public EntryWithFlag byIndex(int index, boolean isUser) { - // if userEntries is empty (i.e. building system), ignore isUser flag + // if userEntries is empty (i.e. 
building system dict), ignore isUser flag if (isUser && !userEntries.isEmpty()) { return new EntryWithFlag(userEntries.get(index), true); } @@ -110,7 +110,7 @@ public EntryWithFlag byIndex(int index, boolean isUser) { * @return */ public List byHeadword(String headword) { - return bySurface.get(headword); + return byHeadword.get(headword); } /** @@ -119,6 +119,6 @@ public List byHeadword(String headword) { * @param e */ public void add(Entry e, boolean isUser) { - bySurface.computeIfAbsent(e.headword(), x -> new ArrayList<>()).add(new EntryWithFlag(e, isUser)); + byHeadword.computeIfAbsent(e.headword(), x -> new ArrayList<>()).add(new EntryWithFlag(e, isUser)); } } diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/RawLexicon.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/RawLexicon.java index 11fb1c5a..d594bdce 100644 --- a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/RawLexicon.java +++ b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/RawLexicon.java @@ -120,7 +120,7 @@ public void read(String name, Reader data, POSTable posTable, short numLeft, sho offset += entry.computeExpectedSize(); checkOffset(offset); if (entry.shouldBeIndexed()) { - index.add(entry.headword, entry.pointer); + index.add(entry.surface, entry.pointer); } else { notIndexed.add(entry); } diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/RawLexiconReader.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/RawLexiconReader.java index 9214c069..46d8a92c 100644 --- a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/RawLexiconReader.java +++ b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/RawLexiconReader.java @@ -244,12 +244,12 @@ private WordRef getWordRef(List data, Column column, WordRef.Parser refP // because headword/triple ref may resolved to other entry. 
if (ref instanceof WordRef.Headword) { WordRef.Headword headword = (WordRef.Headword) ref; - if (headword.getHeadword().equals(entry.headword)) { + if (headword.getHeadword().equals(entry.headword())) { return null; } } else if (ref instanceof WordRef.Triple) { WordRef.Triple triple = (WordRef.Triple) ref; - if (triple.getHeadword().equals(entry.headword) && triple.getPosId() == entry.posId + if (triple.getHeadword().equals(entry.headword()) && triple.getPosId() == entry.posId && triple.getReading().equals(entry.reading)) { return null; } @@ -294,7 +294,9 @@ private short getPos(List data) { /** convert csv row to RawWordEntry */ private RawWordEntry convertEntry(List data) { RawWordEntry entry = new RawWordEntry(); - entry.headword = getNonEmpty(data, Column.SURFACE, true); + entry.surface = getNonEmpty(data, Column.SURFACE, true); + String writing = get(data, Column.WRITING, true); + entry.writing = writing.isEmpty() ? entry.surface : writing; entry.leftId = getShort(data, Column.LEFT_ID); entry.rightId = getShort(data, Column.RIGHT_ID); @@ -303,7 +305,7 @@ private RawWordEntry convertEntry(List data) { entry.reading = get(data, Column.READING_FORM, true); entry.posId = getPos(data); - // headword, pos, reading must be parsed before these. + // writing, pos, reading must be parsed before these to resolve wordref. 
entry.normalizedForm = getWordRef(data, Column.NORMALIZED_FORM, normRefParser, entry); entry.dictionaryForm = getWordRef(data, Column.DICTIONARY_FORM, dictRefParser, entry); diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/RawWordEntry.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/RawWordEntry.java index 7540c5aa..91f362e8 100644 --- a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/RawWordEntry.java +++ b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/RawWordEntry.java @@ -30,7 +30,8 @@ @SuppressWarnings("jol") public class RawWordEntry implements EntryLookup.Entry { int pointer; // wordid, compressed offset of this entry in the lexicon.WordEntries - String headword; + String surface; + String writing; short leftId; short rightId; short cost; @@ -91,7 +92,8 @@ public boolean matches(short posId, String reading) { @Override public String headword() { - return headword; + // use writing for entry lookup + return writing; } private void checkString(String value, String name) { @@ -104,7 +106,8 @@ private void checkString(String value, String name) { /** check if sudachi dictionary can handle this entry */ public void validate() { - checkString(headword, "headword"); + checkString(surface, "surface"); + checkString(writing, "writing"); checkString(reading, "reading"); if (normalizedForm instanceof WordRef.Headword) { checkString(((WordRef.Headword) normalizedForm).getHeadword(), "normalized form"); @@ -118,7 +121,8 @@ public void validate() { * storage to publish strings. 
*/ public void publishStrings(StringStorage strings) { - strings.add(headword); + // surface is used only for indexing and is not necessary to store + strings.add(writing); strings.add(reading); if (normalizedForm instanceof WordRef.Headword) { WordRef.Headword normalized = (WordRef.Headword) normalizedForm; @@ -131,7 +135,8 @@ public void publishStrings(StringStorage strings) { */ public static RawWordEntry makeEmpty() { RawWordEntry entry = new RawWordEntry(); - entry.headword = ""; + entry.surface = ""; + entry.writing = ""; entry.leftId = -1; entry.rightId = -1; entry.cost = Short.MAX_VALUE; @@ -156,15 +161,20 @@ public static RawWordEntry makeEmpty() { */ public static RawWordEntry makePhantom(RawWordEntry base, String surface) { RawWordEntry entry = new RawWordEntry(); - entry.headword = surface; + // keep surface empty, phantom entry will only be accessed via normalized form + entry.surface = ""; + entry.writing = surface; + // phantom entry should not be used in the analysis entry.leftId = -1; entry.rightId = -1; entry.cost = Short.MAX_VALUE; + + // other data should be equivalent to the base entry entry.posId = base.posId; entry.reading = base.reading; - // normalizedForm = null: refer to itself entry.dictionaryForm = base.dictionaryForm; + entry.normalizedForm = null; // refer to itself entry.mode = base.mode; entry.aUnitSplit = base.aUnitSplit; entry.bUnitSplit = base.bUnitSplit; diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/WordEntryLayout.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/WordEntryLayout.java index 74c6a70e..a1181cfd 100644 --- a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/WordEntryLayout.java +++ b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/WordEntryLayout.java @@ -66,23 +66,17 @@ public int put(RawWordEntry entry) throws IOException { buf.putShort(entry.posId); // 2*4 = 8 bytes - buf.putInt(index.resolve(entry.headword).encode()); // surfacePtr - 
buf.putInt(index.resolve(entry.reading).encode()); // readingPtr + buf.putInt(index.resolve(entry.writing).encode()); // surface StringPtr + buf.putInt(index.resolve(entry.reading).encode()); // reading StringPtr int selfWordRef = isUser ? WordId.make(1, entry.pointer) : entry.pointer; - int normFormPtr = selfWordRef; - if (entry.normalizedForm != null) { - normFormPtr = entry.normalizedForm.resolve(lookup); - } - int dicFormPtr = selfWordRef; - if (entry.dictionaryForm != null) { - dicFormPtr = entry.dictionaryForm.resolve(lookup); - } - buf.putInt(normFormPtr); // normalized entry - buf.putInt(dicFormPtr); // dictionary form + int normFormPtr = entry.normalizedForm == null ? selfWordRef : entry.normalizedForm.resolve(lookup); + int dictFormPtr = entry.dictionaryForm == null ? selfWordRef : entry.dictionaryForm.resolve(lookup); + buf.putInt(normFormPtr); // normalized form WordRef + buf.putInt(dictFormPtr); // dictionary form WordRef // 8 + 4*4 = 24 bytes // length can't be more than ~4k utf-16 code units so the cast is safe - short utf8Len = (short) StringUtil.countUtf8Bytes(entry.headword); + short utf8Len = (short) StringUtil.countUtf8Bytes(entry.surface); byte cSplitLen = resolveWordRefList(entry.cUnitSplit, null, cSplits); byte bSplitLen = resolveWordRefList(entry.bUnitSplit, entry.cUnitSplit, bSplits); byte aSplitLen = resolveWordRefList(entry.aUnitSplit, entry.bUnitSplit, aSplits); diff --git a/src/test/java/com/worksap/nlp/sudachi/dictionary/build/RawLexiconReaderTest.kt b/src/test/java/com/worksap/nlp/sudachi/dictionary/build/RawLexiconReaderTest.kt index 071dfe60..c4a5ee93 100644 --- a/src/test/java/com/worksap/nlp/sudachi/dictionary/build/RawLexiconReaderTest.kt +++ b/src/test/java/com/worksap/nlp/sudachi/dictionary/build/RawLexiconReaderTest.kt @@ -45,7 +45,8 @@ class RawLexiconReaderTest { fun legacyCsvWithMinimumFields() { val reader = RawLexiconReader(csvfile("legacy-minimum.csv"), POSTable()) assertNotNull(reader.nextEntry()).let { e -> - 
assertEquals("東京都", e.headword) + assertEquals("東京都", e.surface) + assertEquals("東京都", e.headword()) assertEquals("トウキョウト", e.reading) assertEquals(listOf(WordRef.LineNo(5, false), WordRef.LineNo(9, false)), e.wordStructure) assertEquals(0, e.synonymGroups.length()) @@ -59,7 +60,8 @@ class RawLexiconReaderTest { fun legacyCsvWithAllFields() { val reader = RawLexiconReader(csvfile("legacy-full.csv"), POSTable()) assertNotNull(reader.nextEntry()).let { e -> - assertEquals("東京都", e.headword) + assertEquals("東京都", e.surface) + assertEquals("東京都", e.headword()) assertEquals("トウキョウト", e.reading) assertEquals(listOf(WordRef.LineNo(5, false), WordRef.LineNo(9, false)), e.wordStructure) assertEquals(Ints.wrap(intArrayOf(6, 7)), e.synonymGroups) @@ -73,7 +75,8 @@ class RawLexiconReaderTest { fun headerCsvMinimumFields() { val reader = RawLexiconReader(csvfile("headers-minimum.csv"), POSTable()) assertNotNull(reader.nextEntry()).let { e -> - assertEquals("東京都", e.headword) + assertEquals("東京都", e.surface) + assertEquals("東京都", e.headword()) // surface is used for missing writing assertEquals("トウキョウト", e.reading) assertEquals( listOf(WordRef.Triple("東京", 0, "トウキョウ"), WordRef.Triple("都", 1, "ト")), e.aUnitSplit) @@ -90,7 +93,8 @@ class RawLexiconReaderTest { fun headerCsvAllFields() { val reader = RawLexiconReader(csvfile("headers-all.csv"), POSTable()) assertNotNull(reader.nextEntry()).let { e -> - assertEquals("東京都", e.headword) + assertEquals("東京都", e.surface) + assertEquals("東京都", e.headword()) assertEquals("トウキョウト", e.reading) assertEquals( listOf(WordRef.Triple("東京", 0, "トウキョウ"), WordRef.Triple("都", 1, "ト")), e.aUnitSplit) @@ -107,6 +111,22 @@ class RawLexiconReaderTest { assertNull(reader.nextEntry()) } + @Test + fun parseWriting() { + val text = + """Surface,LeftId,RightId,Cost,writing,pos_id,reading_form,normalized_form,DictionaryForm,splita,splitb,wordstructure +abc,0,0,1000,AbC,0,トウキョウト,,,,,""" + val posTable = POSTable() + posTable.getId(POS("a", "a", "a", "a", "a", 
"0")) + + val reader = RawLexiconReader(csvtext(text), posTable) + assertNotNull(reader.nextEntry()).let { e -> + assertEquals("abc", e.surface) + assertEquals("AbC", e.headword()) + } + assertNull(reader.nextEntry()) + } + @Test fun failMissingRequiredEntry() { // pos1-6 are not required (because of posId), but must be used as a set @@ -127,7 +147,7 @@ class RawLexiconReaderTest { } @Test - fun posIdColumn() { + fun posIdOnly() { val text = """Surface,LeftId,RightId,Cost,pos_id,reading_form,normalized_form,DictionaryForm,splita,splitb,wordstructure 東京都,6,8,5320,0,トウキョウト,,,,,""" @@ -152,7 +172,6 @@ class RawLexiconReaderTest { reader.nextEntry() } } - @Test fun posIdAndParts() { val text = @@ -242,6 +261,13 @@ ${oversizeWord},6,8,5320,名詞,固有名詞,地名,一般,*,*,トウキョウ val reader = RawLexiconReader(csvtext(text), POSTable()) assertFails { reader.nextEntry() } } + run { + val text = + """Surface,LeftId,RightId,Cost,writing,pos1,pos2,pos3,pos4,pos5,pos6,reading_form,normalized_form,DictionaryForm,splita,splitb,splitC,wordstructure +東京都,6,8,5320,${oversizeWord},名詞,固有名詞,地名,一般,*,*,トウキョウト,,,1,,,""" + val reader = RawLexiconReader(csvtext(text), POSTable()) + assertFails { reader.nextEntry() } + } run { val text = """Surface,LeftId,RightId,Cost,pos1,pos2,pos3,pos4,pos5,pos6,reading_form,normalized_form,DictionaryForm,splita,splitb,splitC,wordstructure @@ -259,7 +285,7 @@ ${oversizeWord},6,8,5320,名詞,固有名詞,地名,一般,*,*,トウキョウ } @Test - fun failEmptyHeadword() { + fun failEmptySurface() { val text = """Surface,LeftId,RightId,Cost,pos1,pos2,pos3,pos4,pos5,pos6,reading_form,normalized_form,DictionaryForm,splita,splitb,wordstructure ,6,8,5320,名詞,固有名詞,地名,一般,*,*,トウキョウト,,,,,""" diff --git a/src/test/resources/com/worksap/nlp/sudachi/dictionary/build/headers-all.csv b/src/test/resources/com/worksap/nlp/sudachi/dictionary/build/headers-all.csv index 3524de33..d5180268 100644 --- a/src/test/resources/com/worksap/nlp/sudachi/dictionary/build/headers-all.csv +++ 
b/src/test/resources/com/worksap/nlp/sudachi/dictionary/build/headers-all.csv @@ -1,3 +1,3 @@ -Surface,LeftId,RightId,Cost,pos_id,pos1,pos2,pos3,pos4,pos5,pos6,reading_form,normalized_form,DictionaryForm,mode,splita,splitb,splitc,wordstructure,synonymgroups,userdata -東京都,6,8,5320,,名詞,固有名詞,地名,一般,*,*,トウキョウト,,,B,"東京,0,トウキョウ/都,1,ト","東京,0,トウキョウ/都,2,ト","東京,0,トウキョウ/都,3,ト","東京,0,トウキョウ/都,4,ト",8/9,10 -行く,4,4,5105,,動詞,非自立可能,*,*,五段-カ行,終止形-一般,イク,,,A,,,,,, +Surface,LeftId,RightId,Cost,pos_id,pos1,pos2,pos3,pos4,pos5,pos6,reading_form,normalized_form,DictionaryForm,mode,splita,splitb,splitc,wordstructure,synonymgroups,userdata,Writing +東京都,6,8,5320,,名詞,固有名詞,地名,一般,*,*,トウキョウト,,,B,"東京,0,トウキョウ/都,1,ト","東京,0,トウキョウ/都,2,ト","東京,0,トウキョウ/都,3,ト","東京,0,トウキョウ/都,4,ト",8/9,10,東京都 +行く,4,4,5105,,動詞,非自立可能,*,*,五段-カ行,終止形-一般,イク,,,A,,,,,,, diff --git a/src/test/resources/com/worksap/nlp/sudachi/dictionary/build/legacy-full.csv b/src/test/resources/com/worksap/nlp/sudachi/dictionary/build/legacy-full.csv index caed502c..3271b62d 100644 --- a/src/test/resources/com/worksap/nlp/sudachi/dictionary/build/legacy-full.csv +++ b/src/test/resources/com/worksap/nlp/sudachi/dictionary/build/legacy-full.csv @@ -1 +1 @@ -東京都,6,8,5320,東京都,名詞,固有名詞,地名,一般,*,*,トウキョウト,東京都,*,B,5/9,*,5/9,6/7,8/9,10, \ No newline at end of file +東京都,6,8,5320,東京都,名詞,固有名詞,地名,一般,*,*,トウキョウト,東京都,*,B,5/9,*,5/9,6/7,8/9,10, diff --git a/src/test/resources/com/worksap/nlp/sudachi/dictionary/build/legacy-minimum.csv b/src/test/resources/com/worksap/nlp/sudachi/dictionary/build/legacy-minimum.csv index 8ee89d59..e3e1c9dc 100644 --- a/src/test/resources/com/worksap/nlp/sudachi/dictionary/build/legacy-minimum.csv +++ b/src/test/resources/com/worksap/nlp/sudachi/dictionary/build/legacy-minimum.csv @@ -1 +1 @@ -東京都,6,8,5320,東京都,名詞,固有名詞,地名,一般,*,*,トウキョウト,東京都,*,B,5/9,*,5/9,* \ No newline at end of file +東京都,6,8,5320,東京都,名詞,固有名詞,地名,一般,*,*,トウキョウト,東京都,*,B,5/9,*,5/9,* From 419a1a1dbbb82758e6b9adb55211bfa1d6aa4eb4 Mon Sep 17 00:00:00 2001 From: mh-northlander 
Date: Mon, 18 Nov 2024 11:19:55 +0900 Subject: [PATCH 91/94] try to satisfy sonar a bit --- .../sudachi/dictionary/DictionaryPrinter.java | 10 +++---- .../sudachi/dictionary/build/POSTable.java | 4 +-- .../sudachi/dictionary/build/RawLexicon.java | 27 ++++++++--------- .../dictionary/build/RawLexiconReader.java | 18 +++++------ .../dictionary/build/RawWordEntry.java | 17 +++++------ .../dictionary/build/WordEntryLayout.java | 8 ++--- .../nlp/sudachi/dictionary/build/WordRef.java | 26 ++++++++-------- .../dictionary/DictionaryPrinterTest.kt | 5 ---- .../sudachi/dictionary/build/MemChannel.kt | 1 - .../dictionary/build/RawLexiconReaderTest.kt | 30 ++++++++++++------- .../sudachi/dictionary/build/UserDicTest.kt | 1 - 11 files changed, 74 insertions(+), 73 deletions(-) diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/DictionaryPrinter.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/DictionaryPrinter.java index 4b996e08..37f27ee0 100644 --- a/src/main/java/com/worksap/nlp/sudachi/dictionary/DictionaryPrinter.java +++ b/src/main/java/com/worksap/nlp/sudachi/dictionary/DictionaryPrinter.java @@ -221,8 +221,8 @@ void lastField(String value) { } /** - * encode word entry pointed by the wordId as WordRef.Triple. If it points to - * self, return empty string. + * encode word entry pointed by the wordId as WordRef.RefByTriple. If it points + * to self, return empty string. */ String wordRef(int wordId, int reference) { if (wordId == reference) { @@ -231,7 +231,7 @@ String wordRef(int wordId, int reference) { return wordRef(wordId); } - /** encode word entry pointed by the wordId as WordRef.Triple. */ + /** encode word entry pointed by the wordId as WordRef.RefByTriple. 
*/ String wordRef(int wordId) { WordInfo info = lex.getWordInfo(wordId); int dic = WordId.dic(wordId); @@ -254,7 +254,7 @@ String wordRef(int wordId) { .collect(Collectors.joining(String.valueOf(WordRef.Parser.WORDREF_DELIMITER))); } - /** encode word entry pointed by the wordId as WordRef.Headword. */ + /** encode word entry pointed by the wordId as WordRef.RefByHeadword. */ String wordRefHeadword(int wordId, int reference) { if (wordId == reference) { return ""; @@ -290,7 +290,7 @@ private String maybeEscapeString(String value) { return "\"" + value + "\""; } - /** escape WordRef.Triple part. */ + /** escape WordRef.RefByTriple part. */ private String maybeEscapeRefPart(String value) { boolean hasDelimiter = hasCh(value, RawLexiconReader.LIST_DELIMITER); boolean hasJoiner = hasCh(value, WordRef.Parser.WORDREF_DELIMITER); diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/POSTable.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/POSTable.java index ea2f7b1d..e278d894 100644 --- a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/POSTable.java +++ b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/POSTable.java @@ -214,8 +214,8 @@ public static class POSCSVReader { private List cachedRow; private boolean hasIdColumn = true; - public static final Column[] PART_COLUMNS = { Column.POS1, Column.POS2, Column.POS3, Column.POS4, Column.POS5, - Column.POS6 }; + protected static final Column[] PART_COLUMNS = { Column.POS1, Column.POS2, Column.POS3, Column.POS4, + Column.POS5, Column.POS6 }; public enum Column { POS_ID(false), POS1(true), POS2(true), POS3(true), POS4(true), POS5(true), POS6(true); diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/RawLexicon.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/RawLexicon.java index d594bdce..8911407d 100644 --- a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/RawLexicon.java +++ b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/RawLexicon.java 
@@ -202,23 +202,22 @@ private Void writeEntries(BlockOutput blockOutput) throws IOException { * @return 1 if phantom entry added, 0 otherwise */ private int addPhantomEntries(RawWordEntry entry, List list, EntryLookup lookup) { - if (entry.normalizedForm instanceof WordRef.Headword) { - WordRef.Headword ref = (WordRef.Headword) entry.normalizedForm; - if (lookup.byHeadword(ref.getHeadword()) != null) { - return 0; - } + if (!(entry.normalizedFormRef instanceof WordRef.RefByHeadword)) { + return 0; + } - RawWordEntry phantom = RawWordEntry.makePhantom(entry, ref.getHeadword()); - RawWordEntry last = list.get(list.size() - 1); - phantom.pointer = RawLexicon - .pointer((long) WordInfoList.wordId2offset(last.pointer) + last.computeExpectedSize()); - list.add(phantom); - lookup.add(phantom, isUser); - nPhantomEntries += 1; - return 1; - } else { + WordRef.RefByHeadword ref = (WordRef.RefByHeadword) entry.normalizedFormRef; + if (lookup.byHeadword(ref.getHeadword()) != null) { return 0; } + + RawWordEntry phantom = RawWordEntry.makePhantom(entry, ref.getHeadword()); + RawWordEntry last = list.get(list.size() - 1); + phantom.pointer = pointer((long) WordInfoList.wordId2offset(last.pointer) + last.computeExpectedSize()); + list.add(phantom); + lookup.add(phantom, isUser); + nPhantomEntries += 1; + return 1; } /** @return number of entries in the TRIE index */ diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/RawLexiconReader.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/RawLexiconReader.java index 46d8a92c..4af28861 100644 --- a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/RawLexiconReader.java +++ b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/RawLexiconReader.java @@ -242,15 +242,15 @@ private WordRef getWordRef(List data, Column column, WordRef.Parser refP // if parsed ref seems to refering current entry, return self-reference (null), // because headword/triple ref may resolved to other entry. 
- if (ref instanceof WordRef.Headword) { - WordRef.Headword headword = (WordRef.Headword) ref; - if (headword.getHeadword().equals(entry.headword())) { + if (ref instanceof WordRef.RefByHeadword) { + WordRef.RefByHeadword refbyHeadword = (WordRef.RefByHeadword) ref; + if (refbyHeadword.getHeadword().equals(entry.headword())) { return null; } - } else if (ref instanceof WordRef.Triple) { - WordRef.Triple triple = (WordRef.Triple) ref; - if (triple.getHeadword().equals(entry.headword()) && triple.getPosId() == entry.posId - && triple.getReading().equals(entry.reading)) { + } else if (ref instanceof WordRef.RefByTriple) { + WordRef.RefByTriple refbyTriple = (WordRef.RefByTriple) ref; + if (refbyTriple.getHeadword().equals(entry.headword()) && refbyTriple.getPosId() == entry.posId + && refbyTriple.getReading().equals(entry.reading)) { return null; } } @@ -306,8 +306,8 @@ private RawWordEntry convertEntry(List data) { entry.posId = getPos(data); // writing, pos, reading must be parsed before these to resolve wordref. 
- entry.normalizedForm = getWordRef(data, Column.NORMALIZED_FORM, normRefParser, entry); - entry.dictionaryForm = getWordRef(data, Column.DICTIONARY_FORM, dictRefParser, entry); + entry.normalizedFormRef = getWordRef(data, Column.NORMALIZED_FORM, normRefParser, entry); + entry.dictionaryFormRef = getWordRef(data, Column.DICTIONARY_FORM, dictRefParser, entry); entry.mode = get(data, Column.MODE, false); entry.aUnitSplit = getWordRefs(data, Column.SPLIT_A, splitParser); diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/RawWordEntry.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/RawWordEntry.java index 91f362e8..fcac6487 100644 --- a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/RawWordEntry.java +++ b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/RawWordEntry.java @@ -37,8 +37,8 @@ public class RawWordEntry implements EntryLookup.Entry { short cost; short posId; String reading; - WordRef normalizedForm; - WordRef dictionaryForm; + WordRef normalizedFormRef; + WordRef dictionaryFormRef; String mode; List aUnitSplit; List bUnitSplit; @@ -109,8 +109,8 @@ public void validate() { checkString(surface, "surface"); checkString(writing, "writing"); checkString(reading, "reading"); - if (normalizedForm instanceof WordRef.Headword) { - checkString(((WordRef.Headword) normalizedForm).getHeadword(), "normalized form"); + if (normalizedFormRef instanceof WordRef.RefByHeadword) { + checkString(((WordRef.RefByHeadword) normalizedFormRef).getHeadword(), "normalized form"); } } @@ -124,9 +124,8 @@ public void publishStrings(StringStorage strings) { // surface is used only for indexing and is not necessary to store strings.add(writing); strings.add(reading); - if (normalizedForm instanceof WordRef.Headword) { - WordRef.Headword normalized = (WordRef.Headword) normalizedForm; - strings.add(normalized.getHeadword()); + if (normalizedFormRef instanceof WordRef.RefByHeadword) { + strings.add(((WordRef.RefByHeadword) 
normalizedFormRef).getHeadword()); } } @@ -173,8 +172,8 @@ public static RawWordEntry makePhantom(RawWordEntry base, String surface) { // other data should be equivalent to the base entry entry.posId = base.posId; entry.reading = base.reading; - entry.dictionaryForm = base.dictionaryForm; - entry.normalizedForm = null; // refer to itself + entry.dictionaryFormRef = base.dictionaryFormRef; + entry.normalizedFormRef = null; // refer to itself entry.mode = base.mode; entry.aUnitSplit = base.aUnitSplit; entry.bUnitSplit = base.bUnitSplit; diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/WordEntryLayout.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/WordEntryLayout.java index a1181cfd..22843005 100644 --- a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/WordEntryLayout.java +++ b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/WordEntryLayout.java @@ -68,9 +68,9 @@ public int put(RawWordEntry entry) throws IOException { buf.putInt(index.resolve(entry.writing).encode()); // surface StringPtr buf.putInt(index.resolve(entry.reading).encode()); // reading StringPtr - int selfWordRef = isUser ? WordId.make(1, entry.pointer) : entry.pointer; - int normFormPtr = entry.normalizedForm == null ? selfWordRef : entry.normalizedForm.resolve(lookup); - int dictFormPtr = entry.dictionaryForm == null ? selfWordRef : entry.dictionaryForm.resolve(lookup); + int selfPtr = isUser ? WordId.make(1, entry.pointer) : entry.pointer; + int normFormPtr = entry.normalizedFormRef == null ? selfPtr : entry.normalizedFormRef.resolve(lookup); + int dictFormPtr = entry.dictionaryFormRef == null ? 
selfPtr : entry.dictionaryFormRef.resolve(lookup); buf.putInt(normFormPtr); // normalized form WordRef buf.putInt(dictFormPtr); // dictionary form WordRef // 8 + 4*4 = 24 bytes @@ -107,7 +107,7 @@ public int put(RawWordEntry entry) throws IOException { } int position = this.buffer.alignTo(WordInfoList.OFFSET_ALIGNMENT); - return RawLexicon.pointer(position); + return WordInfoList.offset2wordId(position); } /** diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/WordRef.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/WordRef.java index 69e59eec..4a33b560 100644 --- a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/WordRef.java +++ b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/WordRef.java @@ -50,11 +50,11 @@ public int intoWordRef(EntryLookup.EntryWithFlag entry) { /** * Reference written by line number of the lexicon csv file. */ - public static final class LineNo extends WordRef { + public static final class RefByLineNo extends WordRef { private final int line; private final boolean isUser; - public LineNo(int line, boolean isUser) { + public RefByLineNo(int line, boolean isUser) { this.line = line; this.isUser = isUser; } @@ -79,7 +79,7 @@ public boolean equals(Object other) { return true; if (other == null || getClass() != other.getClass()) return false; - LineNo o = (LineNo) other; + RefByLineNo o = (RefByLineNo) other; return (line == o.line) && (isUser == o.isUser); } @@ -93,10 +93,10 @@ public int hashCode() { /** * Reference written by surface. 
*/ - public static final class Headword extends WordRef { + public static final class RefByHeadword extends WordRef { private final String headword; - public Headword(String headword) { + public RefByHeadword(String headword) { this.headword = headword; } @@ -121,7 +121,7 @@ public boolean equals(Object other) { return true; if (other == null || getClass() != other.getClass()) return false; - Headword o = (Headword) other; + RefByHeadword o = (RefByHeadword) other; return headword.equals(o.headword); } @@ -134,12 +134,12 @@ public int hashCode() { /** * Reference written by surface-pos-reading tuple. */ - public static final class Triple extends WordRef { + public static final class RefByTriple extends WordRef { private final String headword; private final short posId; private final String reading; - public Triple(String headword, short posId, String reading) { + public RefByTriple(String headword, short posId, String reading) { this.headword = headword; this.posId = posId; this.reading = reading; @@ -182,7 +182,7 @@ public boolean equals(Object other) { return true; if (other == null || getClass() != other.getClass()) return false; - Triple o = (Triple) other; + RefByTriple o = (RefByTriple) other; return (headword.equals(o.headword)) && (posId == o.posId) && (reading.equals(o.reading)); } @@ -225,7 +225,7 @@ public WordRef parse(String text) { boolean isUser = text.charAt(0) == 'U'; int offset = isUser ? 
1 : 0; int lineNum = Integer.parseInt(text.substring(offset)); - return new LineNo(lineNum, isUser); + return new RefByLineNo(lineNum, isUser); } if (StringUtil.count(text, WORDREF_DELIMITER) == 7) { @@ -238,7 +238,7 @@ public WordRef parse(String text) { POS pos = new POS(posElems); short posId = posTable.getId(pos); String reading = Unescape.unescape(cols[7]); - return new Triple(headword, posId, reading); + return new RefByTriple(headword, posId, reading); } if (StringUtil.count(text, WORDREF_DELIMITER) == 2) { @@ -246,11 +246,11 @@ public WordRef parse(String text) { String headword = Unescape.unescape(cols[0]); short posId = Short.parseShort(cols[1]); String reading = Unescape.unescape(cols[2]); - return new Triple(headword, posId, reading); + return new RefByTriple(headword, posId, reading); } if (allowHeadword) { - return new Headword(Unescape.unescape(text)); + return new RefByHeadword(Unescape.unescape(text)); } else { throw new IllegalArgumentException(String.format("invalid word reference: %s", text)); } diff --git a/src/test/java/com/worksap/nlp/sudachi/dictionary/DictionaryPrinterTest.kt b/src/test/java/com/worksap/nlp/sudachi/dictionary/DictionaryPrinterTest.kt index e20f42f4..d142dc75 100644 --- a/src/test/java/com/worksap/nlp/sudachi/dictionary/DictionaryPrinterTest.kt +++ b/src/test/java/com/worksap/nlp/sudachi/dictionary/DictionaryPrinterTest.kt @@ -26,7 +26,6 @@ import java.io.ByteArrayOutputStream import java.io.FileOutputStream import java.io.OutputStream import java.io.PrintStream -import java.nio.file.Files import java.nio.file.Path import java.util.Arrays import kotlin.io.path.createTempDirectory @@ -183,8 +182,6 @@ class DictionaryPrinterTest { printDictionary(output1, "system.dic") output1.close() - val printed = Files.readString(lexfile).split(System.lineSeparator()) - val dicfile2 = tempDir.resolve("system.dic2") val reload = MemChannel() DicBuilder.system().matrix(res("/dict/matrix.def")).lexicon(lexfile).build(reload) @@ -234,8 +231,6 
@@ class DictionaryPrinterTest { printDictionary(output1, "user.dic", TestDictionary.systemDict) output1.close() - val printed = Files.readString(lexfile).split(System.lineSeparator()) - val dicfile2 = tempDir.resolve("user.dic2") val reload = MemChannel() DicBuilder.user().system(TestDictionary.systemDict).lexicon(lexfile).build(reload) diff --git a/src/test/java/com/worksap/nlp/sudachi/dictionary/build/MemChannel.kt b/src/test/java/com/worksap/nlp/sudachi/dictionary/build/MemChannel.kt index f609ab7e..46c6f709 100644 --- a/src/test/java/com/worksap/nlp/sudachi/dictionary/build/MemChannel.kt +++ b/src/test/java/com/worksap/nlp/sudachi/dictionary/build/MemChannel.kt @@ -50,7 +50,6 @@ class MemChannel(bufSize: Int = 1024 * 1024) : SeekableByteChannel { val remaining = src!!.remaining() reserve(remaining) buffer.put(src) - val pos = buffer.position().toLong() return remaining } diff --git a/src/test/java/com/worksap/nlp/sudachi/dictionary/build/RawLexiconReaderTest.kt b/src/test/java/com/worksap/nlp/sudachi/dictionary/build/RawLexiconReaderTest.kt index c4a5ee93..b5152226 100644 --- a/src/test/java/com/worksap/nlp/sudachi/dictionary/build/RawLexiconReaderTest.kt +++ b/src/test/java/com/worksap/nlp/sudachi/dictionary/build/RawLexiconReaderTest.kt @@ -48,7 +48,8 @@ class RawLexiconReaderTest { assertEquals("東京都", e.surface) assertEquals("東京都", e.headword()) assertEquals("トウキョウト", e.reading) - assertEquals(listOf(WordRef.LineNo(5, false), WordRef.LineNo(9, false)), e.wordStructure) + assertEquals( + listOf(WordRef.RefByLineNo(5, false), WordRef.RefByLineNo(9, false)), e.wordStructure) assertEquals(0, e.synonymGroups.length()) assertTrue(e.cUnitSplit.isEmpty()) assertEquals("", e.userData) @@ -63,9 +64,11 @@ class RawLexiconReaderTest { assertEquals("東京都", e.surface) assertEquals("東京都", e.headword()) assertEquals("トウキョウト", e.reading) - assertEquals(listOf(WordRef.LineNo(5, false), WordRef.LineNo(9, false)), e.wordStructure) + assertEquals( + listOf(WordRef.RefByLineNo(5, 
false), WordRef.RefByLineNo(9, false)), e.wordStructure) assertEquals(Ints.wrap(intArrayOf(6, 7)), e.synonymGroups) - assertEquals(listOf(WordRef.LineNo(8, false), WordRef.LineNo(9, false)), e.cUnitSplit) + assertEquals( + listOf(WordRef.RefByLineNo(8, false), WordRef.RefByLineNo(9, false)), e.cUnitSplit) assertEquals("10", e.userData) } assertNull(reader.nextEntry()) @@ -79,11 +82,14 @@ class RawLexiconReaderTest { assertEquals("東京都", e.headword()) // surface is used for missing writing assertEquals("トウキョウト", e.reading) assertEquals( - listOf(WordRef.Triple("東京", 0, "トウキョウ"), WordRef.Triple("都", 1, "ト")), e.aUnitSplit) + listOf(WordRef.RefByTriple("東京", 0, "トウキョウ"), WordRef.RefByTriple("都", 1, "ト")), + e.aUnitSplit) assertEquals( - listOf(WordRef.Triple("東京", 0, "トウキョウ"), WordRef.Triple("都", 2, "ト")), e.bUnitSplit) + listOf(WordRef.RefByTriple("東京", 0, "トウキョウ"), WordRef.RefByTriple("都", 2, "ト")), + e.bUnitSplit) assertEquals( - listOf(WordRef.Triple("東京", 0, "トウキョウ"), WordRef.Triple("都", 3, "ト")), e.wordStructure) + listOf(WordRef.RefByTriple("東京", 0, "トウキョウ"), WordRef.RefByTriple("都", 3, "ト")), + e.wordStructure) } assertNotNull(reader.nextEntry()) assertNull(reader.nextEntry()) @@ -97,13 +103,17 @@ class RawLexiconReaderTest { assertEquals("東京都", e.headword()) assertEquals("トウキョウト", e.reading) assertEquals( - listOf(WordRef.Triple("東京", 0, "トウキョウ"), WordRef.Triple("都", 1, "ト")), e.aUnitSplit) + listOf(WordRef.RefByTriple("東京", 0, "トウキョウ"), WordRef.RefByTriple("都", 1, "ト")), + e.aUnitSplit) assertEquals( - listOf(WordRef.Triple("東京", 0, "トウキョウ"), WordRef.Triple("都", 2, "ト")), e.bUnitSplit) + listOf(WordRef.RefByTriple("東京", 0, "トウキョウ"), WordRef.RefByTriple("都", 2, "ト")), + e.bUnitSplit) assertEquals( - listOf(WordRef.Triple("東京", 0, "トウキョウ"), WordRef.Triple("都", 3, "ト")), e.cUnitSplit) + listOf(WordRef.RefByTriple("東京", 0, "トウキョウ"), WordRef.RefByTriple("都", 3, "ト")), + e.cUnitSplit) assertEquals( - listOf(WordRef.Triple("東京", 0, "トウキョウ"), WordRef.Triple("都", 4, 
"ト")), e.wordStructure) + listOf(WordRef.RefByTriple("東京", 0, "トウキョウ"), WordRef.RefByTriple("都", 4, "ト")), + e.wordStructure) assertEquals(Ints.wrap(intArrayOf(8, 9)), e.synonymGroups) assertEquals("10", e.userData) } diff --git a/src/test/java/com/worksap/nlp/sudachi/dictionary/build/UserDicTest.kt b/src/test/java/com/worksap/nlp/sudachi/dictionary/build/UserDicTest.kt index e8db5746..8d64de2c 100644 --- a/src/test/java/com/worksap/nlp/sudachi/dictionary/build/UserDicTest.kt +++ b/src/test/java/com/worksap/nlp/sudachi/dictionary/build/UserDicTest.kt @@ -157,7 +157,6 @@ class UserDicTest { @Test fun variousWordReferences() { - val dictData = MemChannel() val dic = TestDic() .systemUrl(javaClass.getResource("wordref.csv")) From af781bbb2e96e5fae4d778bde6842f4b1620a783 Mon Sep 17 00:00:00 2001 From: mh-northlander Date: Tue, 19 Nov 2024 09:48:23 +0900 Subject: [PATCH 92/94] rename surface -> indexForm and writing -> headword --- docs/user_dict.md | 25 +++--- .../worksap/nlp/sudachi/LatticeNodeImpl.java | 12 +-- .../sudachi/dictionary/DictionaryPrinter.java | 40 +++++---- .../dictionary/DoubleArrayLexicon.java | 6 +- .../nlp/sudachi/dictionary/WordInfo.java | 38 ++++----- .../nlp/sudachi/dictionary/WordInfoList.java | 4 +- .../dictionary/build/CompiledWordEntry.java | 2 +- .../sudachi/dictionary/build/EntryLookup.java | 4 +- .../nlp/sudachi/dictionary/build/Index.java | 11 +-- .../sudachi/dictionary/build/RawLexicon.java | 4 +- .../dictionary/build/RawLexiconReader.java | 10 +-- .../dictionary/build/RawWordEntry.java | 28 +++---- .../dictionary/build/WordEntryLayout.java | 4 +- .../nlp/sudachi/dictionary/build/WordRef.java | 15 ++-- .../dictionary/DictionaryBuilderTest.kt | 8 +- .../dictionary/DictionaryPrinterTest.kt | 38 +++++---- .../dictionary/DoubleArrayLexiconTest.kt | 24 +++--- .../dictionary/UserDictionaryBuilderTest.java | 4 +- .../dictionary/build/RawLexiconReaderTest.kt | 62 +++++++------- .../sudachi/dictionary/build/headers-all.csv | 2 +- 
.../dictionary/build/headers-minimum.csv | 2 +- .../sudachi/dictionary/build/wordref-user.csv | 2 +- .../nlp/sudachi/dictionary/build/wordref.csv | 2 +- src/test/resources/dict/lex.csv | 84 +++++++++---------- src/test/resources/dict/user.csv | 2 +- src/test/resources/dict/user2.csv | 2 +- 26 files changed, 226 insertions(+), 209 deletions(-) diff --git a/docs/user_dict.md b/docs/user_dict.md index 8f9c0078..b33e578b 100644 --- a/docs/user_dict.md +++ b/docs/user_dict.md @@ -8,11 +8,11 @@ Sudachi ではユーザー辞書をもちいて、システム辞書で不足し このファイルは、CSV 形式 (RFC 4180) で保存します。文字コードは、UTF-8 を使用します。 ユーザー辞書ソースに記載できる項目は以下の通りです。 -- [0 Surface: 解析用見出し](#0-surface-解析用見出し) +- [0 Index_Form: 解析用表記](#0-index_form-解析用表記) - [1 Left_Id: 左連接 ID](#1-left_id-左連接-id) - [2 Right_Id: 右連接 ID](#2-right_id-右連接-id) - [3 Cost: コスト](#3-cost-コスト) -- [4 Writing: 表記形](#4-writing-表記形) +- [4 Headword: 見出し表記](#4-headword-見出し表記) - [5 POS1: 品詞 1](#5-pos1-品詞-1) - [6 POS2: 品詞 2](#6-pos2-品詞-2) - [7 POS3: 品詞 3](#7-pos3-品詞-3) @@ -52,7 +52,7 @@ Sudachi ではユーザー辞書をもちいて、システム辞書で不足し 一部の項目では、辞書内の他の語への参照を記述することがあります。 以下ではこれを語参照と呼称します。 -語参照は対象語の「[表記形](#4-writing-表記形), 品詞, [読み](#11-reading_form-読み)」を "," (カンマ) で区切った文字列で記述します。 +語参照は対象語の「[見出し表記](#4-headword-見出し表記), 品詞, [読み](#11-reading_form-読み)」を "," (カンマ) で区切った文字列で記述します。 品詞は [POS1-POS6 の6つ組](#5-pos1-品詞-1) もしくは [POS_Id](#21-pos_id-品詞-id) のどちらかを使用します。 例: `東京,名詞,固有名詞,地名,一般,*,*,トウキョウ`, `東京,1349,トウキョウ` @@ -62,16 +62,16 @@ Sudachi ではユーザー辞書をもちいて、システム辞書で不足し 語参照の対象語は別途記述されている必要があります。 対象語がシステム辞書内にあるかユーザー辞書内にあるかは自動的に判別します(システム辞書のものが優先されます)。 -### 0 Surface: 解析用見出し +### 0 Index_Form: 解析用表記 -形態素解析に使用される見出し表記です。 +形態素解析に使用される見出しの表記です。 表記の長さは、255 文字まで登録できます。 #### 文字正規化 -解析用見出しは、「Sudachi の文字正規化がおこなわれた後の形」で登録してください。 +解析用表記は、「Sudachi の文字正規化がおこなわれた後の形」で登録してください。 -Sudachi では、文字正規化が行われた**後に**見出しを引きます。そのため、「正規化後に現れない形」で見出しが表記されている場合、その語はどのような場合でもマッチすることがありません。例えば、「ラテン文字の大文字」で見出しを表記した場合、Sudachi 内部では正規化後の「小文字」になったもので見出しを探すため、この大文字のものとマッチすることがありません。 +Sudachi 
では、文字正規化が行われた**後に**見出しを引きます。そのため、「正規化後に現れない形」で解析用表記が記述されている場合、その語はどのような場合でもマッチすることがありません。例えば、「ラテン文字の大文字」で解析用表記を記述した場合、解析時には Sudachi 内部では正規化後の「小文字」になったもので見出しを探すため、この大文字のものとマッチすることがありません。 Sudachi では、以下の文字正規化を行っています。挙動の詳細は、[Sudachi ドキュメントの該当箇所](https://github.com/WorksApplications/Sudachi#%E6%96%87%E5%AD%97%E6%AD%A3%E8%A6%8F%E5%8C%96)を参照してください。 @@ -79,7 +79,7 @@ Sudachi では、以下の文字正規化を行っています。挙動の詳細 - NFKC をつかった Unicode 正規化 - ただし、設定ファイル `rewrite.def` に定義される抑制、置換が優先 -ユーザー辞書の解析用見出しへは、文字正規化は自動的には適用されません。これは、ユーザーが想定しづらい挙動を避けるためです。そのため、ユーザー辞書の作成者が文字正規化を意識して語を表記する必要があります。 +ユーザー辞書の解析用表記へは、文字正規化は自動的には適用されません。これは、ユーザーが想定しづらい挙動を避けるためです。そのため、ユーザー辞書の作成者が文字正規化を意識して語を表記する必要があります。 ### 1 Left_Id: 左連接 ID @@ -122,12 +122,13 @@ Sudachi では、以下の文字正規化を行っています。挙動の詳細 名詞類の登録であれば、"5000 ~ 9000" を推奨 -### 4 Writing: 表記形 +### 4 Headword: 見出し表記 -[文字正規化](#文字正規化)を行う前の語の表記です。 +[文字正規化](#文字正規化)を行う前の見出しの表記です。 +語参照の解決時や正規化形にはこちらが使用されます。 項目の記載を省略することができます。 -項目が省略される、もしくは空欄の場合は、[解析用見出し](#0-surface-解析用見出し) を代用します。 +項目が省略される、もしくは空欄の場合は、[解析用表記](#0-index_form-解析用表記) を代用します。 ### 5 POS1: 品詞 1 @@ -188,7 +189,7 @@ POS1: 品詞 1 を参照してください。 表記にぶれのある語に対して、その語の正規化形を指定するための情報です。 -対象となる語への[語参照](#語参照)もしくはその語の[表記形](#4-writing-表記形)を記述します。 +対象となる語への[語参照](#語参照)もしくはその語の[見出し表記](#4-headword-見出し表記)を記述します。 表記形のみで記述された場合、対象となる語が記述されていない場合でも文字列のみを正規化形として登録します。 「表記形=正規化形」である場合は、空欄とすることができます。 diff --git a/src/main/java/com/worksap/nlp/sudachi/LatticeNodeImpl.java b/src/main/java/com/worksap/nlp/sudachi/LatticeNodeImpl.java index 5b08960f..e9fa22aa 100644 --- a/src/main/java/com/worksap/nlp/sudachi/LatticeNodeImpl.java +++ b/src/main/java/com/worksap/nlp/sudachi/LatticeNodeImpl.java @@ -283,9 +283,9 @@ public String getSurface(LatticeNodeImpl node) { String s = surface; if (s == null) { WordInfo wi = node.getWordInfo(); - int surfacePtr = wi.getSurface(); + int headwordPtr = wi.getHeadword(); int dic = WordId.dic(node.getWordId()); - s = lexicon.string(dic, surfacePtr); + s = lexicon.string(dic, headwordPtr); surface = s; } return s; @@ -309,8 
+309,8 @@ public String getNormalizedForm(LatticeNodeImpl node) { WordInfo wi = node.getWordInfo(); int wordref = wi.getNormalizedForm(); int dic = WordId.refDic(wordref, WordId.dic(node.wordId)); - int surfacePtr = lexicon.wordInfos(dic).surfacePtr(WordId.word(wordref)); - s = lexicon.string(dic, surfacePtr); + int headwordPtr = lexicon.wordInfos(dic).headwordPtr(WordId.word(wordref)); + s = lexicon.string(dic, headwordPtr); normalizedForm = s; } return s; @@ -322,8 +322,8 @@ public String getDictionaryForm(LatticeNodeImpl node) { WordInfo wi = node.getWordInfo(); int wordref = wi.getDictionaryForm(); int dic = WordId.refDic(wordref, WordId.dic(node.wordId)); - int surfacePtr = lexicon.wordInfos(dic).surfacePtr(WordId.word(wordref)); - s = lexicon.string(dic, surfacePtr); + int headwordPtr = lexicon.wordInfos(dic).headwordPtr(WordId.word(wordref)); + s = lexicon.string(dic, headwordPtr); dictionaryForm = s; } return s; diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/DictionaryPrinter.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/DictionaryPrinter.java index 37f27ee0..835e112c 100644 --- a/src/main/java/com/worksap/nlp/sudachi/dictionary/DictionaryPrinter.java +++ b/src/main/java/com/worksap/nlp/sudachi/dictionary/DictionaryPrinter.java @@ -60,8 +60,8 @@ public enum POSMode { /** * WordRef print mode * - * TRIPLE_PARTS: print as (surface, pos1, .., pos6, reading) tuple. TRIPLE_ID: - * print as (surface, pos-id, reading) tuple. + * TRIPLE_PARTS: print as (headword, pos1, .., pos6, reading) tuple. TRIPLE_ID: + * print as (headword, pos-id, reading) tuple. */ public enum WordRefMode { TRIPLE_PARTS, TRIPLE_ID; @@ -89,7 +89,7 @@ public enum WordRefMode { // in order to output dictionary entries in in-dictionary order we need to sort // them. iterator over them will get them not in the sorted order, but grouped - // by surface (and sorted in groups). + // by index-form (and sorted in groups). 
DoubleArrayLexicon targetLex = dic.getLexicon(); Ints allIds = new Ints(targetLex.size()); Iterator ids = targetLex.wordIds(0); @@ -138,7 +138,8 @@ void printHeader() { } List headerColumns = Stream - .of(Arrays.asList(Column.SURFACE, Column.LEFT_ID, Column.RIGHT_ID, Column.COST), posColumns, + .of(Arrays.asList(Column.INDEX_FORM, Column.LEFT_ID, Column.RIGHT_ID, Column.COST, Column.HEADWORD), + posColumns, Arrays.asList(Column.READING_FORM, Column.NORMALIZED_FORM, Column.DICTIONARY_FORM, Column.SPLIT_A, Column.SPLIT_B, Column.SPLIT_C, Column.WORD_STRUCTURE, Column.SYNONYM_GROUPS, Column.USER_DATA)) @@ -173,16 +174,20 @@ private void printEntries() { void printEntry(int wordId) { int dic = WordId.dic(wordId); WordInfo info = lex.getWordInfo(wordId); + + String headword = lex.string(dic, info.getHeadword()); + String indexForm = headword; // TODO: need normalization + field(indexForm); + + long params = lex.parameters(wordId); + field(WordParameters.leftId(params)); + field(WordParameters.rightId(params)); + field(WordParameters.cost(params)); + + field(headword.equals(indexForm) ? 
"" : headword); + short posId = info.getPOSId(); POS pos = grammar.getPartOfSpeechString(posId); - long params = lex.parameters(wordId); - short leftId = WordParameters.leftId(params); - short rightId = WordParameters.rightId(params); - short cost = WordParameters.cost(params); - field(lex.string(dic, info.getSurface())); - field(leftId); - field(rightId); - field(cost); if (posMode == POSMode.ID || posMode == POSMode.BOTH) { field(posId); } @@ -194,13 +199,16 @@ void printEntry(int wordId) { field(pos.get(4)); field(pos.get(5)); } + field(lex.string(dic, info.getReadingForm())); field(wordRefHeadword(info.getNormalizedForm(), wordId)); field(wordRef(info.getDictionaryForm(), wordId)); + field(wordRefList(info.getAunitSplit())); field(wordRefList(info.getBunitSplit())); field(wordRefList(info.getCunitSplit())); field(wordRefList(info.getWordStructure())); + field(intList(info.getSynonymGroupIds())); lastField(info.getUserData()); output.print("\n"); @@ -235,17 +243,17 @@ String wordRef(int wordId, int reference) { String wordRef(int wordId) { WordInfo info = lex.getWordInfo(wordId); int dic = WordId.dic(wordId); - String surface = lex.string(dic, info.getSurface()); + String headword = lex.string(dic, info.getHeadword()); short posId = info.getPOSId(); String reading = lex.string(dic, info.getReadingForm()); List parts; if (wordRefMode == WordRefMode.TRIPLE_ID) { - parts = Arrays.asList(surface, String.valueOf(posId), reading); + parts = Arrays.asList(headword, String.valueOf(posId), reading); } else { POS pos = grammar.getPartOfSpeechString(posId); parts = new ArrayList<>(1 + POS.DEPTH + 1); - parts.add(surface); + parts.add(headword); parts.addAll(pos); parts.add(reading); } @@ -261,7 +269,7 @@ String wordRefHeadword(int wordId, int reference) { } int dic = WordId.dic(wordId); WordInfo info = lex.getWordInfo(wordId); - return lex.string(dic, info.getSurface()); + return lex.string(dic, info.getHeadword()); } String wordRefList(int[] wordIds) { diff --git 
a/src/main/java/com/worksap/nlp/sudachi/dictionary/DoubleArrayLexicon.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/DoubleArrayLexicon.java index 423d9a2e..b59011bc 100644 --- a/src/main/java/com/worksap/nlp/sudachi/dictionary/DoubleArrayLexicon.java +++ b/src/main/java/com/worksap/nlp/sudachi/dictionary/DoubleArrayLexicon.java @@ -174,9 +174,9 @@ public void calculateDynamicCosts(Tokenizer tokenizer) { if (isNormalCost(WordParameters.cost(parameters(wordId)))) { continue; } - int surfPtr = wordInfos.surfacePtr(wordId); - String surface = strings.string(surfPtr); - MorphemeList ms = tokenizer.tokenize(surface); + int headwordPtr = wordInfos.headwordPtr(wordId); + String headword = strings.string(headwordPtr); + MorphemeList ms = tokenizer.tokenize(headword); int cost = ms.getInternalCost() + USER_DICT_COST_PAR_MORPH * ms.size(); if (cost > Short.MAX_VALUE) { cost = Short.MAX_VALUE; diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/WordInfo.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/WordInfo.java index 8f5e5930..b7ad2d5b 100644 --- a/src/main/java/com/worksap/nlp/sudachi/dictionary/WordInfo.java +++ b/src/main/java/com/worksap/nlp/sudachi/dictionary/WordInfo.java @@ -27,9 +27,9 @@ * This class holds morpheme data which is not used in the viterbi search. 
*/ public class WordInfo { - private final short headwordLength; + private final short indexFormLength; private short posId; - private final int surface; // StringPtr + private final int headword; // StringPtr private final int reading; // StringPtr private final int normalizedForm; // word ref private final int dictionaryForm; // word ref @@ -40,12 +40,12 @@ public class WordInfo { private final int[] synonymGids; private final String userData; - public WordInfo(short headwordLength, short posId, int surface, int reading, int normalizedForm, int dictionaryForm, - int[] aUnitSplit, int[] bUnitSplit, int[] cUnitSplit, int[] wordStructure, int[] synonymGids, - String userData) { - this.headwordLength = headwordLength; + public WordInfo(short indexFormLength, short posId, int headword, int reading, int normalizedForm, + int dictionaryForm, int[] aUnitSplit, int[] bUnitSplit, int[] cUnitSplit, int[] wordStructure, + int[] synonymGids, String userData) { + this.indexFormLength = indexFormLength; this.posId = posId; - this.surface = surface; + this.headword = headword; this.reading = reading; this.normalizedForm = normalizedForm; this.dictionaryForm = dictionaryForm; @@ -61,15 +61,15 @@ public WordInfo(short headwordLength, short posId, int surface, int reading, int * Allocates morpheme information for ones not in the lexicon. For example, * OOVs. 
* - * @param headwordLength + * @param indexFormLength * the length of the morpheme * @param posId * the ID of the part-of-speech of the morpheme */ - public WordInfo(short headwordLength, short posId) { - this.headwordLength = headwordLength; + public WordInfo(short indexFormLength, short posId) { + this.indexFormLength = indexFormLength; this.posId = posId; - this.surface = 0; + this.headword = 0; this.normalizedForm = 0; this.dictionaryForm = 0; this.reading = 0; @@ -87,8 +87,8 @@ public WordInfo(short headwordLength, short posId) { * @return raw string pointer to the text * @see StringPtr */ - public int getSurface() { - return surface; + public int getHeadword() { + return headword; } /** @@ -97,12 +97,12 @@ public int getSurface() { *

* This length is used to place a node in the * {@link com.worksap.nlp.sudachi.Lattice}, does not equals - * {@code getSurface().length()}. + * {@code getHeadword().length()}. * * @return the length of the text */ public short getLength() { - return headwordLength; + return indexFormLength; } /** @@ -226,13 +226,13 @@ public String getUserData() { } /** - * Read StringPtr to the surface form directly. + * Read StringPtr to the headword form directly. * * @param buffer * @param pos * @return */ - public static int surfaceForm(ByteBuffer buffer, int pos) { + public static int headwordForm(ByteBuffer buffer, int pos) { return buffer.getInt(pos + 8); } @@ -253,13 +253,13 @@ private WordInfo(ByteBuffer buffer, int pos) { // do not modify buffer metadata for better performance posId = buffer.getShort(pos + 6); - surface = surfaceForm(buffer, pos); // +8 + headword = headwordForm(buffer, pos); // +8 reading = readingForm(buffer, pos); // +12 normalizedForm = buffer.getInt(pos + 16); dictionaryForm = buffer.getInt(pos + 20); long rest = buffer.getLong(pos + 24); - headwordLength = (short) (rest & 0xffff); + indexFormLength = (short) (rest & 0xffff); rest >>>= 16; if (rest == 0) { cUnitSplit = Ints.EMPTY_ARRAY; diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/WordInfoList.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/WordInfoList.java index 0a24e62b..813e9f50 100644 --- a/src/main/java/com/worksap/nlp/sudachi/dictionary/WordInfoList.java +++ b/src/main/java/com/worksap/nlp/sudachi/dictionary/WordInfoList.java @@ -40,8 +40,8 @@ public WordInfo getWordInfo(int wordId) { return WordInfo.read(bytes, wordId2offset(wordId)); } - public int surfacePtr(int wordId) { - return WordInfo.surfaceForm(bytes, wordId2offset(wordId)); + public int headwordPtr(int wordId) { + return WordInfo.headwordForm(bytes, wordId2offset(wordId)); } public int readingPtr(int wordId) { diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/CompiledWordEntry.java 
b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/CompiledWordEntry.java index 8809f7a6..081c8055 100644 --- a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/CompiledWordEntry.java +++ b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/CompiledWordEntry.java @@ -59,6 +59,6 @@ public boolean matches(short posId, String reading) { @Override public String headword() { WordInfo wi = wordInfo(); - return lexicon.string(0, wi.getSurface()); + return lexicon.string(0, wi.getHeadword()); } } diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/EntryLookup.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/EntryLookup.java index 08a81f79..fba47e20 100644 --- a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/EntryLookup.java +++ b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/EntryLookup.java @@ -22,7 +22,7 @@ import java.util.Map; /** - * Utility to lookup entries from the list. + * Utility to lookup entries from the list. Used to resolve {@link WordRef}. */ public class EntryLookup { public interface Entry { @@ -104,7 +104,7 @@ public EntryWithFlag byIndex(int index, boolean isUser) { } /** - * Lookup entries by the headword (surface). + * Lookup entries by the headword. * * @param headword * @return diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/Index.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/Index.java index 590e2272..6cd9e3bb 100644 --- a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/Index.java +++ b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/Index.java @@ -28,9 +28,10 @@ /** * Dictionary Parts: Trie index and corresponding word id table. * - * TRIE maps headwords to offset for WordIdTable. WordIdTable contains the list - * of word-ids of words which have the target headword. WordId here means offset - * in WordEntryTable (with last n bits dropped). + * TRIE maps index-forms to offset for WordIdTable. 
WordIdTable contains the + * list of word-ids of words which have the target index-form. WordId here means + * offset in WordEntryTable (with last n bits dropped as defined in + * {@link com.worksap.nlp.sudachi.dictionary.WordInfoList}). * * WordIdTable also contins word-ids that are not indexed in TRIE, so that we * can iterate over all word entries. @@ -48,7 +49,7 @@ public class Index { }); /** - * Add a (headword, wordid) pair to the index + * Add a (index-form, wordid) pair to the index * * @param key * @param wordId @@ -124,7 +125,7 @@ private TrieData writeWordTable(BlockOutput out, List data) { /** convert csv row to RawWordEntry */ private RawWordEntry convertEntry(List data) { RawWordEntry entry = new RawWordEntry(); - entry.surface = getNonEmpty(data, Column.SURFACE, true); - String writing = get(data, Column.WRITING, true); - entry.writing = writing.isEmpty() ? entry.surface : writing; + entry.indexForm = getNonEmpty(data, Column.INDEX_FORM, true); + String headword = get(data, Column.HEADWORD, true); + entry.headword = headword.isEmpty() ? entry.indexForm : headword; entry.leftId = getShort(data, Column.LEFT_ID); entry.rightId = getShort(data, Column.RIGHT_ID); @@ -305,7 +305,7 @@ private RawWordEntry convertEntry(List data) { entry.reading = get(data, Column.READING_FORM, true); entry.posId = getPos(data); - // writing, pos, reading must be parsed before these to resolve wordref. + // headword, pos, reading must be parsed before these to resolve wordref. 
entry.normalizedFormRef = getWordRef(data, Column.NORMALIZED_FORM, normRefParser, entry); entry.dictionaryFormRef = getWordRef(data, Column.DICTIONARY_FORM, dictRefParser, entry); diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/RawWordEntry.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/RawWordEntry.java index fcac6487..0d670535 100644 --- a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/RawWordEntry.java +++ b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/RawWordEntry.java @@ -30,8 +30,8 @@ @SuppressWarnings("jol") public class RawWordEntry implements EntryLookup.Entry { int pointer; // wordid, compressed offset of this entry in the lexicon.WordEntries - String surface; - String writing; + String indexForm; + String headword; short leftId; short rightId; short cost; @@ -92,8 +92,7 @@ public boolean matches(short posId, String reading) { @Override public String headword() { - // use writing for entry lookup - return writing; + return headword; } private void checkString(String value, String name) { @@ -106,8 +105,8 @@ private void checkString(String value, String name) { /** check if sudachi dictionary can handle this entry */ public void validate() { - checkString(surface, "surface"); - checkString(writing, "writing"); + checkString(indexForm, "index form"); + checkString(headword, "headword"); checkString(reading, "reading"); if (normalizedFormRef instanceof WordRef.RefByHeadword) { checkString(((WordRef.RefByHeadword) normalizedFormRef).getHeadword(), "normalized form"); @@ -121,9 +120,10 @@ public void validate() { * storage to publish strings. 
*/ public void publishStrings(StringStorage strings) { - // surface is used only for indexing and is not necessary to store - strings.add(writing); + strings.add(headword); strings.add(reading); + // allow referring a non-existing word in the normalized form, which needs to be + // published if (normalizedFormRef instanceof WordRef.RefByHeadword) { strings.add(((WordRef.RefByHeadword) normalizedFormRef).getHeadword()); } @@ -134,8 +134,8 @@ public void publishStrings(StringStorage strings) { */ public static RawWordEntry makeEmpty() { RawWordEntry entry = new RawWordEntry(); - entry.surface = ""; - entry.writing = ""; + entry.indexForm = ""; + entry.headword = ""; entry.leftId = -1; entry.rightId = -1; entry.cost = Short.MAX_VALUE; @@ -158,11 +158,11 @@ public static RawWordEntry makeEmpty() { * Create phantom entry, that is referred for the normalized form of the base * entry. */ - public static RawWordEntry makePhantom(RawWordEntry base, String surface) { + public static RawWordEntry makePhantom(RawWordEntry base, String headword) { RawWordEntry entry = new RawWordEntry(); - // keep surface empty, phantom entry will only be accessed via normalized form - entry.surface = ""; - entry.writing = surface; + // keep index form empty, phantom entry will only be accessed via word reference + entry.indexForm = ""; + entry.headword = headword; // phantom entry should not be used in the analysis entry.leftId = -1; diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/WordEntryLayout.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/WordEntryLayout.java index 22843005..47a8bcb7 100644 --- a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/WordEntryLayout.java +++ b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/WordEntryLayout.java @@ -66,7 +66,7 @@ public int put(RawWordEntry entry) throws IOException { buf.putShort(entry.posId); // 2*4 = 8 bytes - buf.putInt(index.resolve(entry.writing).encode()); // surface StringPtr + 
buf.putInt(index.resolve(entry.headword).encode()); // headword StringPtr buf.putInt(index.resolve(entry.reading).encode()); // reading StringPtr int selfPtr = isUser ? WordId.make(1, entry.pointer) : entry.pointer; int normFormPtr = entry.normalizedFormRef == null ? selfPtr : entry.normalizedFormRef.resolve(lookup); @@ -76,7 +76,7 @@ public int put(RawWordEntry entry) throws IOException { // 8 + 4*4 = 24 bytes // length can't be more than ~4k utf-16 code units so the cast is safe - short utf8Len = (short) StringUtil.countUtf8Bytes(entry.surface); + short utf8Len = (short) StringUtil.countUtf8Bytes(entry.indexForm); byte cSplitLen = resolveWordRefList(entry.cUnitSplit, null, cSplits); byte bSplitLen = resolveWordRefList(entry.bUnitSplit, entry.cUnitSplit, bSplits); byte aSplitLen = resolveWordRefList(entry.aUnitSplit, entry.bUnitSplit, aSplits); diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/WordRef.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/WordRef.java index 4a33b560..0b87072d 100644 --- a/src/main/java/com/worksap/nlp/sudachi/dictionary/build/WordRef.java +++ b/src/main/java/com/worksap/nlp/sudachi/dictionary/build/WordRef.java @@ -91,7 +91,7 @@ public int hashCode() { } /** - * Reference written by surface. + * Reference written by headword. */ public static final class RefByHeadword extends WordRef { private final String headword; @@ -107,6 +107,8 @@ public String getHeadword() { @Override public int resolve(EntryLookup resolver) { List entries = resolver.byHeadword(headword); + // Use the first entry. This is ok since RefByHeadword is only allowed for the + // normalized form and only the headword of the referred entry will be used. return intoWordRef(entries.get(0)); } @@ -132,7 +134,7 @@ public int hashCode() { } /** - * Reference written by surface-pos-reading tuple. + * Reference written by headword-pos-reading tuple. 
*/ public static final class RefByTriple extends WordRef { private final String headword; @@ -173,7 +175,7 @@ public int resolve(EntryLookup resolver) { @Override public String toString() { - return String.format("WordRef: %s/%d/%s", headword, posId, reading); + return String.format("WordRef/Triple: %s/%d/%s", headword, posId, reading); } @Override @@ -228,6 +230,7 @@ public WordRef parse(String text) { return new RefByLineNo(lineNum, isUser); } + // triple, pos is written as 6-parts if (StringUtil.count(text, WORDREF_DELIMITER) == 7) { String[] cols = text.split(String.valueOf(WORDREF_DELIMITER), 8); String headword = Unescape.unescape(cols[0]); @@ -241,6 +244,7 @@ public WordRef parse(String text) { return new RefByTriple(headword, posId, reading); } + // triple, pos is written as pos-id if (StringUtil.count(text, WORDREF_DELIMITER) == 2) { String[] cols = text.split(String.valueOf(WORDREF_DELIMITER), 3); String headword = Unescape.unescape(cols[0]); @@ -251,10 +255,9 @@ public WordRef parse(String text) { if (allowHeadword) { return new RefByHeadword(Unescape.unescape(text)); - } else { - throw new IllegalArgumentException(String.format("invalid word reference: %s", text)); } - } + throw new IllegalArgumentException(String.format("invalid word reference: %s", text)); + } } } diff --git a/src/test/java/com/worksap/nlp/sudachi/dictionary/DictionaryBuilderTest.kt b/src/test/java/com/worksap/nlp/sudachi/dictionary/DictionaryBuilderTest.kt index 6325a6e5..0dafaf75 100644 --- a/src/test/java/com/worksap/nlp/sudachi/dictionary/DictionaryBuilderTest.kt +++ b/src/test/java/com/worksap/nlp/sudachi/dictionary/DictionaryBuilderTest.kt @@ -83,7 +83,7 @@ class DictionaryBuilderTest { assertEquals(0, WordParameters.leftId(params)) assertEquals(100, WordParameters.cost(params)) var wi = lexicon.getWordInfo(wordId) - assertEquals("東京都", lexicon.string(0, wi.getSurface())) + assertEquals("東京都", lexicon.string(0, wi.getHeadword())) assertEquals("ヒガシキョウト", lexicon.string(0, 
wi.getReadingForm())) assertEquals(WordId.make(0, wordId), wi.getNormalizedForm()) assertEquals(WordId.make(0, wordId), wi.getDictionaryForm()) @@ -103,7 +103,7 @@ class DictionaryBuilderTest { assertEquals(-1, WordParameters.leftId(params)) assertEquals(200, WordParameters.cost(params)) wi = lexicon.getWordInfo(wordId) - assertEquals("東", lexicon.string(0, wi.getSurface())) + assertEquals("東", lexicon.string(0, wi.getHeadword())) assertEquals("ヒガシ", lexicon.string(0, wi.getReadingForm())) assertEquals(WordId.make(0, wordIds[3]), wi.getNormalizedForm()) assertEquals(WordId.make(0, wordId), wi.getDictionaryForm()) @@ -130,7 +130,7 @@ class DictionaryBuilderTest { inputFile .toFile() .writeText( - """Surface,leftId,rightId,cost,writing,posId,readingform,normalizedform,dictionaryform,mode,splitA,splitB,wordstructure,synonymgroups + """Index_Form,leftId,rightId,cost,Headword,posId,readingform,normalizedform,dictionaryform,mode,splitA,splitB,wordstructure,synonymgroups 東京都,0,0,100,東京都,1,ヒガシキョウト,東京都,,B,"東,名詞,普通名詞,一般,*,*,*,ヒガシ/京都,1,キョウト",,"東,名詞,普通名詞,一般,*,*,*,ヒガシ/京都,1,キョウト",1/2 東,-1,-1,200,東,0,ヒガシ,ひがし,,A,,,, 京都,0,0,300,京都,1,キョウト,京都,,A,,,,""") @@ -167,7 +167,7 @@ class DictionaryBuilderTest { assertEquals(0, WordParameters.leftId(params)) assertEquals(100, WordParameters.cost(params)) var wi = lexicon.getWordInfo(wordId) - assertEquals("東京都", lexicon.string(0, wi.getSurface())) + assertEquals("東京都", lexicon.string(0, wi.getHeadword())) assertEquals("ヒガシキョウト", lexicon.string(0, wi.getReadingForm())) assertEquals(WordId.make(0, wordId), wi.getNormalizedForm()) assertEquals(WordId.make(0, wordId), wi.getDictionaryForm()) diff --git a/src/test/java/com/worksap/nlp/sudachi/dictionary/DictionaryPrinterTest.kt b/src/test/java/com/worksap/nlp/sudachi/dictionary/DictionaryPrinterTest.kt index d142dc75..7785ad37 100644 --- a/src/test/java/com/worksap/nlp/sudachi/dictionary/DictionaryPrinterTest.kt +++ b/src/test/java/com/worksap/nlp/sudachi/dictionary/DictionaryPrinterTest.kt @@ 
-65,7 +65,7 @@ class DictionaryPrinterTest { fun wordInfoString(lex: DoubleArrayLexicon, wordId: Int): String { val wi = lex.getWordInfo(wordId) - return "${wordId}, ${lex.string(0, wi.getSurface())}, ${wi.getLength()}, ${wi.getPOSId()}, ${wi.getNormalizedForm()}, ${wi.getDictionaryForm()}, ${lex.string(0, wi.getReadingForm())}, ${Arrays.toString(wi.getAunitSplit())}, ${Arrays.toString(wi.getBunitSplit())}, ${Arrays.toString(wi.getCunitSplit())}, ${Arrays.toString(wi.getWordStructure())}, ${Arrays.toString(wi.getSynonymGroupIds())}, ${wi.getUserData()}" + return "${wordId}, ${lex.string(0, wi.getHeadword())}, ${wi.getLength()}, ${wi.getPOSId()}, ${wi.getNormalizedForm()}, ${wi.getDictionaryForm()}, ${lex.string(0, wi.getReadingForm())}, ${Arrays.toString(wi.getAunitSplit())}, ${Arrays.toString(wi.getBunitSplit())}, ${Arrays.toString(wi.getCunitSplit())}, ${Arrays.toString(wi.getWordStructure())}, ${Arrays.toString(wi.getSynonymGroupIds())}, ${wi.getUserData()}" } @Test @@ -76,13 +76,15 @@ class DictionaryPrinterTest { assertEquals(43, lines.size) // header + entries + trailing new line assertEquals( - "SURFACE,LEFT_ID,RIGHT_ID,COST,POS1,POS2,POS3,POS4,POS5,POS6,READING_FORM,NORMALIZED_FORM,DICTIONARY_FORM,SPLIT_A,SPLIT_B,SPLIT_C,WORD_STRUCTURE,SYNONYM_GROUPS,USER_DATA", + "INDEX_FORM,LEFT_ID,RIGHT_ID,COST,HEADWORD,POS1,POS2,POS3,POS4,POS5,POS6,READING_FORM,NORMALIZED_FORM,DICTIONARY_FORM,SPLIT_A,SPLIT_B,SPLIT_C,WORD_STRUCTURE,SYNONYM_GROUPS,USER_DATA", lines[0]) - assertEquals("た,1,1,8729,助動詞,*,*,*,助動詞-タ,終止形-一般,タ,,,,,,,,", lines[1]) - assertEquals("に,2,2,11406,助詞,接続助詞,*,*,*,*,ニ,,,,,,,,", lines[2]) + assertEquals("た,1,1,8729,,助動詞,*,*,*,助動詞-タ,終止形-一般,タ,,,,,,,,", lines[1]) + assertEquals("に,2,2,11406,,助詞,接続助詞,*,*,*,*,ニ,,,,,,,,", lines[2]) assertEquals( - "東京都,6,8,5320,名詞,固有名詞,地名,一般,*,*,トウキョウト,,,\"東京,名詞,固有名詞,地名,一般,*,*,トウキョウ/都,名詞,普通名詞,一般,*,*,*,ト\",,,\"東京,名詞,固有名詞,地名,一般,*,*,トウキョウ/都,名詞,普通名詞,一般,*,*,*,ト\",,", + 
"東京都,6,8,5320,,名詞,固有名詞,地名,一般,*,*,トウキョウト,,,\"東京,名詞,固有名詞,地名,一般,*,*,トウキョウ/都,名詞,普通名詞,一般,*,*,*,ト\",,,\"東京,名詞,固有名詞,地名,一般,*,*,トウキョウ/都,名詞,普通名詞,一般,*,*,*,ト\",,", lines[7]) + // assertEquals("特a,8,8,2914,特A,名詞,普通名詞,一般,*,*,*,トクエー,,,,,,,", lines[38]) // TODO: after #241 + // fixed } @Test @@ -93,10 +95,10 @@ class DictionaryPrinterTest { assertEquals(43, lines.size) // header + entries + trailing new line assertEquals( - "SURFACE,LEFT_ID,RIGHT_ID,COST,POS_ID,READING_FORM,NORMALIZED_FORM,DICTIONARY_FORM,SPLIT_A,SPLIT_B,SPLIT_C,WORD_STRUCTURE,SYNONYM_GROUPS,USER_DATA", + "INDEX_FORM,LEFT_ID,RIGHT_ID,COST,HEADWORD,POS_ID,READING_FORM,NORMALIZED_FORM,DICTIONARY_FORM,SPLIT_A,SPLIT_B,SPLIT_C,WORD_STRUCTURE,SYNONYM_GROUPS,USER_DATA", lines[0]) - assertEquals("た,1,1,8729,0,タ,,,,,,,,", lines[1]) - assertEquals("に,2,2,11406,1,ニ,,,,,,,,", lines[2]) + assertEquals("た,1,1,8729,,0,タ,,,,,,,,", lines[1]) + assertEquals("に,2,2,11406,,1,ニ,,,,,,,,", lines[2]) } @Test @@ -107,10 +109,10 @@ class DictionaryPrinterTest { assertEquals(43, lines.size) // header + entries + trailing new line assertEquals( - "SURFACE,LEFT_ID,RIGHT_ID,COST,POS_ID,POS1,POS2,POS3,POS4,POS5,POS6,READING_FORM,NORMALIZED_FORM,DICTIONARY_FORM,SPLIT_A,SPLIT_B,SPLIT_C,WORD_STRUCTURE,SYNONYM_GROUPS,USER_DATA", + "INDEX_FORM,LEFT_ID,RIGHT_ID,COST,HEADWORD,POS_ID,POS1,POS2,POS3,POS4,POS5,POS6,READING_FORM,NORMALIZED_FORM,DICTIONARY_FORM,SPLIT_A,SPLIT_B,SPLIT_C,WORD_STRUCTURE,SYNONYM_GROUPS,USER_DATA", lines[0]) - assertEquals("た,1,1,8729,0,助動詞,*,*,*,助動詞-タ,終止形-一般,タ,,,,,,,,", lines[1]) - assertEquals("に,2,2,11406,1,助詞,接続助詞,*,*,*,*,ニ,,,,,,,,", lines[2]) + assertEquals("た,1,1,8729,,0,助動詞,*,*,*,助動詞-タ,終止形-一般,タ,,,,,,,,", lines[1]) + assertEquals("に,2,2,11406,,1,助詞,接続助詞,*,*,*,*,ニ,,,,,,,,", lines[2]) } @Test @@ -121,10 +123,10 @@ class DictionaryPrinterTest { assertEquals(43, lines.size) // header + entries + trailing new line assertEquals( - 
"SURFACE,LEFT_ID,RIGHT_ID,COST,POS1,POS2,POS3,POS4,POS5,POS6,READING_FORM,NORMALIZED_FORM,DICTIONARY_FORM,SPLIT_A,SPLIT_B,SPLIT_C,WORD_STRUCTURE,SYNONYM_GROUPS,USER_DATA", + "INDEX_FORM,LEFT_ID,RIGHT_ID,COST,HEADWORD,POS1,POS2,POS3,POS4,POS5,POS6,READING_FORM,NORMALIZED_FORM,DICTIONARY_FORM,SPLIT_A,SPLIT_B,SPLIT_C,WORD_STRUCTURE,SYNONYM_GROUPS,USER_DATA", lines[0]) assertEquals( - "東京都,6,8,5320,名詞,固有名詞,地名,一般,*,*,トウキョウト,,,\"東京,3,トウキョウ/都,4,ト\",,,\"東京,3,トウキョウ/都,4,ト\",,", + "東京都,6,8,5320,,名詞,固有名詞,地名,一般,*,*,トウキョウト,,,\"東京,3,トウキョウ/都,4,ト\",,,\"東京,3,トウキョウ/都,4,ト\",,", lines[7]) } @@ -136,12 +138,12 @@ class DictionaryPrinterTest { assertEquals(6, lines.size) // header + entries + trailing new line assertEquals( - "SURFACE,LEFT_ID,RIGHT_ID,COST,POS1,POS2,POS3,POS4,POS5,POS6,READING_FORM,NORMALIZED_FORM,DICTIONARY_FORM,SPLIT_A,SPLIT_B,SPLIT_C,WORD_STRUCTURE,SYNONYM_GROUPS,USER_DATA", + "INDEX_FORM,LEFT_ID,RIGHT_ID,COST,HEADWORD,POS1,POS2,POS3,POS4,POS5,POS6,READING_FORM,NORMALIZED_FORM,DICTIONARY_FORM,SPLIT_A,SPLIT_B,SPLIT_C,WORD_STRUCTURE,SYNONYM_GROUPS,USER_DATA", lines[0]) assertEquals( - "東京府,6,6,2816,名詞,固有名詞,地名,一般,*,*,トウキョウフ,,,\"東京,名詞,固有名詞,地名,一般,*,*,トウキョウ/府,名詞,普通名詞,一般,*,*,*,フ\",,,\"東京,名詞,固有名詞,地名,一般,*,*,トウキョウ/府,名詞,普通名詞,一般,*,*,*,フ\",1/3,", + "東京府,6,6,2816,,名詞,固有名詞,地名,一般,*,*,トウキョウフ,,,\"東京,名詞,固有名詞,地名,一般,*,*,トウキョウ/府,名詞,普通名詞,一般,*,*,*,フ\",,,\"東京,名詞,固有名詞,地名,一般,*,*,トウキョウ/府,名詞,普通名詞,一般,*,*,*,フ\",1/3,", lines[3]) - assertEquals("すだち,6,6,2816,被子植物門,双子葉植物綱,ムクロジ目,ミカン科,ミカン属,スダチ,スダチ,,,,,,,,徳島県産", lines[4]) + assertEquals("すだち,6,6,2816,,被子植物門,双子葉植物綱,ムクロジ目,ミカン科,ミカン属,スダチ,スダチ,,,,,,,,徳島県産", lines[4]) } @Test @@ -156,10 +158,10 @@ class DictionaryPrinterTest { assertEquals(6, lines.size) // header + entries + trailing new line assertEquals( - "SURFACE,LEFT_ID,RIGHT_ID,COST,POS1,POS2,POS3,POS4,POS5,POS6,READING_FORM,NORMALIZED_FORM,DICTIONARY_FORM,SPLIT_A,SPLIT_B,SPLIT_C,WORD_STRUCTURE,SYNONYM_GROUPS,USER_DATA", + 
"INDEX_FORM,LEFT_ID,RIGHT_ID,COST,HEADWORD,POS1,POS2,POS3,POS4,POS5,POS6,READING_FORM,NORMALIZED_FORM,DICTIONARY_FORM,SPLIT_A,SPLIT_B,SPLIT_C,WORD_STRUCTURE,SYNONYM_GROUPS,USER_DATA", lines[0]) assertEquals( - "東京府,6,6,2816,名詞,固有名詞,地名,一般,*,*,トウキョウフ,,,\"東京,3,トウキョウ/府,4,フ\",,,\"東京,3,トウキョウ/府,4,フ\",1/3,", + "東京府,6,6,2816,,名詞,固有名詞,地名,一般,*,*,トウキョウフ,,,\"東京,3,トウキョウ/府,4,フ\",,,\"東京,3,トウキョウ/府,4,フ\",1/3,", lines[3]) } diff --git a/src/test/java/com/worksap/nlp/sudachi/dictionary/DoubleArrayLexiconTest.kt b/src/test/java/com/worksap/nlp/sudachi/dictionary/DoubleArrayLexiconTest.kt index 136ec402..0109a2a1 100644 --- a/src/test/java/com/worksap/nlp/sudachi/dictionary/DoubleArrayLexiconTest.kt +++ b/src/test/java/com/worksap/nlp/sudachi/dictionary/DoubleArrayLexiconTest.kt @@ -94,11 +94,11 @@ class DoubleArrayLexiconTest { fun wordInfo() { // た var wi = lexicon.getWordInfo(getWordId(0)) - assertEquals("た", lexicon.string(0, wi.getSurface())) + assertEquals("た", lexicon.string(0, wi.getHeadword())) assertEquals(3, wi.getLength()) assertEquals(0, wi.getPOSId()) - assertEquals("た", lexicon.string(0, lexicon.getWordInfo(wi.getNormalizedForm()).getSurface())) - assertEquals("た", lexicon.string(0, lexicon.getWordInfo(wi.getDictionaryForm()).getSurface())) + assertEquals("た", lexicon.string(0, lexicon.getWordInfo(wi.getNormalizedForm()).getHeadword())) + assertEquals("た", lexicon.string(0, lexicon.getWordInfo(wi.getDictionaryForm()).getHeadword())) assertEquals("タ", lexicon.string(0, wi.getReadingForm())) assertEquals(listOf(), wi.getAunitSplit().toList()) assertEquals(listOf(), wi.getBunitSplit().toList()) @@ -106,13 +106,13 @@ class DoubleArrayLexiconTest { // 行っ wi = lexicon.getWordInfo(getWordId(8)) - assertEquals("行っ", lexicon.string(0, wi.getSurface())) - assertEquals("行く", lexicon.string(0, lexicon.getWordInfo(wi.getNormalizedForm()).getSurface())) - assertEquals("行く", lexicon.string(0, lexicon.getWordInfo(wi.getDictionaryForm()).getSurface())) + assertEquals("行っ", 
lexicon.string(0, wi.getHeadword())) + assertEquals("行く", lexicon.string(0, lexicon.getWordInfo(wi.getNormalizedForm()).getHeadword())) + assertEquals("行く", lexicon.string(0, lexicon.getWordInfo(wi.getDictionaryForm()).getHeadword())) // 東京都 wi = lexicon.getWordInfo(getWordId(6)) - assertEquals("東京都", lexicon.string(0, wi.getSurface())) + assertEquals("東京都", lexicon.string(0, wi.getHeadword())) assertEquals(listOf(getWordId(5), getWordId(9)), wi.getAunitSplit().toList()) assertEquals(listOf(), wi.getBunitSplit().toList()) assertEquals(listOf(getWordId(5), getWordId(9)), wi.getWordStructure().toList()) @@ -127,7 +127,7 @@ class DoubleArrayLexiconTest { // すだち val wi = userlex.getWordInfo(18) - assertEquals("すだち", userlex.string(0, wi.getSurface())) + assertEquals("すだち", userlex.string(0, wi.getHeadword())) assertEquals(8, wi.getPOSId()) assertEquals("徳島県産", wi.getUserData()) } @@ -136,12 +136,14 @@ class DoubleArrayLexiconTest { fun wordInfoLong() { // 0123456789 * 30 val wi = lexicon.getWordInfo(getWordId(36)) - val surface = lexicon.string(0, wi.getSurface()) + val surface = lexicon.string(0, wi.getHeadword()) assertEquals(300, surface.length) assertEquals(300, wi.getLength()) - val normalizedform = lexicon.string(0, lexicon.getWordInfo(wi.getNormalizedForm()).getSurface()) + val normalizedform = + lexicon.string(0, lexicon.getWordInfo(wi.getNormalizedForm()).getHeadword()) assertEquals(300, normalizedform.length) - val dictionaryform = lexicon.string(0, lexicon.getWordInfo(wi.getDictionaryForm()).getSurface()) + val dictionaryform = + lexicon.string(0, lexicon.getWordInfo(wi.getDictionaryForm()).getHeadword()) assertEquals(300, dictionaryform.length) val readingform = lexicon.string(0, wi.getReadingForm()) assertEquals(570, readingform.length) diff --git a/src/test/java/com/worksap/nlp/sudachi/dictionary/UserDictionaryBuilderTest.java b/src/test/java/com/worksap/nlp/sudachi/dictionary/UserDictionaryBuilderTest.java index 5ca512f9..d27cf740 100644 --- 
a/src/test/java/com/worksap/nlp/sudachi/dictionary/UserDictionaryBuilderTest.java +++ b/src/test/java/com/worksap/nlp/sudachi/dictionary/UserDictionaryBuilderTest.java @@ -79,7 +79,7 @@ public void commandLine() throws IOException { assertThat(WordParameters.leftId(param), is((short) 0)); assertThat(WordParameters.cost(param), is((short) 0)); WordInfo info = lexicon.getWordInfo(wordId); - assertThat(lexicon.string(0, info.getSurface()), is("東京都市")); + assertThat(lexicon.string(0, info.getHeadword()), is("東京都市")); assertThat(info.getNormalizedForm(), is(WordId.make(1, wordId))); assertThat(info.getDictionaryForm(), is(WordId.make(1, wordId))); assertThat(lexicon.string(0, info.getReadingForm()), is("ヒガシキョウトシ")); @@ -99,7 +99,7 @@ public void commandLine() throws IOException { assertThat(WordParameters.leftId(param), is((short) -1)); assertThat(WordParameters.cost(param), is((short) 0)); info = lexicon.getWordInfo(wordId); - assertThat(lexicon.string(0, info.getSurface()), is("市")); + assertThat(lexicon.string(0, info.getHeadword()), is("市")); assertThat(info.getNormalizedForm(), is(WordId.make(1, wordId))); assertThat(info.getDictionaryForm(), is(WordId.make(1, wordId))); assertThat(lexicon.string(0, info.getReadingForm()), is("シ")); diff --git a/src/test/java/com/worksap/nlp/sudachi/dictionary/build/RawLexiconReaderTest.kt b/src/test/java/com/worksap/nlp/sudachi/dictionary/build/RawLexiconReaderTest.kt index b5152226..580fd364 100644 --- a/src/test/java/com/worksap/nlp/sudachi/dictionary/build/RawLexiconReaderTest.kt +++ b/src/test/java/com/worksap/nlp/sudachi/dictionary/build/RawLexiconReaderTest.kt @@ -45,7 +45,7 @@ class RawLexiconReaderTest { fun legacyCsvWithMinimumFields() { val reader = RawLexiconReader(csvfile("legacy-minimum.csv"), POSTable()) assertNotNull(reader.nextEntry()).let { e -> - assertEquals("東京都", e.surface) + assertEquals("東京都", e.indexForm) assertEquals("東京都", e.headword()) assertEquals("トウキョウト", e.reading) assertEquals( @@ -61,7 +61,7 @@ 
class RawLexiconReaderTest { fun legacyCsvWithAllFields() { val reader = RawLexiconReader(csvfile("legacy-full.csv"), POSTable()) assertNotNull(reader.nextEntry()).let { e -> - assertEquals("東京都", e.surface) + assertEquals("東京都", e.indexForm) assertEquals("東京都", e.headword()) assertEquals("トウキョウト", e.reading) assertEquals( @@ -78,8 +78,8 @@ class RawLexiconReaderTest { fun headerCsvMinimumFields() { val reader = RawLexiconReader(csvfile("headers-minimum.csv"), POSTable()) assertNotNull(reader.nextEntry()).let { e -> - assertEquals("東京都", e.surface) - assertEquals("東京都", e.headword()) // surface is used for missing writing + assertEquals("東京都", e.indexForm) + assertEquals("東京都", e.headword()) // indexForm is used for missing headword assertEquals("トウキョウト", e.reading) assertEquals( listOf(WordRef.RefByTriple("東京", 0, "トウキョウ"), WordRef.RefByTriple("都", 1, "ト")), @@ -99,7 +99,7 @@ class RawLexiconReaderTest { fun headerCsvAllFields() { val reader = RawLexiconReader(csvfile("headers-all.csv"), POSTable()) assertNotNull(reader.nextEntry()).let { e -> - assertEquals("東京都", e.surface) + assertEquals("東京都", e.indexForm) assertEquals("東京都", e.headword()) assertEquals("トウキョウト", e.reading) assertEquals( @@ -122,16 +122,16 @@ class RawLexiconReaderTest { } @Test - fun parseWriting() { + fun parseHeadword() { val text = - """Surface,LeftId,RightId,Cost,writing,pos_id,reading_form,normalized_form,DictionaryForm,splita,splitb,wordstructure + """IndexForm,LeftId,RightId,Cost,headword,pos_id,reading_form,normalized_form,DictionaryForm,splita,splitb,wordstructure abc,0,0,1000,AbC,0,トウキョウト,,,,,""" val posTable = POSTable() posTable.getId(POS("a", "a", "a", "a", "a", "0")) val reader = RawLexiconReader(csvtext(text), posTable) assertNotNull(reader.nextEntry()).let { e -> - assertEquals("abc", e.surface) + assertEquals("abc", e.indexForm) assertEquals("AbC", e.headword()) } assertNull(reader.nextEntry()) @@ -141,7 +141,7 @@ abc,0,0,1000,AbC,0,トウキョウト,,,,,""" fun 
failMissingRequiredEntry() { // pos1-6 are not required (because of posId), but must be used as a set val columns = - "Surface,LeftId,RightId,Cost,pos1,pos2,pos3,pos4,pos5,pos6,reading_form,normalized_form,DictionaryForm,splita,splitb,wordstructure".split( + "IndexForm,LeftId,RightId,Cost,pos1,pos2,pos3,pos4,pos5,pos6,reading_form,normalized_form,DictionaryForm,splita,splitb,wordstructure".split( ",") val values = "東京都,6,8,5320,名詞,固有名詞,地名,一般,*,*,トウキョウト,,,,,".split(",") @@ -159,7 +159,7 @@ abc,0,0,1000,AbC,0,トウキョウト,,,,,""" @Test fun posIdOnly() { val text = - """Surface,LeftId,RightId,Cost,pos_id,reading_form,normalized_form,DictionaryForm,splita,splitb,wordstructure + """IndexForm,LeftId,RightId,Cost,pos_id,reading_form,normalized_form,DictionaryForm,splita,splitb,wordstructure 東京都,6,8,5320,0,トウキョウト,,,,,""" val posTable = POSTable() posTable.getId(POS("a", "a", "a", "a", "a", "0")) @@ -172,7 +172,7 @@ abc,0,0,1000,AbC,0,トウキョウト,,,,,""" @Test fun failNonExistingPosId() { val text = - """Surface,LeftId,RightId,Cost,pos_id,reading_form,normalized_form,DictionaryForm,splita,splitb,wordstructure + """IndexForm,LeftId,RightId,Cost,pos_id,reading_form,normalized_form,DictionaryForm,splita,splitb,wordstructure 東京都,6,8,5320,1,トウキョウト,,,,,""" val posTable = POSTable() posTable.getId(POS("a", "a", "a", "a", "a", "0")) @@ -185,7 +185,7 @@ abc,0,0,1000,AbC,0,トウキョウト,,,,,""" @Test fun posIdAndParts() { val text = - """Surface,LeftId,RightId,Cost,pos1,pos2,pos3,pos4,pos5,pos6,pos_id,reading_form,normalized_form,DictionaryForm,splita,splitb,wordstructure + """IndexForm,LeftId,RightId,Cost,pos1,pos2,pos3,pos4,pos5,pos6,pos_id,reading_form,normalized_form,DictionaryForm,splita,splitb,wordstructure 東京都,6,8,5320,名詞,固有名詞,地名,一般,*,*,0,トウキョウト,,,,,""" val posTable = POSTable() @@ -197,7 +197,7 @@ abc,0,0,1000,AbC,0,トウキョウト,,,,,""" @Test fun posIdAndEmptyParts() { val text = - 
"""Surface,LeftId,RightId,Cost,pos1,pos2,pos3,pos4,pos5,pos6,pos_id,reading_form,normalized_form,DictionaryForm,splita,splitb,wordstructure + """IndexForm,LeftId,RightId,Cost,pos1,pos2,pos3,pos4,pos5,pos6,pos_id,reading_form,normalized_form,DictionaryForm,splita,splitb,wordstructure 東京都,6,8,5320,,,,,,,0,トウキョウト,,,,,""" val posTable = POSTable() posTable.getId(POS("a", "a", "a", "a", "a", "0")) @@ -210,7 +210,7 @@ abc,0,0,1000,AbC,0,トウキョウト,,,,,""" @Test fun posPartsAndEmptyPosId() { val text = - """Surface,LeftId,RightId,Cost,pos_id,pos1,pos2,pos3,pos4,pos5,pos6,reading_form,normalized_form,DictionaryForm,splita,splitb,wordstructure + """IndexForm,LeftId,RightId,Cost,pos_id,pos1,pos2,pos3,pos4,pos5,pos6,reading_form,normalized_form,DictionaryForm,splita,splitb,wordstructure 東京都,6,8,5320,0,名詞,固有名詞,地名,一般,*,*,トウキョウト,,,,,""" val posTable = POSTable() @@ -222,7 +222,7 @@ abc,0,0,1000,AbC,0,トウキョウト,,,,,""" @Test fun failPosIdAndPartsNotMatch() { val text = - """Surface,LeftId,RightId,Cost,pos1,pos2,pos3,pos4,pos5,pos6,pos_id,reading_form,normalized_form,DictionaryForm,splita,splitb,wordstructure + """IndexForm,LeftId,RightId,Cost,pos1,pos2,pos3,pos4,pos5,pos6,pos_id,reading_form,normalized_form,DictionaryForm,splita,splitb,wordstructure 東京都,6,8,5320,名詞,固有名詞,地名,一般,*,*,0,トウキョウト,,,,,""" val posTable = POSTable() posTable.getId(POS("a", "a", "a", "a", "a", "0")) @@ -236,7 +236,7 @@ abc,0,0,1000,AbC,0,トウキョウト,,,,,""" @Test fun failPosColumnMissing() { val text = - """Surface,LeftId,RightId,Cost,reading_form,normalized_form,DictionaryForm,splita,splitb,wordstructure + """IndexForm,LeftId,RightId,Cost,reading_form,normalized_form,DictionaryForm,splita,splitb,wordstructure 東京都,6,8,5320,トウキョウト,,,,,""" val posTable = POSTable() posTable.getId(POS("a", "a", "a", "a", "a", "0")) @@ -250,7 +250,7 @@ abc,0,0,1000,AbC,0,トウキョウト,,,,,""" @Test fun failPosColumnEmpty() { val text = - 
"""Surface,LeftId,RightId,Cost,pos1,pos2,pos3,pos4,pos5,pos6,pos_id,reading_form,normalized_form,DictionaryForm,splita,splitb,wordstructure + """IndexForm,LeftId,RightId,Cost,pos1,pos2,pos3,pos4,pos5,pos6,pos_id,reading_form,normalized_form,DictionaryForm,splita,splitb,wordstructure 東京都,6,8,5320,,,,,,,,トウキョウト,,,,,""" val posTable = POSTable() posTable.getId(POS("a", "a", "a", "a", "a", "0")) @@ -266,28 +266,28 @@ abc,0,0,1000,AbC,0,トウキョウト,,,,,""" val oversizeWord = "a".repeat(StringPtr.MAX_LENGTH + 1) run { val text = - """Surface,LeftId,RightId,Cost,pos1,pos2,pos3,pos4,pos5,pos6,reading_form,normalized_form,DictionaryForm,splita,splitb,splitC,wordstructure + """IndexForm,LeftId,RightId,Cost,pos1,pos2,pos3,pos4,pos5,pos6,reading_form,normalized_form,DictionaryForm,splita,splitb,splitC,wordstructure ${oversizeWord},6,8,5320,名詞,固有名詞,地名,一般,*,*,トウキョウト,,,1,,,""" val reader = RawLexiconReader(csvtext(text), POSTable()) assertFails { reader.nextEntry() } } run { val text = - """Surface,LeftId,RightId,Cost,writing,pos1,pos2,pos3,pos4,pos5,pos6,reading_form,normalized_form,DictionaryForm,splita,splitb,splitC,wordstructure + """IndexForm,LeftId,RightId,Cost,headword,pos1,pos2,pos3,pos4,pos5,pos6,reading_form,normalized_form,DictionaryForm,splita,splitb,splitC,wordstructure 東京都,6,8,5320,${oversizeWord},名詞,固有名詞,地名,一般,*,*,トウキョウト,,,1,,,""" val reader = RawLexiconReader(csvtext(text), POSTable()) assertFails { reader.nextEntry() } } run { val text = - """Surface,LeftId,RightId,Cost,pos1,pos2,pos3,pos4,pos5,pos6,reading_form,normalized_form,DictionaryForm,splita,splitb,splitC,wordstructure + """IndexForm,LeftId,RightId,Cost,pos1,pos2,pos3,pos4,pos5,pos6,reading_form,normalized_form,DictionaryForm,splita,splitb,splitC,wordstructure 東京都,6,8,5320,名詞,固有名詞,地名,一般,*,*,${oversizeWord},,,1,,,""" val reader = RawLexiconReader(csvtext(text), POSTable()) assertFails { reader.nextEntry() } } run { val text = - 
"""Surface,LeftId,RightId,Cost,pos1,pos2,pos3,pos4,pos5,pos6,reading_form,normalized_form,DictionaryForm,splita,splitb,splitC,wordstructure + """IndexForm,LeftId,RightId,Cost,pos1,pos2,pos3,pos4,pos5,pos6,reading_form,normalized_form,DictionaryForm,splita,splitb,splitC,wordstructure 東京都,6,8,5320,名詞,固有名詞,地名,一般,*,*,トウキョウト,${oversizeWord},,1,,,""" val reader = RawLexiconReader(csvtext(text), POSTable()) assertFails { reader.nextEntry() } @@ -295,9 +295,9 @@ ${oversizeWord},6,8,5320,名詞,固有名詞,地名,一般,*,*,トウキョウ } @Test - fun failEmptySurface() { + fun failEmptyIndexForm() { val text = - """Surface,LeftId,RightId,Cost,pos1,pos2,pos3,pos4,pos5,pos6,reading_form,normalized_form,DictionaryForm,splita,splitb,wordstructure + """IndexForm,LeftId,RightId,Cost,pos1,pos2,pos3,pos4,pos5,pos6,reading_form,normalized_form,DictionaryForm,splita,splitb,wordstructure ,6,8,5320,名詞,固有名詞,地名,一般,*,*,トウキョウト,,,,,""" val reader = RawLexiconReader(csvtext(text), POSTable()) assertFails { reader.nextEntry() } @@ -308,28 +308,28 @@ ${oversizeWord},6,8,5320,名詞,固有名詞,地名,一般,*,*,トウキョウ fun failSingleSplit() { run { val text = - """Surface,LeftId,RightId,Cost,pos1,pos2,pos3,pos4,pos5,pos6,reading_form,normalized_form,DictionaryForm,splita,splitb,splitC,wordstructure + """IndexForm,LeftId,RightId,Cost,pos1,pos2,pos3,pos4,pos5,pos6,reading_form,normalized_form,DictionaryForm,splita,splitb,splitC,wordstructure 東京都,6,8,5320,名詞,固有名詞,地名,一般,*,*,トウキョウト,,,1,,,""" val reader = RawLexiconReader(csvtext(text), POSTable()) assertFails { reader.nextEntry() } } run { val text = - """Surface,LeftId,RightId,Cost,pos1,pos2,pos3,pos4,pos5,pos6,reading_form,normalized_form,DictionaryForm,splita,splitb,splitC,wordstructure + """IndexForm,LeftId,RightId,Cost,pos1,pos2,pos3,pos4,pos5,pos6,reading_form,normalized_form,DictionaryForm,splita,splitb,splitC,wordstructure 東京都,6,8,5320,名詞,固有名詞,地名,一般,*,*,トウキョウト,,,,1,,""" val reader = RawLexiconReader(csvtext(text), POSTable()) assertFails { reader.nextEntry() } } run { val text = - 
"""Surface,LeftId,RightId,Cost,pos1,pos2,pos3,pos4,pos5,pos6,reading_form,normalized_form,DictionaryForm,splita,splitb,splitC,wordstructure + """IndexForm,LeftId,RightId,Cost,pos1,pos2,pos3,pos4,pos5,pos6,reading_form,normalized_form,DictionaryForm,splita,splitb,splitC,wordstructure 東京都,6,8,5320,名詞,固有名詞,地名,一般,*,*,トウキョウト,,,,,1,""" val reader = RawLexiconReader(csvtext(text), POSTable()) assertFails { reader.nextEntry() } } run { val text = - """Surface,LeftId,RightId,Cost,pos1,pos2,pos3,pos4,pos5,pos6,reading_form,normalized_form,DictionaryForm,splita,splitb,splitC,wordstructure + """IndexForm,LeftId,RightId,Cost,pos1,pos2,pos3,pos4,pos5,pos6,reading_form,normalized_form,DictionaryForm,splita,splitb,splitC,wordstructure 東京都,6,8,5320,名詞,固有名詞,地名,一般,*,*,トウキョウト,,,,,,1""" val reader = RawLexiconReader(csvtext(text), POSTable()) assertFails { reader.nextEntry() } @@ -343,28 +343,28 @@ ${oversizeWord},6,8,5320,名詞,固有名詞,地名,一般,*,*,トウキョウ run { var text = - """Surface,LeftId,RightId,Cost,pos1,pos2,pos3,pos4,pos5,pos6,reading_form,normalized_form,DictionaryForm,splita,splitb,splitC,wordstructure + """IndexForm,LeftId,RightId,Cost,pos1,pos2,pos3,pos4,pos5,pos6,reading_form,normalized_form,DictionaryForm,splita,splitb,splitC,wordstructure 東京都,6,8,5320,名詞,固有名詞,地名,一般,*,*,トウキョウト,,,${oversizeSplit},,,""" var reader = RawLexiconReader(csvtext(text), POSTable()) assertFails { reader.nextEntry() } } run { val text = - """Surface,LeftId,RightId,Cost,pos1,pos2,pos3,pos4,pos5,pos6,reading_form,normalized_form,DictionaryForm,splita,splitb,splitC,wordstructure + """IndexForm,LeftId,RightId,Cost,pos1,pos2,pos3,pos4,pos5,pos6,reading_form,normalized_form,DictionaryForm,splita,splitb,splitC,wordstructure 東京都,6,8,5320,名詞,固有名詞,地名,一般,*,*,トウキョウト,,,,${oversizeSplit},,""" val reader = RawLexiconReader(csvtext(text), POSTable()) assertFails { reader.nextEntry() } } run { val text = - 
"""Surface,LeftId,RightId,Cost,pos1,pos2,pos3,pos4,pos5,pos6,reading_form,normalized_form,DictionaryForm,splita,splitb,splitC,wordstructure + """IndexForm,LeftId,RightId,Cost,pos1,pos2,pos3,pos4,pos5,pos6,reading_form,normalized_form,DictionaryForm,splita,splitb,splitC,wordstructure 東京都,6,8,5320,名詞,固有名詞,地名,一般,*,*,トウキョウト,,,,,${oversizeSplit},""" val reader = RawLexiconReader(csvtext(text), POSTable()) assertFails { reader.nextEntry() } } run { val text = - """Surface,LeftId,RightId,Cost,pos1,pos2,pos3,pos4,pos5,pos6,reading_form,normalized_form,DictionaryForm,splita,splitb,splitC,wordstructure + """IndexForm,LeftId,RightId,Cost,pos1,pos2,pos3,pos4,pos5,pos6,reading_form,normalized_form,DictionaryForm,splita,splitb,splitC,wordstructure 東京都,6,8,5320,名詞,固有名詞,地名,一般,*,*,トウキョウト,,,,,,${oversizeSplit}""" val reader = RawLexiconReader(csvtext(text), POSTable()) assertFails { reader.nextEntry() } diff --git a/src/test/resources/com/worksap/nlp/sudachi/dictionary/build/headers-all.csv b/src/test/resources/com/worksap/nlp/sudachi/dictionary/build/headers-all.csv index d5180268..0b24ce09 100644 --- a/src/test/resources/com/worksap/nlp/sudachi/dictionary/build/headers-all.csv +++ b/src/test/resources/com/worksap/nlp/sudachi/dictionary/build/headers-all.csv @@ -1,3 +1,3 @@ -Surface,LeftId,RightId,Cost,pos_id,pos1,pos2,pos3,pos4,pos5,pos6,reading_form,normalized_form,DictionaryForm,mode,splita,splitb,splitc,wordstructure,synonymgroups,userdata,Writing +IndexForm,LeftId,RightId,Cost,pos_id,pos1,pos2,pos3,pos4,pos5,pos6,reading_form,normalized_form,DictionaryForm,mode,splita,splitb,splitc,wordstructure,synonymgroups,userdata,Headword 東京都,6,8,5320,,名詞,固有名詞,地名,一般,*,*,トウキョウト,,,B,"東京,0,トウキョウ/都,1,ト","東京,0,トウキョウ/都,2,ト","東京,0,トウキョウ/都,3,ト","東京,0,トウキョウ/都,4,ト",8/9,10,東京都 行く,4,4,5105,,動詞,非自立可能,*,*,五段-カ行,終止形-一般,イク,,,A,,,,,,, diff --git a/src/test/resources/com/worksap/nlp/sudachi/dictionary/build/headers-minimum.csv 
b/src/test/resources/com/worksap/nlp/sudachi/dictionary/build/headers-minimum.csv index 95fca9d9..67231efb 100644 --- a/src/test/resources/com/worksap/nlp/sudachi/dictionary/build/headers-minimum.csv +++ b/src/test/resources/com/worksap/nlp/sudachi/dictionary/build/headers-minimum.csv @@ -1,3 +1,3 @@ -Surface,LeftId,RightId,Cost,pos1,pos2,pos3,pos4,pos5,pos6,reading_form,normalized_form,DictionaryForm,splita,splitb,wordStructure +IndexForm,LeftId,RightId,Cost,pos1,pos2,pos3,pos4,pos5,pos6,reading_form,normalized_form,DictionaryForm,splita,splitb,wordStructure 東京都,6,8,5320,名詞,固有名詞,地名,一般,*,*,トウキョウト,,,"東京,0,トウキョウ/都,1,ト","東京,0,トウキョウ/都,2,ト","東京,0,トウキョウ/都,3,ト" 行く,4,4,5105,動詞,非自立可能,*,*,五段-カ行,終止形-一般,イク,,,,, diff --git a/src/test/resources/com/worksap/nlp/sudachi/dictionary/build/wordref-user.csv b/src/test/resources/com/worksap/nlp/sudachi/dictionary/build/wordref-user.csv index 41002a00..ba06b249 100644 --- a/src/test/resources/com/worksap/nlp/sudachi/dictionary/build/wordref-user.csv +++ b/src/test/resources/com/worksap/nlp/sudachi/dictionary/build/wordref-user.csv @@ -1,3 +1,3 @@ -Surface,LeftId,RightId,Cost,pos_id,POS1,POS2,POS3,POS4,POS5,POS6,Reading_Form,Normalized_Form,Dictionary_Form,Mode,Split_A,Split_B,Split_C,WordStructure,SynonymGroups +IndexForm,LeftId,RightId,Cost,pos_id,POS1,POS2,POS3,POS4,POS5,POS6,Reading_Form,Normalized_Form,Dictionary_Form,Mode,Split_A,Split_B,Split_C,WordStructure,SynonymGroups 府,2,2,2914,1,名詞,普通名詞,一般,*,*,*,フ,,,A,,,,, 東京府,2,2,2816,0,名詞,固有名詞,地名,一般,*,*,トウキョウフ,,,B,"東京,名詞,固有名詞,地名,一般,*,*,トウキョウ/府,1,フ",,,, diff --git a/src/test/resources/com/worksap/nlp/sudachi/dictionary/build/wordref.csv b/src/test/resources/com/worksap/nlp/sudachi/dictionary/build/wordref.csv index edc97876..68d42415 100644 --- a/src/test/resources/com/worksap/nlp/sudachi/dictionary/build/wordref.csv +++ b/src/test/resources/com/worksap/nlp/sudachi/dictionary/build/wordref.csv @@ -1,4 +1,4 @@ 
-Surface,LeftId,RightId,Cost,pos_id,POS1,POS2,POS3,POS4,POS5,POS6,Reading_Form,Normalized_Form,Dictionary_Form,Mode,Split_A,Split_B,Split_C,WordStructure,SynonymGroups +IndexForm,LeftId,RightId,Cost,pos_id,POS1,POS2,POS3,POS4,POS5,POS6,Reading_Form,Normalized_Form,Dictionary_Form,Mode,Split_A,Split_B,Split_C,WordStructure,SynonymGroups 京都,0,0,5293,0,名詞,固有名詞,地名,一般,*,*,キョウト,,,A,,,,, 東,1,1,4675,1,名詞,普通名詞,一般,*,*,*,ヒガシ,,,A,,,,, 東京,0,0,2816,0,名詞,固有名詞,地名,一般,*,*,トウキョウ,,,A,,,,, diff --git a/src/test/resources/dict/lex.csv b/src/test/resources/dict/lex.csv index e23d2e7e..75ebf5c0 100644 --- a/src/test/resources/dict/lex.csv +++ b/src/test/resources/dict/lex.csv @@ -1,42 +1,42 @@ -Surface,LeftId,RightId,Cost,POS1,POS2,POS3,POS4,POS5,POS6,Reading_Form,Normalized_Form,Dictionary_Form,Split_A,Split_B,Split_C,WordStructure,SynonymGroups -た,1,1,8729,助動詞,*,*,*,助動詞-タ,終止形-一般,タ,た,,,,,, -に,2,2,11406,助詞,接続助詞,*,*,*,*,ニ,に,,,,,, -に,3,3,4481,助詞,格助詞,*,*,*,*,ニ,に,,,,,, -京都,6,6,5293,名詞,固有名詞,地名,一般,*,*,キョウト,京都,,,,,,1/5 -東,7,7,4675,名詞,普通名詞,一般,*,*,*,ヒガシ,東,,,,,, -東京,6,6,2816,名詞,固有名詞,地名,一般,*,*,トウキョウ,東京,,,,,, -東京都,6,8,5320,名詞,固有名詞,地名,一般,*,*,トウキョウト,東京都,,"東京,名詞,固有名詞,地名,一般,*,*,トウキョウ/都,名詞,普通名詞,一般,*,*,*,ト",,,"東京,名詞,固有名詞,地名,一般,*,*,トウキョウ/都,名詞,普通名詞,一般,*,*,*,ト", -行く,4,4,5105,動詞,非自立可能,*,*,五段-カ行,終止形-一般,イク,行く,,,,,, -行っ,5,5,5122,動詞,非自立可能,*,*,五段-カ行,連用形-促音便,イッ,行く,"行く,動詞,非自立可能,*,*,五段-カ行,終止形-一般,イク",,,,, -都,8,8,2914,名詞,普通名詞,一般,*,*,*,ト,都,,,,,, -アイ,7,7,4675,名詞,普通名詞,一般,*,*,*,アイ,アイ,,,,,, -アイウ,7,7,4675,名詞,普通名詞,一般,*,*,*,アイウ,アイウ,,,,,, -アイアイウ,6,6,32766,名詞,固有名詞,地名,一般,*,*,アイアイウ,アイアイウ,,,,,, -0,9,9,2478,名詞,数詞,*,*,*,*,ゼロ,0,,,,,, -1,9,9,2478,名詞,数詞,*,*,*,*,イチ,1,,,,,, -2,9,9,2478,名詞,数詞,*,*,*,*,ニ,2,,,,,, -3,9,9,2478,名詞,数詞,*,*,*,*,サン,3,,,,,, -4,9,9,2478,名詞,数詞,*,*,*,*,ヨン,4,,,,,, -5,9,9,2478,名詞,数詞,*,*,*,*,ゴ,5,,,,,, -6,9,9,2478,名詞,数詞,*,*,*,*,ロク,6,,,,,, -7,9,9,2478,名詞,数詞,*,*,*,*,ナナ,7,,,,,, -8,9,9,2478,名詞,数詞,*,*,*,*,ハチ,8,,,,,, -9,9,9,2478,名詞,数詞,*,*,*,*,キュウ,9,,,,,, -〇,9,9,2478,名詞,数詞,*,*,*,*,ゼロ,〇,,,,,, -一,9,9,2478,名詞,数詞,*,*,*,*,イチ,一,,,,,, 
-二,9,9,2478,名詞,数詞,*,*,*,*,ニ,二,,,,,, -三,9,9,2478,名詞,数詞,*,*,*,*,サン,三,,,,,, -四,9,9,2478,名詞,数詞,*,*,*,*,ヨン,四,,,,,, -五,9,9,2478,名詞,数詞,*,*,*,*,ゴ,五,,,,,, -六,9,9,2478,名詞,数詞,*,*,*,*,ロク,六,,,,,, -七,9,9,2478,名詞,数詞,*,*,*,*,ナナ,七,,,,,, -八,9,9,2478,名詞,数詞,*,*,*,*,ハチ,八,,,,,, -九,9,9,2478,名詞,数詞,*,*,*,*,キュウ,九,,,,,, -六三四,6,6,0,名詞,固有名詞,地名,一般,*,*,ムサシ,六三四,,,,,, -いく,4,4,5105,動詞,非自立可能,*,*,五段-カ行,終止形-一般,イク,行く,,,,,, -いっ,5,5,5122,動詞,非自立可能,*,*,五段-カ行,連用形-促音便,イッ,"行く,動詞,非自立可能,*,*,五段-カ行,終止形-一般,イク","いく,動詞,非自立可能,*,*,五段-カ行,終止形-一般,イク",,,,, -012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789,9,9,-9000,名詞,数詞,*,*,*,*,ゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウ,012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789,,,,,, -特a,8,8,2914,名詞,普通名詞,一般,*,*,*,トクエー,特a,,,,,, -隠し,-1,-1,0,名詞,普通名詞,一般,*,*,*,カクシ,隠し,,,,,, -な。な,8,8,2914,名詞,普通名詞,一般,*,*,*,ナナ,な。な,,"アイウ,名詞,普通名詞,一般,*,*,*,アイウ","アイウ,名詞,普通名詞,一般,*,*,*,アイウ",,, 
-東東京都,6,8,6320,名詞,固有名詞,地名,一般,*,*,ヒガシヒガシキョウト,東東京都,,,,"東,名詞,普通名詞,一般,*,*,*,ヒガシ/東,名詞,普通名詞,一般,*,*,*,ヒガシ/京都,名詞,固有名詞,地名,一般,*,*,キョウト",, +IndexForm,LeftId,RightId,Cost,Headword,POS1,POS2,POS3,POS4,POS5,POS6,Reading_Form,Normalized_Form,Dictionary_Form,Split_A,Split_B,Split_C,WordStructure,SynonymGroups +た,1,1,8729,,助動詞,*,*,*,助動詞-タ,終止形-一般,タ,,,,,,, +に,2,2,11406,,助詞,接続助詞,*,*,*,*,ニ,,,,,,, +に,3,3,4481,,助詞,格助詞,*,*,*,*,ニ,,,,,,, +京都,6,6,5293,,名詞,固有名詞,地名,一般,*,*,キョウト,,,,,,,1/5 +東,7,7,4675,,名詞,普通名詞,一般,*,*,*,ヒガシ,,,,,,, +東京,6,6,2816,,名詞,固有名詞,地名,一般,*,*,トウキョウ,,,,,,, +東京都,6,8,5320,,名詞,固有名詞,地名,一般,*,*,トウキョウト,,,"東京,名詞,固有名詞,地名,一般,*,*,トウキョウ/都,名詞,普通名詞,一般,*,*,*,ト",,,"東京,名詞,固有名詞,地名,一般,*,*,トウキョウ/都,名詞,普通名詞,一般,*,*,*,ト", +行く,4,4,5105,,動詞,非自立可能,*,*,五段-カ行,終止形-一般,イク,,,,,,, +行っ,5,5,5122,,動詞,非自立可能,*,*,五段-カ行,連用形-促音便,イッ,行く,"行く,動詞,非自立可能,*,*,五段-カ行,終止形-一般,イク",,,,, +都,8,8,2914,,名詞,普通名詞,一般,*,*,*,ト,,,,,,, +アイ,7,7,4675,,名詞,普通名詞,一般,*,*,*,アイ,,,,,,, +アイウ,7,7,4675,,名詞,普通名詞,一般,*,*,*,アイウ,,,,,,, +アイアイウ,6,6,32766,,名詞,固有名詞,地名,一般,*,*,アイアイウ,,,,,,, +0,9,9,2478,,名詞,数詞,*,*,*,*,ゼロ,,,,,,, +1,9,9,2478,,名詞,数詞,*,*,*,*,イチ,,,,,,, +2,9,9,2478,,名詞,数詞,*,*,*,*,ニ,,,,,,, +3,9,9,2478,,名詞,数詞,*,*,*,*,サン,,,,,,, +4,9,9,2478,,名詞,数詞,*,*,*,*,ヨン,,,,,,, +5,9,9,2478,,名詞,数詞,*,*,*,*,ゴ,,,,,,, +6,9,9,2478,,名詞,数詞,*,*,*,*,ロク,,,,,,, +7,9,9,2478,,名詞,数詞,*,*,*,*,ナナ,,,,,,, +8,9,9,2478,,名詞,数詞,*,*,*,*,ハチ,,,,,,, +9,9,9,2478,,名詞,数詞,*,*,*,*,キュウ,,,,,,, +〇,9,9,2478,,名詞,数詞,*,*,*,*,ゼロ,,,,,,, +一,9,9,2478,,名詞,数詞,*,*,*,*,イチ,,,,,,, +二,9,9,2478,,名詞,数詞,*,*,*,*,ニ,,,,,,, +三,9,9,2478,,名詞,数詞,*,*,*,*,サン,,,,,,, +四,9,9,2478,,名詞,数詞,*,*,*,*,ヨン,,,,,,, +五,9,9,2478,,名詞,数詞,*,*,*,*,ゴ,,,,,,, +六,9,9,2478,,名詞,数詞,*,*,*,*,ロク,,,,,,, +七,9,9,2478,,名詞,数詞,*,*,*,*,ナナ,,,,,,, +八,9,9,2478,,名詞,数詞,*,*,*,*,ハチ,,,,,,, +九,9,9,2478,,名詞,数詞,*,*,*,*,キュウ,,,,,,, +六三四,6,6,0,,名詞,固有名詞,地名,一般,*,*,ムサシ,,,,,,, +いく,4,4,5105,,動詞,非自立可能,*,*,五段-カ行,終止形-一般,イク,行く,,,,,, +いっ,5,5,5122,,動詞,非自立可能,*,*,五段-カ行,連用形-促音便,イッ,"行く,動詞,非自立可能,*,*,五段-カ行,終止形-一般,イク","いく,動詞,非自立可能,*,*,五段-カ行,終止形-一般,イク",,,,, 
+012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789,9,9,-9000,,名詞,数詞,*,*,*,*,ゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウ,,,,,,, +特a,8,8,2914,特A,名詞,普通名詞,一般,*,*,*,トクエー,,,,,,, +隠し,-1,-1,0,,名詞,普通名詞,一般,*,*,*,カクシ,,,,,,, +な。な,8,8,2914,,名詞,普通名詞,一般,*,*,*,ナナ,,,"アイウ,名詞,普通名詞,一般,*,*,*,アイウ","アイウ,名詞,普通名詞,一般,*,*,*,アイウ",,, +東東京都,6,8,6320,,名詞,固有名詞,地名,一般,*,*,ヒガシヒガシキョウト,,,,,"東,名詞,普通名詞,一般,*,*,*,ヒガシ/東,名詞,普通名詞,一般,*,*,*,ヒガシ/京都,名詞,固有名詞,地名,一般,*,*,キョウト",, diff --git a/src/test/resources/dict/user.csv b/src/test/resources/dict/user.csv index d835d99e..5d8e2630 100644 --- a/src/test/resources/dict/user.csv +++ b/src/test/resources/dict/user.csv @@ -1,4 +1,4 @@ -Surface,LeftId,RightId,Cost,POS1,POS2,POS3,POS4,POS5,POS6,Reading_Form,Normalized_Form,Dictionary_Form,Split_A,Split_B,WordStructure,SynonymGroups,UserData +IndexForm,LeftId,RightId,Cost,POS1,POS2,POS3,POS4,POS5,POS6,Reading_Form,Normalized_Form,Dictionary_Form,Split_A,Split_B,WordStructure,SynonymGroups,UserData ぴらる,8,8,-32768,名詞,普通名詞,一般,*,*,*,ピラル,ぴらる,,,,,, 府,8,8,2914,名詞,普通名詞,一般,*,*,*,フ,府,,,,,, 東京府,6,6,2816,名詞,固有名詞,地名,一般,*,*,トウキョウフ,東京府,,"東京,名詞,固有名詞,地名,一般,*,*,トウキョウ/府,名詞,普通名詞,一般,*,*,*,フ",,"東京,名詞,固有名詞,地名,一般,*,*,トウキョウ/府,名詞,普通名詞,一般,*,*,*,フ",1/3, diff --git 
a/src/test/resources/dict/user2.csv b/src/test/resources/dict/user2.csv index fd0f6215..0912b476 100644 --- a/src/test/resources/dict/user2.csv +++ b/src/test/resources/dict/user2.csv @@ -1,3 +1,3 @@ -Surface,LeftId,RightId,Cost,POS1,POS2,POS3,POS4,POS5,POS6,Reading_Form,Normalized_Form,Dictionary_Form,Split_A,Split_B,WordStructure,SynonymGroups +IndexForm,LeftId,RightId,Cost,POS1,POS2,POS3,POS4,POS5,POS6,Reading_Form,Normalized_Form,Dictionary_Form,Split_A,Split_B,WordStructure,SynonymGroups ぴさる,8,8,-32768,名詞,普通名詞,一般,*,*,*,ピサル,ぴさる,,,,, かぼす,6,6,2816,被子植物門,双子葉植物綱,ムクロジ目,ミカン科,ミカン属,カボス,カボス,かぼす,,,,, From 95b791e2f7f89501720b26135a96f384af1d37c9 Mon Sep 17 00:00:00 2001 From: mh-northlander Date: Tue, 19 Nov 2024 13:48:20 +0900 Subject: [PATCH 93/94] improve doc for wordInfo.length and remove its unnecessary use. --- .../com/worksap/nlp/sudachi/JapaneseTokenizer.java | 2 +- .../com/worksap/nlp/sudachi/LatticeNodeImpl.java | 1 + .../com/worksap/nlp/sudachi/OovProviderPlugin.java | 12 ------------ .../com/worksap/nlp/sudachi/dictionary/WordInfo.java | 6 +++--- 4 files changed, 5 insertions(+), 16 deletions(-) diff --git a/src/main/java/com/worksap/nlp/sudachi/JapaneseTokenizer.java b/src/main/java/com/worksap/nlp/sudachi/JapaneseTokenizer.java index effc858b..67855b21 100644 --- a/src/main/java/com/worksap/nlp/sudachi/JapaneseTokenizer.java +++ b/src/main/java/com/worksap/nlp/sudachi/JapaneseTokenizer.java @@ -256,7 +256,7 @@ LatticeImpl buildLattice(UTF8InputText input) { private long provideOovs(OovProviderPlugin plugin, UTF8InputText input, ArrayList unkNodes, int boundary, long wordMask) { int initialSize = unkNodes.size(); - int created = plugin.getOOV(input, boundary, wordMask, unkNodes); + int created = plugin.provideOOV(input, boundary, wordMask, unkNodes); if (created == 0) { return wordMask; } diff --git a/src/main/java/com/worksap/nlp/sudachi/LatticeNodeImpl.java b/src/main/java/com/worksap/nlp/sudachi/LatticeNodeImpl.java index e9fa22aa..255deb65 100644 --- 
a/src/main/java/com/worksap/nlp/sudachi/LatticeNodeImpl.java +++ b/src/main/java/com/worksap/nlp/sudachi/LatticeNodeImpl.java @@ -24,6 +24,7 @@ import java.util.Objects; public class LatticeNodeImpl implements LatticeNode { + // index of this node in the InputText.bytes. int begin; int end; diff --git a/src/main/java/com/worksap/nlp/sudachi/OovProviderPlugin.java b/src/main/java/com/worksap/nlp/sudachi/OovProviderPlugin.java index 717ff94f..e4613415 100644 --- a/src/main/java/com/worksap/nlp/sudachi/OovProviderPlugin.java +++ b/src/main/java/com/worksap/nlp/sudachi/OovProviderPlugin.java @@ -79,18 +79,6 @@ public void setUp(Grammar grammar) throws IOException { */ public abstract int provideOOV(InputText inputText, int offset, long otherWords, List result); - /** Runs provideOOV and set proper begin/end for each nodes. */ - int getOOV(UTF8InputText inputText, int offset, long otherWords, List result) { - int oldSize = result.size(); - int numCreated = provideOOV(inputText, offset, otherWords, result); - for (int i = 0; i < numCreated; i++) { - LatticeNodeImpl n = result.get(oldSize + i); - n.begin = offset; - n.end = offset + n.getWordInfo().getLength(); - } - return numCreated; - } - /** * @return throws an exception * @deprecated Use diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/WordInfo.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/WordInfo.java index b7ad2d5b..8da239d6 100644 --- a/src/main/java/com/worksap/nlp/sudachi/dictionary/WordInfo.java +++ b/src/main/java/com/worksap/nlp/sudachi/dictionary/WordInfo.java @@ -92,14 +92,14 @@ public int getHeadword() { } /** - * Returns the length of the text in internal use unit. + * Returns the length of the word in internal use unit (InputText.getByteText). * *

- * This length is used to place a node in the + * This length is used to calculate indexes of split nodes in the * {@link com.worksap.nlp.sudachi.Lattice}, does not equals * {@code getHeadword().length()}. * - * @return the length of the text + * @return the byte length of the word */ public short getLength() { return indexFormLength; From 3f60988661b98225cf546cc81f58589a1bf39705 Mon Sep 17 00:00:00 2001 From: mh-northlander Date: Tue, 19 Nov 2024 14:18:52 +0900 Subject: [PATCH 94/94] improve comments and refactor a bit --- .../worksap/nlp/sudachi/LatticeNodeImpl.java | 52 +++++++++++++++---- .../java/com/worksap/nlp/sudachi/WordId.java | 1 + 2 files changed, 44 insertions(+), 9 deletions(-) diff --git a/src/main/java/com/worksap/nlp/sudachi/LatticeNodeImpl.java b/src/main/java/com/worksap/nlp/sudachi/LatticeNodeImpl.java index 255deb65..b6cff370 100644 --- a/src/main/java/com/worksap/nlp/sudachi/LatticeNodeImpl.java +++ b/src/main/java/com/worksap/nlp/sudachi/LatticeNodeImpl.java @@ -28,12 +28,13 @@ public class LatticeNodeImpl implements LatticeNode { int begin; int end; + int wordId; + + // word param/info that corresponds to wordId or that manually set + // (special/OOV). short leftId; short rightId; short cost; - - int wordId; - // word info that corresponds to wordId or that manually set (OOV). WordInfo wordInfo; // for lattice construction @@ -58,16 +59,25 @@ public class LatticeNodeImpl implements LatticeNode { LatticeNodeImpl() { } - /** Create special node with given wordid. */ + /** + * Create special node with given wordid. + * + * WordParameters should be set by the caller if needed. + */ static LatticeNodeImpl makeSpecial(int specialWordId) { assert WordId.isSpecial(specialWordId); LatticeNodeImpl node = new LatticeNodeImpl(); node.wordId = specialWordId; + node.setWordInfo(UNDEFINED_WORDINFO); return node; } - /** Create OOV node. 
*/ - public static LatticeNodeImpl makeOov(int begin, int end, short posId, String surface, String normalizedForm, + /** + * Create OOV node. + * + * WordParameters should be set by the caller if needed. + */ + static LatticeNodeImpl makeOov(int begin, int end, short posId, String surface, String normalizedForm, String dictionaryForm, String readingForm) { LatticeNodeImpl node = new LatticeNodeImpl(); node.wordId = WordId.makeOov(posId); @@ -85,6 +95,14 @@ public void setParameter(short leftId, short rightId, short cost) { this.cost = cost; } + /** + * Set the parameters of connection. + * + * @param params + * packed parameters + * + * @see com.worksap.nlp.sudachi.dictionary.WordParameters + */ public void setParameter(long params) { this.leftId = WordParameters.leftId(params); this.rightId = WordParameters.rightId(params); @@ -139,9 +157,6 @@ public boolean isDefined() { @Override public WordInfo getWordInfo() { - if (isSpecial()) { - return UNDEFINED_WORDINFO; - } if (wordInfo != null) { return wordInfo; } @@ -259,6 +274,10 @@ private void appendSplitsTo(List result, int[] splitsId) { } } + /** + * Cache to reduce the access to the lexicon. Also used to mock the lexicon for + * OOV nodes. + */ private static final class StringsCache { private final Lexicon lexicon; private String surface; @@ -331,10 +350,15 @@ public String getDictionaryForm(LatticeNodeImpl node) { } } + /** Alias for {@link OOVFactory} constructor. */ public static OOVFactory oovFactory(short leftId, short rightId, short cost, short posId) { return new OOVFactory(leftId, rightId, cost, posId); } + /** + * Factory class for creating OOV LatticeNodeImpl with fixed word parameters and + * pos. + */ public static final class OOVFactory { private final short leftId; private final short rightId; @@ -348,11 +372,21 @@ private OOVFactory(short leftId, short rightId, short cost, short posId) { this.posId = posId; } + /** + * Create OOV LatticeNode at the given position of the input. 
+ * + * The begin/end must be an index in InputText.bytes. + */ public LatticeNodeImpl make(int begin, int end, InputText input) { String s = input.getSubstring(begin, end); return make(begin, end, s); } + /** + * Create OOV LatticeNode at the given position and surface. + * + * The begin/end must be an index in InputText.bytes. + */ public LatticeNodeImpl make(int begin, int end, String text) { LatticeNodeImpl i = makeOov(begin, end, posId, text, text, text, text); i.setParameter(leftId, rightId, cost); diff --git a/src/main/java/com/worksap/nlp/sudachi/WordId.java b/src/main/java/com/worksap/nlp/sudachi/WordId.java index d54f9ea6..5967f40c 100644 --- a/src/main/java/com/worksap/nlp/sudachi/WordId.java +++ b/src/main/java/com/worksap/nlp/sudachi/WordId.java @@ -42,6 +42,7 @@ private WordId() { // ids for special tokens. public static final int ID_BOS = 0xffff_fff0; public static final int ID_EOS = 0xffff_fff1; + // id for oov without pos information public static final int ID_OOV_NOPOS = 0xf000_ffff; /**