diff --git a/data/dictionary/other/CharTable.bin.yes b/data/dictionary/other/CharTable.bin.yes
index ec069a4f1..6044badbf 100644
Binary files a/data/dictionary/other/CharTable.bin.yes and b/data/dictionary/other/CharTable.bin.yes differ
diff --git a/src/main/java/com/hankcs/hanlp/model/crf/CRFModel.java b/src/main/java/com/hankcs/hanlp/model/crf/CRFModel.java
index 47ae50da0..c080e2815 100644
--- a/src/main/java/com/hankcs/hanlp/model/crf/CRFModel.java
+++ b/src/main/java/com/hankcs/hanlp/model/crf/CRFModel.java
@@ -250,6 +250,12 @@ public void tag(Table table)
         table.setLast(0, id2tag[maxTag]);
     }
 
+    /**
+     * Compute the output scores according to the feature functions
+     * @param table
+     * @param current
+     * @return
+     */
     protected LinkedList computeScoreList(Table table, int current)
     {
         LinkedList scoreList = new LinkedList();
diff --git a/src/main/java/com/hankcs/hanlp/seg/CRF/CRFSegment.java b/src/main/java/com/hankcs/hanlp/seg/CRF/CRFSegment.java
index 528c5c3ca..4a586abdf 100644
--- a/src/main/java/com/hankcs/hanlp/seg/CRF/CRFSegment.java
+++ b/src/main/java/com/hankcs/hanlp/seg/CRF/CRFSegment.java
@@ -16,18 +16,20 @@
 import com.hankcs.hanlp.corpus.tag.Nature;
 import com.hankcs.hanlp.dictionary.CoreDictionary;
 import com.hankcs.hanlp.dictionary.CoreDictionaryTransformMatrixDictionary;
-import com.hankcs.hanlp.dictionary.other.CharTable;
 import com.hankcs.hanlp.model.CRFSegmentModel;
 import com.hankcs.hanlp.model.crf.Table;
-import com.hankcs.hanlp.model.trigram.CharacterBasedGenerativeModel;
 import com.hankcs.hanlp.seg.CharacterBasedGenerativeModelSegment;
 import com.hankcs.hanlp.seg.Segment;
 import com.hankcs.hanlp.seg.common.Term;
 import com.hankcs.hanlp.seg.common.Vertex;
 import com.hankcs.hanlp.utility.CharacterHelper;
 
+import java.io.FileInputStream;
+import java.io.ObjectInputStream;
 import java.util.*;
 
+import static com.hankcs.hanlp.utility.Predefine.logger;
+
 
 /**
  * CRF-based segmenter
@@ -220,7 +222,7 @@ public static String[][] atomSegmentToTable(char[] sentence)
                 sbAtom.setLength(0);
                 --i;
             }
-            else if (CharacterHelper.isEnglishLetter(sentence[i]))
+            else if (CharacterHelper.isEnglishLetter(sentence[i]) || sentence[i] == ' ')
             {
                 sbAtom.append(sentence[i]);
                 if (i == maxLen)
@@ -232,7 +234,7 @@ else if (CharacterHelper.isEnglishLetter(sentence[i]))
                     break;
                 }
                 char c = sentence[++i];
-                while (CharacterHelper.isEnglishLetter(c))
+                while (CharacterHelper.isEnglishLetter(c) || sentence[i] == ' ')
                 {
                     sbAtom.append(sentence[i]);
                     if (i == maxLen)
@@ -282,4 +284,95 @@ public Segment enableNumberQuantifierRecognize(boolean enable)
 //        enablePartOfSpeechTagging(enable);
 //        return super.enableNumberQuantifierRecognize(enable);
     }
+
+    /**
+     * Character normalization table; compared with com/hankcs/hanlp/dictionary/other/CharTable.java, a few adjustments were made
+     * @author hankcs
+     */
+    static private class CharTable
+    {
+        /**
+         * Mapping table used for normalization
+         */
+        public static char[] CONVERT;
+
+        static
+        {
+            long start = System.currentTimeMillis();
+            try
+            {
+                ObjectInputStream in = new ObjectInputStream(new FileInputStream(HanLP.Config.CharTablePath));
+                CONVERT = (char[]) in.readObject();
+                in.close();
+            }
+            catch (Exception e)
+            {
+                logger.severe("字符正规化表加载失败,原因如下:");
+                e.printStackTrace();
+                System.exit(-1);
+            }
+            // see https://github.com/hankcs/HanLP/issues/13
+            CONVERT['“'] = '“';
+            CONVERT['”'] = '”';
+            CONVERT['.'] = '.';
+            CONVERT['.'] = '.';
+            CONVERT['。'] = ',';
+            CONVERT['!'] = ',';
+            CONVERT[','] = ',';
+            CONVERT['…'] = ',';
+            for (int i = 0; i < CONVERT.length; i++)
+            {
+                if (CONVERT[i] == '。')
+                    CONVERT[i] = ',';
+            }
+
+            logger.info("字符正规化表加载成功:" + (System.currentTimeMillis() - start) + " ms");
+        }
+
+        /**
+         * Normalize a single character
+         * @param c the character
+         * @return the normalized character
+         */
+        public static char convert(char c)
+        {
+            return CONVERT[c];
+        }
+
+        public static char[] convert(char[] charArray)
+        {
+            char[] result = new char[charArray.length];
+            for (int i = 0; i < charArray.length; i++)
+            {
+                result[i] = CONVERT[charArray[i]];
+            }
+
+            return result;
+        }
+
+        public static String convert(String charArray)
+        {
+            assert charArray != null;
+            char[] result = new char[charArray.length()];
+            for (int i = 0; i < charArray.length(); i++)
+            {
+                result[i] = CONVERT[charArray.charAt(i)];
+            }
+
+            return new String(result);
+        }
+
+        /**
+         * Normalize an array of characters (in place)
+         * @param charArray the characters
+         */
+        public static void normalization(char[] charArray)
+        {
+            assert charArray != null;
+            for (int i = 0; i < charArray.length; i++)
+            {
+                charArray[i] = CONVERT[charArray[i]];
+            }
+        }
+    }
 }
diff --git a/src/test/java/com/hankcs/test/corpus/TestCorpusLoader.java b/src/test/java/com/hankcs/test/corpus/TestCorpusLoader.java
index 2849f527e..de72863c8 100644
--- a/src/test/java/com/hankcs/test/corpus/TestCorpusLoader.java
+++ b/src/test/java/com/hankcs/test/corpus/TestCorpusLoader.java
@@ -210,4 +210,29 @@ public void handle(Document document)
 
         });
     }
+
+    /**
+     * Some quotation marks are wrong
+     * @throws Exception
+     */
+    public void testFindQuote() throws Exception
+    {
+        CorpusLoader.walk("D:\\Doc\\语料库\\2014_hankcs\\", new CorpusLoader.Handler()
+        {
+            @Override
+            public void handle(Document document)
+            {
+                for (List<Word> wordList : document.getSimpleSentenceList())
+                {
+                    for (Word word : wordList)
+                    {
+                        if(word.value.length() > 1 && word.value.endsWith("\""))
+                        {
+                            System.out.println(word);
+                        }
+                    }
+                }
+            }
+        });
+    }
 }
diff --git a/src/test/java/com/hankcs/test/seg/TestSegment.java b/src/test/java/com/hankcs/test/seg/TestSegment.java
index 891833c1a..675116f23 100644
--- a/src/test/java/com/hankcs/test/seg/TestSegment.java
+++ b/src/test/java/com/hankcs/test/seg/TestSegment.java
@@ -255,11 +255,10 @@ public void testTryToCrashSegment() throws Exception
 
     public void testCRFSegment() throws Exception
     {
-//        HanLP.Config.enableDebug();
+        HanLP.Config.enableDebug();
         HanLP.Config.ShowTermNature = false;
         Segment segment = new CRFSegment();
-        System.out.println(segment.seg("尼玛不是新词,王尼玛是新词"));
-        System.out.println(segment.seg("周杰伦在出品范特西之后,又出品了依然范特西"));
+        System.out.println(segment.seg("“碰瓷”后依靠罚球得分一直是哈登的主要得分手段"));
     }
 
     public void testIssue16() throws Exception
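A minimal usage sketch of the space-aware CRFSegment change above. The demo class name and the sample sentence are illustrative only; it assumes the CRF model and the serialized character table referenced by HanLP.Config are available on the local data path.

import com.hankcs.hanlp.HanLP;
import com.hankcs.hanlp.seg.CRF.CRFSegment;
import com.hankcs.hanlp.seg.Segment;

public class CRFSegmentDemo
{
    public static void main(String[] args)
    {
        // Hide part-of-speech tags so only the token boundaries are printed
        HanLP.Config.ShowTermNature = false;
        Segment segment = new CRFSegment();
        // Mixed Chinese/English input: the space inside "New York" is now
        // absorbed into the same atom by atomSegmentToTable
        System.out.println(segment.seg("哈登在New York的比赛中得到了30分"));
    }
}

Because spaces are now treated like English letters during atom segmentation, an embedded run such as "New York" should be grouped into a single atom before CRF tagging; the final segmentation still depends on the bundled model.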