CRF分词字符正规化使用特化的CharTable，还原原CharTable，解决#55 (comment)

hankcs · Sep 9, 2015 · 4990d1c · 4990d1c
1 parent 2f8d7fe
commit 4990d1c
Show file tree

Hide file tree

Showing 5 changed files with 130 additions and 7 deletions.
diff --git a/data/dictionary/other/CharTable.bin.yes b/data/dictionary/other/CharTable.bin.yes
diff --git a/src/main/java/com/hankcs/hanlp/model/crf/CRFModel.java b/src/main/java/com/hankcs/hanlp/model/crf/CRFModel.java
@@ -250,6 +250,12 @@ public void tag(Table table)
         table.setLast(0, id2tag[maxTag]);
     }
 
+    /**
+     * 根据特征函数计算输出
+     * @param table
+     * @param current
+     * @return
+     */
     protected LinkedList<double[]> computeScoreList(Table table, int current)
     {
         LinkedList<double[]> scoreList = new LinkedList<double[]>();

diff --git a/src/main/java/com/hankcs/hanlp/seg/CRF/CRFSegment.java b/src/main/java/com/hankcs/hanlp/seg/CRF/CRFSegment.java
@@ -16,18 +16,20 @@
 import com.hankcs.hanlp.corpus.tag.Nature;
 import com.hankcs.hanlp.dictionary.CoreDictionary;
 import com.hankcs.hanlp.dictionary.CoreDictionaryTransformMatrixDictionary;
-import com.hankcs.hanlp.dictionary.other.CharTable;
 import com.hankcs.hanlp.model.CRFSegmentModel;
 import com.hankcs.hanlp.model.crf.Table;
-import com.hankcs.hanlp.model.trigram.CharacterBasedGenerativeModel;
 import com.hankcs.hanlp.seg.CharacterBasedGenerativeModelSegment;
 import com.hankcs.hanlp.seg.Segment;
 import com.hankcs.hanlp.seg.common.Term;
 import com.hankcs.hanlp.seg.common.Vertex;
 import com.hankcs.hanlp.utility.CharacterHelper;
 
+import java.io.FileInputStream;
+import java.io.ObjectInputStream;
 import java.util.*;
 
+import static com.hankcs.hanlp.utility.Predefine.logger;
+
 
 /**
  * 基于CRF的分词器
@@ -220,7 +222,7 @@ public static String[][] atomSegmentToTable(char[] sentence)
                 sbAtom.setLength(0);
                 --i;
             }
-            else if (CharacterHelper.isEnglishLetter(sentence[i]))
+            else if (CharacterHelper.isEnglishLetter(sentence[i]) || sentence[i] == ' ')
             {
                 sbAtom.append(sentence[i]);
                 if (i == maxLen)
@@ -232,7 +234,7 @@ else if (CharacterHelper.isEnglishLetter(sentence[i]))
                     break;
                 }
                 char c = sentence[++i];
-                while (CharacterHelper.isEnglishLetter(c))
+                while (CharacterHelper.isEnglishLetter(c) || sentence[i] == ' ')
                 {
                     sbAtom.append(sentence[i]);
                     if (i == maxLen)
@@ -282,4 +284,95 @@ public Segment enableNumberQuantifierRecognize(boolean enable)
 //        enablePartOfSpeechTagging(enable);
 //        return super.enableNumberQuantifierRecognize(enable);
     }
+
+    /**
+     * Character normalization table; adjusted relative to com/hankcs/hanlp/dictionary/other/CharTable.java. NOTE(review): in.close() is inside the try, so the stream is not closed if reading fails.
+     * @author hankcs
+     */
+    static private class CharTable
+    {
+        /**
+         * Lookup table used for normalization: CONVERT[c] is the normalized form of character c
+         */
+        public static char[] CONVERT;
+
+        static
+        {
+            long start = System.currentTimeMillis();
+            try
+            {
+                ObjectInputStream in = new ObjectInputStream(new FileInputStream(HanLP.Config.CharTablePath));
+                CONVERT = (char[]) in.readObject();
+                in.close();
+            }
+            catch (Exception e)
+            {
+                logger.severe("字符正规化表加载失败，原因如下：");
+                e.printStackTrace();
+                System.exit(-1);
+            }
+            // Overrides applied on top of the loaded table; see https://github.com/hankcs/HanLP/issues/13
+            CONVERT['“'] = '“';
+            CONVERT['”'] = '”';
+            CONVERT['.'] = '.';
+            CONVERT['．'] = '.';
+            CONVERT['。'] = '，';
+            CONVERT['！'] = '，';
+            CONVERT['，'] = '，';
+            CONVERT['…'] = '，';
+            for (int i = 0; i < CONVERT.length; i++)
+            {
+                if (CONVERT[i] == '。')
+                    CONVERT[i] = '，';
+            }
+
+            logger.info("字符正规化表加载成功：" + (System.currentTimeMillis() - start) + " ms");
+        }
+
+        /**
+         * Normalizes a single character by looking it up in CONVERT
+         * @param c the character to normalize
+         * @return the normalized character
+         */
+        public static char convert(char c)
+        {
+            return CONVERT[c];
+        }
+
+        public static char[] convert(char[] charArray)
+        {
+            char[] result = new char[charArray.length];
+            for (int i = 0; i < charArray.length; i++)
+            {
+                result[i] = CONVERT[charArray[i]];
+            }
+
+            return result;
+        }
+
+        public static String convert(String charArray)
+        {
+            assert charArray != null;
+            char[] result = new char[charArray.length()];
+            for (int i = 0; i < charArray.length(); i++)
+            {
+                result[i] = CONVERT[charArray.charAt(i)];
+            }
+
+            return new String(result);
+        }
+
+        /**
+         * Normalizes the characters of an array in place
+         * @param charArray the characters to normalize; each element is overwritten with its CONVERT entry
+         */
+        public static void normalization(char[] charArray)
+        {
+            assert charArray != null;
+            for (int i = 0; i < charArray.length; i++)
+            {
+                charArray[i] = CONVERT[charArray[i]];
+            }
+        }
+    }
 }
diff --git a/src/test/java/com/hankcs/test/corpus/TestCorpusLoader.java b/src/test/java/com/hankcs/test/corpus/TestCorpusLoader.java
@@ -210,4 +210,29 @@ public void handle(Document document)
         });
 
     }
+
+    /**
+     * Some quotation marks are wrong: walks the corpus and prints every word longer than one character that ends with an ASCII double quote
+     * @throws Exception
+     */
+    public void testFindQuote() throws Exception
+    {
+        CorpusLoader.walk("D:\\Doc\\语料库\\2014_hankcs\\", new CorpusLoader.Handler()
+        {
+            @Override
+            public void handle(Document document)
+            {
+                for (List<Word> wordList : document.getSimpleSentenceList())
+                {
+                    for (Word word : wordList)
+                    {
+                        if(word.value.length() > 1 && word.value.endsWith("\""))
+                        {
+                            System.out.println(word);
+                        }
+                    }
+                }
+            }
+        });
+    }
 }
diff --git a/src/test/java/com/hankcs/test/seg/TestSegment.java b/src/test/java/com/hankcs/test/seg/TestSegment.java
@@ -255,11 +255,10 @@ public void testTryToCrashSegment() throws Exception
 
     public void testCRFSegment() throws Exception
     {
-//        HanLP.Config.enableDebug();
+        HanLP.Config.enableDebug();
         HanLP.Config.ShowTermNature = false;
         Segment segment = new CRFSegment();
-        System.out.println(segment.seg("尼玛不是新词，王尼玛是新词"));
-        System.out.println(segment.seg("周杰伦在出品范特西之后，又出品了依然范特西"));
+        System.out.println(segment.seg("“碰瓷”后依靠罚球得分一直是哈登的主要得分手段"));
     }
 
     public void testIssue16() throws Exception