Skip to content

Commit

Permalink
CRF分词字符正规化使用特化的CharTable,还原原CharTable,解决#55 (comment)
Browse files Browse the repository at this point in the history
  • Loading branch information
hankcs committed Sep 9, 2015
1 parent 2f8d7fe commit 4990d1c
Show file tree
Hide file tree
Showing 5 changed files with 130 additions and 7 deletions.
Binary file modified data/dictionary/other/CharTable.bin.yes
Binary file not shown.
6 changes: 6 additions & 0 deletions src/main/java/com/hankcs/hanlp/model/crf/CRFModel.java
Original file line number Diff line number Diff line change
Expand Up @@ -250,6 +250,12 @@ public void tag(Table table)
table.setLast(0, id2tag[maxTag]);
}

/**
* 根据特征函数计算输出
* @param table
* @param current
* @return
*/
protected LinkedList<double[]> computeScoreList(Table table, int current)
{
LinkedList<double[]> scoreList = new LinkedList<double[]>();
Expand Down
101 changes: 97 additions & 4 deletions src/main/java/com/hankcs/hanlp/seg/CRF/CRFSegment.java
Original file line number Diff line number Diff line change
Expand Up @@ -16,18 +16,20 @@
import com.hankcs.hanlp.corpus.tag.Nature;
import com.hankcs.hanlp.dictionary.CoreDictionary;
import com.hankcs.hanlp.dictionary.CoreDictionaryTransformMatrixDictionary;
import com.hankcs.hanlp.dictionary.other.CharTable;
import com.hankcs.hanlp.model.CRFSegmentModel;
import com.hankcs.hanlp.model.crf.Table;
import com.hankcs.hanlp.model.trigram.CharacterBasedGenerativeModel;
import com.hankcs.hanlp.seg.CharacterBasedGenerativeModelSegment;
import com.hankcs.hanlp.seg.Segment;
import com.hankcs.hanlp.seg.common.Term;
import com.hankcs.hanlp.seg.common.Vertex;
import com.hankcs.hanlp.utility.CharacterHelper;

import java.io.FileInputStream;
import java.io.ObjectInputStream;
import java.util.*;

import static com.hankcs.hanlp.utility.Predefine.logger;


/**
* 基于CRF的分词器
Expand Down Expand Up @@ -220,7 +222,7 @@ public static String[][] atomSegmentToTable(char[] sentence)
sbAtom.setLength(0);
--i;
}
else if (CharacterHelper.isEnglishLetter(sentence[i]))
else if (CharacterHelper.isEnglishLetter(sentence[i]) || sentence[i] == ' ')
{
sbAtom.append(sentence[i]);
if (i == maxLen)
Expand All @@ -232,7 +234,7 @@ else if (CharacterHelper.isEnglishLetter(sentence[i]))
break;
}
char c = sentence[++i];
while (CharacterHelper.isEnglishLetter(c))
while (CharacterHelper.isEnglishLetter(c) || sentence[i] == ' ')
{
sbAtom.append(sentence[i]);
if (i == maxLen)
Expand Down Expand Up @@ -282,4 +284,95 @@ public Segment enableNumberQuantifierRecognize(boolean enable)
// enablePartOfSpeechTagging(enable);
// return super.enableNumberQuantifierRecognize(enable);
}

/**
 * Character normalization table used by the CRF segmenter. Compared with
 * com/hankcs/hanlp/dictionary/other/CharTable.java, a few mappings are adjusted.
 *
 * @author hankcs
 */
static private class CharTable
{
    /**
     * Lookup table used for normalization: the normalized form of a character
     * {@code c} is {@code CONVERT[c]}.
     */
    public static char[] CONVERT;

    static
    {
        long start = System.currentTimeMillis();
        try
        {
            // Open the file first and release it in finally, so the handle is closed
            // even when the ObjectInputStream constructor or readObject() throws.
            FileInputStream fileIn = new FileInputStream(HanLP.Config.CharTablePath);
            try
            {
                ObjectInputStream in = new ObjectInputStream(fileIn);
                CONVERT = (char[]) in.readObject();
            }
            finally
            {
                fileIn.close();
            }
        }
        catch (Exception e)
        {
            // The table could not be loaded: report the failure and terminate the JVM.
            logger.severe("字符正规化表加载失败,原因如下:");
            e.printStackTrace();
            System.exit(-1);
        }
        // Overrides applied on top of the loaded table.
        // see https://github.com/hankcs/HanLP/issues/13
        CONVERT['“'] = '“';
        CONVERT['”'] = '”';
        CONVERT['.'] = '.';
        CONVERT['.'] = '.';
        CONVERT['。'] = ',';
        CONVERT['!'] = ',';
        CONVERT[','] = ',';
        CONVERT['…'] = ',';
        // Any entry that the loaded table still maps to '。' is redirected to ',' as well.
        for (int i = 0; i < CONVERT.length; i++)
        {
            if (CONVERT[i] == '。')
                CONVERT[i] = ',';
        }

        logger.info("字符正规化表加载成功:" + (System.currentTimeMillis() - start) + " ms");
    }

    /**
     * Normalizes a single character.
     *
     * @param c the character to normalize
     * @return the normalized character, i.e. {@code CONVERT[c]}
     */
    public static char convert(char c)
    {
        return CONVERT[c];
    }

    /**
     * Normalizes every character of an array into a newly allocated array;
     * the input array is not modified.
     *
     * @param charArray the characters to normalize
     * @return a new array of the same length holding the normalized characters
     */
    public static char[] convert(char[] charArray)
    {
        char[] result = new char[charArray.length];
        for (int i = 0; i < charArray.length; i++)
        {
            result[i] = CONVERT[charArray[i]];
        }

        return result;
    }

    /**
     * Normalizes every character of a string.
     *
     * @param charArray the text to normalize (asserted to be non-null)
     * @return a new string of the same length holding the normalized characters
     */
    public static String convert(String charArray)
    {
        assert charArray != null;
        char[] result = new char[charArray.length()];
        for (int i = 0; i < charArray.length(); i++)
        {
            result[i] = CONVERT[charArray.charAt(i)];
        }

        return new String(result);
    }

    /**
     * Normalizes the characters of an array in place.
     *
     * @param charArray the characters to normalize (asserted to be non-null); overwritten with the result
     */
    public static void normalization(char[] charArray)
    {
        assert charArray != null;
        for (int i = 0; i < charArray.length; i++)
        {
            charArray[i] = CONVERT[charArray[i]];
        }
    }
}
}
25 changes: 25 additions & 0 deletions src/test/java/com/hankcs/test/corpus/TestCorpusLoader.java
Original file line number Diff line number Diff line change
Expand Up @@ -210,4 +210,29 @@ public void handle(Document document)
});

}

/**
 * Some quotation marks in the corpus are wrong: walks the corpus and prints every
 * word longer than one character whose value ends with an ASCII double quote.
 *
 * @throws Exception if walking the corpus fails
 */
public void testFindQuote() throws Exception
{
    // Handler invoked by CorpusLoader.walk for each document it hands over.
    CorpusLoader.Handler quoteFinder = new CorpusLoader.Handler()
    {
        @Override
        public void handle(Document document)
        {
            for (List<Word> sentence : document.getSimpleSentenceList())
            {
                for (Word candidate : sentence)
                {
                    // Skip single-character words and words not ending in a double quote.
                    if (candidate.value.length() <= 1 || !candidate.value.endsWith("\""))
                    {
                        continue;
                    }
                    System.out.println(candidate);
                }
            }
        }
    };
    CorpusLoader.walk("D:\\Doc\\语料库\\2014_hankcs\\", quoteFinder);
}
}
5 changes: 2 additions & 3 deletions src/test/java/com/hankcs/test/seg/TestSegment.java
Original file line number Diff line number Diff line change
Expand Up @@ -255,11 +255,10 @@ public void testTryToCrashSegment() throws Exception

/**
 * Runs the CRF segmenter over a few sample sentences and prints each result.
 * Debug output is switched on and HanLP.Config.ShowTermNature is set to false
 * before the segmenter is created.
 *
 * @throws Exception if segmentation fails
 */
public void testCRFSegment() throws Exception
{
    HanLP.Config.enableDebug();
    HanLP.Config.ShowTermNature = false;
    Segment crfSegmenter = new CRFSegment();
    String[] samples = new String[]
    {
        "尼玛不是新词,王尼玛是新词",
        "周杰伦在出品范特西之后,又出品了依然范特西",
        "“碰瓷”后依靠罚球得分一直是哈登的主要得分手段"
    };
    // Segment and print the samples one by one, in the listed order.
    for (String sample : samples)
    {
        System.out.println(crfSegmenter.seg(sample));
    }
}

public void testIssue16() throws Exception
Expand Down

0 comments on commit 4990d1c

Please sign in to comment.