Skip to content

Commit

Permalink
新增一个提高用户词典优先级的开关:#633
Browse files Browse the repository at this point in the history
  • Loading branch information
hankcs committed Sep 23, 2017
1 parent 5a5a056 commit c57895f
Show file tree
Hide file tree
Showing 6 changed files with 49 additions and 15 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -12,14 +12,12 @@
package com.hankcs.hanlp.dictionary;

import com.hankcs.hanlp.HanLP;
import com.hankcs.hanlp.corpus.io.ByteArray;
import com.hankcs.hanlp.corpus.io.IOUtil;
import com.hankcs.hanlp.seg.common.Vertex;
import com.hankcs.hanlp.utility.Predefine;

import java.io.*;
import java.util.Arrays;
import java.util.Map;
import java.util.Set;
import java.util.TreeMap;
import java.util.logging.Level;

Expand Down Expand Up @@ -271,9 +269,14 @@ public static int getBiFrequency(String a, String b)
*/
public static int getBiFrequency(int idA, int idB)
{
if (idA == -1 || idB == -1)
// 负数id表示来自用户词典的词语的词频(用户自定义词语没有id),返回正值增加其亲和度
if (idA < 0)
{
return 1000; // -1表示用户词典,返回正值增加其亲和度
return -idA;
}
if (idB < 0)
{
return -idB;
}
int index = binarySearch(pair, start[idA], start[idA + 1] - start[idA], idB);
if (index < 0) return 0;
Expand Down
4 changes: 4 additions & 0 deletions src/main/java/com/hankcs/hanlp/seg/Config.java
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,10 @@ public class Config
* 是否加载用户词典
*/
public boolean useCustomDictionary = true;
/**
 * Whether the user (custom) dictionary is given high priority during segmentation.
 * Off by default.
 */
public boolean forceCustomDictionary = false;
/**
* 词性标注
*/
Expand Down
16 changes: 16 additions & 0 deletions src/main/java/com/hankcs/hanlp/seg/Segment.java
Original file line number Diff line number Diff line change
Expand Up @@ -618,6 +618,22 @@ public Segment enableCustomDictionary(boolean enable)
return this;
}

/**
 * Enables or disables forcing of the user (custom) dictionary, i.e. giving the user
 * dictionary the highest possible priority.<br>
 * Warning: the concrete behavior is decided by each subclass and may break the statistical
 * properties of the segmenter (for example, if the user dictionary contains “和服”, the
 * segmentation result of “商品和服务” may be affected by the user dictionary's high priority).
 *
 * @param enable {@code true} to force the user dictionary, which also calls
 *               {@code enableCustomDictionary(true)}; {@code false} to stop forcing while
 *               leaving the user-dictionary switch untouched
 * @return this segmenter
 *
 * @since 1.3.5
 */
public Segment enableCustomDictionaryForcing(boolean enable)
{
    if (enable)
    {
        // Forcing requires the user dictionary to be in use. Turning forcing OFF must not
        // silently switch the user dictionary back on for callers that disabled it.
        enableCustomDictionary(true);
    }
    config.forceCustomDictionary = enable;
    return this;
}

/**
* 是否启用音译人名识别
*
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
package com.hankcs.hanlp.seg;

import com.hankcs.hanlp.algorithm.Viterbi;
import com.hankcs.hanlp.collection.AhoCorasick.AhoCorasickDoubleArrayTrie;
import com.hankcs.hanlp.collection.trie.DoubleArrayTrie;
import com.hankcs.hanlp.corpus.tag.Nature;
import com.hankcs.hanlp.dictionary.*;
Expand Down Expand Up @@ -434,15 +435,18 @@ protected void GenerateWordNet(final WordNet wordNetStorage)
{
wordNetStorage.add(searcher.begin + 1, new Vertex(new String(charArray, searcher.begin, searcher.length), searcher.value, searcher.index));
}
// 用户词典查询
// if (config.useCustomDictionary)
// {
// searcher = CustomDictionary.dat.getSearcher(charArray, 0);
// while (searcher.next())
// {
// wordNetStorage.add(searcher.begin + 1, new Vertex(new String(charArray, searcher.begin, searcher.length), searcher.value));
// }
// }
// 强制用户词典查询
if (config.forceCustomDictionary)
{
CustomDictionary.parseText(charArray, new AhoCorasickDoubleArrayTrie.IHit<CoreDictionary.Attribute>()
{
@Override
public void hit(int begin, int end, CoreDictionary.Attribute value)
{
wordNetStorage.add(begin + 1, new Vertex(new String(charArray, begin, end - begin), value));
}
});
}
// 原子分词,保证图连通
LinkedList<Vertex>[] vertexes = wordNetStorage.getVertexes();
for (int i = 1; i < vertexes.length; )
Expand Down
2 changes: 1 addition & 1 deletion src/main/java/com/hankcs/hanlp/seg/common/Vertex.java
Original file line number Diff line number Diff line change
Expand Up @@ -88,7 +88,7 @@ public void updateFrom(Vertex from)
*/
public Vertex(String word, String realWord, CoreDictionary.Attribute attribute)
{
    // This overload receives no word id, so the id is encoded as the negated total frequency
    // of the attribute. A null attribute falls back to -1 so that the delegation itself
    // cannot throw a NullPointerException.
    // NOTE(review): a total frequency of 0 yields id 0, which is not negative — confirm
    // whether callers can pass such an attribute.
    this(word, realWord, attribute, attribute == null ? -1 : -attribute.totalFrequency);
}

public Vertex(String word, String realWord, CoreDictionary.Attribute attribute, int wordID)
Expand Down
7 changes: 7 additions & 0 deletions src/test/java/com/hankcs/test/seg/TestSegment.java
Original file line number Diff line number Diff line change
Expand Up @@ -443,4 +443,11 @@ public void testIssue542() throws Exception
seg.enableNumberQuantifierRecognize(true);
System.out.println(seg.seg("一分钟就累了"));
}

/**
 * Exercises issue #633: adds a word to the custom dictionary, turns on custom-dictionary
 * forcing on the shared {@code StandardTokenizer.SEGMENT}, and prints the segmentation of a
 * sentence that starts with that word.
 *
 * @throws Exception if segmentation fails
 */
public void testIssue633() throws Exception
{
    CustomDictionary.add("钱管家");
    StandardTokenizer.SEGMENT.enableCustomDictionaryForcing(true);
    try
    {
        System.out.println(HanLP.segment("钱管家中怎么绑定网银"));
    }
    finally
    {
        // SEGMENT is a shared static instance; switch forcing back off so the setting
        // does not leak into tests that run after this one.
        StandardTokenizer.SEGMENT.enableCustomDictionaryForcing(false);
    }
}
}

0 comments on commit c57895f

Please sign in to comment.