Skip to content

Commit

Permalink
Add open function in SegmentMapper and StopWordsRemoverMapper.
Browse files Browse the repository at this point in the history
See #94
  • Loading branch information
lqb11 authored and shaomeng.wang committed Jun 4, 2020
1 parent 502fa68 commit 2c1a195
Show file tree
Hide file tree
Showing 4 changed files with 16 additions and 4 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -16,10 +16,14 @@
* Segment Chinese document into words.
*/
public class SegmentMapper extends SISOMapper {
private JiebaSegmenter segmentor;
private transient JiebaSegmenter segmentor = null;

public SegmentMapper(TableSchema dataSchema, Params params) {
super(dataSchema, params);
}

@Override
public void open(){
segmentor = new JiebaSegmenter();
String[] userDefinedDict = this.params.get(SegmentParams.USER_DEFINED_DICT);
if (null != userDefinedDict) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -22,17 +22,21 @@
* English stop words are from https://anoncvs.postgresql.org/cvsweb.cgi/pgsql/src/backend/snowball/stopwords/.
*/
public class StopWordsRemoverMapper extends SISOMapper {
private HashSet<String> stopWordsSet;
private transient HashSet<String> stopWordsSet = null;
private final boolean caseSensitive;
private static final Logger LOG = LoggerFactory.getLogger(StopWordsRemoverMapper.class);

public StopWordsRemoverMapper(TableSchema dataSchema, Params params) {
super(dataSchema, params);
this.caseSensitive = this.params.get(StopWordsRemoverParams.CASE_SENSITIVE);
}

@Override
public void open(){
this.stopWordsSet = new HashSet<>();
String[] stopWords = this.params.get(StopWordsRemoverParams.STOP_WORDS);
if (null != stopWords) {
for(String stopWord : stopWords){
for (String stopWord : stopWords) {
stopWordsSet.add(caseSensitive ? stopWord : stopWord.toLowerCase());
}
}
Expand Down Expand Up @@ -74,7 +78,7 @@ protected Object mapColumn(Object input) {
StringBuilder sbd = new StringBuilder();
String[] tokens = content.split(NLPConstant.WORD_DELIMITER);
for (String token : tokens) {
if (stopWordsSet.contains(caseSensitive ? token : token.toLowerCase())) {
if (token.isEmpty() || this.stopWordsSet.contains(caseSensitive ? token : token.toLowerCase())) {
continue;
}
sbd.append(token).append(NLPConstant.WORD_DELIMITER);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ public void test1() throws Exception {
.set(SegmentParams.SELECTED_COL, "sentence");

SegmentMapper mapper = new SegmentMapper(schema, params);
mapper.open();

assertEquals(mapper.map(Row.of("我们辅助用户简单快速低成本低风险的实现系统权限安全管理")).getField(0),
"我们 辅助 用户 简单 快速 低成本 低 风险 的 实现 系统 权限 安全 管理");
Expand All @@ -36,6 +37,7 @@ public void test2() throws Exception {
.set(SegmentParams.USER_DEFINED_DICT, dictArray);

SegmentMapper mapper = new SegmentMapper(schema, params);
mapper.open();

assertEquals(mapper.map(Row.of("我们辅助用户简单快速低成本低风险的实现系统权限安全管理")).getField(0),
"我们 辅助 用户 简单 快速 低成本 低风险 的 实现 系统 权限 安全 管理");
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ public void testStopWords() throws Exception {
.set(StopWordsRemoverParams.STOP_WORDS, new String[]{"Test"});

StopWordsRemoverMapper mapper = new StopWordsRemoverMapper(schema, params);
mapper.open();

assertEquals(mapper.map(Row.of("This is a unit test for filtering stopWords")).getField(0),
"unit filtering stopWords");
Expand All @@ -42,6 +43,7 @@ public void testCaseSensitive() throws Exception {
.set(StopWordsRemoverParams.STOP_WORDS, new String[]{"Test"});

StopWordsRemoverMapper mapper = new StopWordsRemoverMapper(schema, params);
mapper.open();

assertEquals(mapper.map(Row.of("This is a unit test for filtering stopWords")).getField(0),
"This unit test filtering stopWords");
Expand Down

0 comments on commit 2c1a195

Please sign in to comment.