From e058faf93dfc1ca6e1e693163efe52a6336b2b9a Mon Sep 17 00:00:00 2001 From: Espen Klem Date: Thu, 7 Mar 2019 15:15:27 +0100 Subject: [PATCH] Adding Swahili stopword list --- README.md | 1 + lib/stopword.js | 1 + lib/stopwords_he.js | 2 +- lib/stopwords_sw.js | 32 ++++++++++++++++++++++++++++++++ package-lock.json | 2 +- package.json | 2 +- test/test.js | 7 +++++++ 7 files changed, 44 insertions(+), 3 deletions(-) create mode 100644 lib/stopwords_sw.js diff --git a/README.md b/README.md index f3c1b2c..6e361b3 100644 --- a/README.md +++ b/README.md @@ -71,6 +71,7 @@ Arrays of stopwords for the following languages are supplied: * `pa_in` - Punjabi Gurmukhi * `ru` - Russian * `sv` - Swedish +* `sw` - Swahili * `zh` - Chinese Simplified ```javascript diff --git a/lib/stopword.js b/lib/stopword.js index 25a072d..25e6f66 100644 --- a/lib/stopword.js +++ b/lib/stopword.js @@ -31,4 +31,5 @@ exports.pl = require('./stopwords_pl.js').words exports.pt = require('./stopwords_pt.js').words exports.ru = require('./stopwords_ru.js').words exports.sv = require('./stopwords_sv.js').words +exports.sw = require('./stopwords_sw.js').words exports.zh = require('./stopwords_zh.js').words diff --git a/lib/stopwords_he.js b/lib/stopwords_he.js index aabc651..81c5892 100644 --- a/lib/stopwords_he.js +++ b/lib/stopwords_he.js @@ -199,7 +199,7 @@ var words = [ "שם", "תהיה", "תחת" -]; +] exports.words = words diff --git a/lib/stopwords_sw.js b/lib/stopwords_sw.js new file mode 100644 index 0000000..6f7e826 --- /dev/null +++ b/lib/stopwords_sw.js @@ -0,0 +1,32 @@ +/* +The MIT License (MIT) + + +Frequency sorted +*/ + +// Swahili - Stop words list. A list of commonly used words that have little +// meaning and can be excluded from analysis. + +// This list is frequency sorted. 
That means it can be sliced from the bottom +// and be less aggressive in excluding stopwords + +var words = ["na","ya","wa","kwa","ni","za","katika","la","kuwa","kama", + "kwamba","cha","hiyo","lakini","yake","hata","wakati","hivyo","sasa","wake", + "au","watu","hii","zaidi","vya","huo","tu","kwenye","si","pia","ili","moja", + "kila","baada","ambao","ambayo","yao","wao","kuna","hilo","kutoka","kubwa", + "pamoja","bila","huu","hayo","sana","ndani","mkuu","hizo","kufanya","wengi", + "hadi","mmoja","hili","juu","kwanza","wetu","kuhusu","baadhi","wote","yetu", + "hivi","kweli","mara","wengine","nini","ndiyo","zao","kati","hao","hapa", + "kutokana","muda","habari","ambaye","wenye","nyingine","hakuna","tena", + "hatua","bado","nafasi","basi","kabisa","hicho","nje","huyo","vile","yote", + "mkubwa","alikuwa","zote","leo","haya","huko","kutoa","mwa","kiasi","hasa", + "nyingi","kabla","wale","chini","gani","hapo","lazima","mwingine","bali", + "huku","zake","ilikuwa","tofauti","kupata","mbalimbali","pale","kusema", + "badala","wazi","yeye","alisema","hawa","ndio","hizi","tayari","wala", + "muhimu","ile","mpya","ambazo","dhidi","kwenda","sisi","kwani","jinsi", + "binafsi","kutumia","mbili","mbali","kuu","mengine","mbele","namna","mengi", + "upande"] + +exports.words = words + diff --git a/package-lock.json b/package-lock.json index bb082c0..a826e04 100644 --- a/package-lock.json +++ b/package-lock.json @@ -1,6 +1,6 @@ { "name": "stopword", - "version": "0.1.13", + "version": "0.1.14", "lockfileVersion": 1, "requires": true, "dependencies": { diff --git a/package.json b/package.json index 64279bd..f4a673e 100644 --- a/package.json +++ b/package.json @@ -1,6 +1,6 @@ { "name": "stopword", - "version": "0.1.14", + "version": "0.1.15", "description": "A module for node.js that takes in text and returns text that is stripped of stopwords. 
Has pre-defined stopword lists for 20 languages and also takes lists with custom stopwords as input.", "main": "lib/stopword.js", "scripts": { diff --git a/test/test.js b/test/test.js index d85495d..a5531e2 100644 --- a/test/test.js +++ b/test/test.js @@ -145,6 +145,13 @@ describe('general stopwordiness:', function () { const newString = sw.removeStopwords(oldString, sw.my) newString.should.eql(['']) }) + + it('should remove swahili stopwords', function () { + const oldString = 'kila mtu anaweza kuhariri makala yoyote kutoa makosa ya lugha kutohoa maneno na kuendeleza na kukuza makala kwa kuandika kwa ufupi au kwa urefu'.split(' ') + const newString = sw.removeStopwords(oldString, sw.sw) + newString.should.eql(['mtu','anaweza','kuhariri','makala','yoyote','makosa','lugha','kutohoa','maneno','kuendeleza','kukuza','makala','kuandika','ufupi','urefu']) + }) + // Right to Left languages