Skip to content

Commit

Permalink
Adding Swahili stopword list
Browse files Browse the repository at this point in the history
  • Loading branch information
eklem committed Mar 7, 2019
1 parent 96b1d8f commit e058faf
Show file tree
Hide file tree
Showing 7 changed files with 44 additions and 3 deletions.
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,7 @@ Arrays of stopwords for the following languages are supplied:
* `pa_in` - Punjabi Gurmukhi
* `ru` - Russian
* `sv` - Swedish
* `sw` - Swahili
* `zh` - Chinese Simplified
```javascript
Expand Down
1 change: 1 addition & 0 deletions lib/stopword.js
Original file line number Diff line number Diff line change
Expand Up @@ -31,4 +31,5 @@ exports.pl = require('./stopwords_pl.js').words
exports.pt = require('./stopwords_pt.js').words
exports.ru = require('./stopwords_ru.js').words
exports.sv = require('./stopwords_sv.js').words
exports.sw = require('./stopwords_sw.js').words
exports.zh = require('./stopwords_zh.js').words
2 changes: 1 addition & 1 deletion lib/stopwords_he.js
Original file line number Diff line number Diff line change
Expand Up @@ -199,7 +199,7 @@ var words = [
"שם",
"תהיה",
"תחת"
];
]

exports.words = words

32 changes: 32 additions & 0 deletions lib/stopwords_sw.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
/*
The MIT License (MIT)
Frequency sorted
*/

// Swahili - Stop words list. A list of commonly used words that have little
// meaning and can be excluded from analysis.

// This list is frequency sorted. That means it can be sliced from the bottom
// to be less aggressive in excluding stopwords.

// Swahili stopwords. The list is frequency sorted (see the header comment),
// so the order of the entries is significant: do not re-sort or regroup them.
var words = [
  'na', 'ya', 'wa', 'kwa', 'ni', 'za', 'katika', 'la',
  'kuwa', 'kama', 'kwamba', 'cha', 'hiyo', 'lakini', 'yake', 'hata',
  'wakati', 'hivyo', 'sasa', 'wake', 'au', 'watu', 'hii', 'zaidi',
  'vya', 'huo', 'tu', 'kwenye', 'si', 'pia', 'ili', 'moja',
  'kila', 'baada', 'ambao', 'ambayo', 'yao', 'wao', 'kuna', 'hilo',
  'kutoka', 'kubwa', 'pamoja', 'bila', 'huu', 'hayo', 'sana', 'ndani',
  'mkuu', 'hizo', 'kufanya', 'wengi', 'hadi', 'mmoja', 'hili', 'juu',
  'kwanza', 'wetu', 'kuhusu', 'baadhi', 'wote', 'yetu', 'hivi', 'kweli',
  'mara', 'wengine', 'nini', 'ndiyo', 'zao', 'kati', 'hao', 'hapa',
  'kutokana', 'muda', 'habari', 'ambaye', 'wenye', 'nyingine', 'hakuna', 'tena',
  'hatua', 'bado', 'nafasi', 'basi', 'kabisa', 'hicho', 'nje', 'huyo',
  'vile', 'yote', 'mkubwa', 'alikuwa', 'zote', 'leo', 'haya', 'huko',
  'kutoa', 'mwa', 'kiasi', 'hasa', 'nyingi', 'kabla', 'wale', 'chini',
  'gani', 'hapo', 'lazima', 'mwingine', 'bali', 'huku', 'zake', 'ilikuwa',
  'tofauti', 'kupata', 'mbalimbali', 'pale', 'kusema', 'badala', 'wazi', 'yeye',
  'alisema', 'hawa', 'ndio', 'hizi', 'tayari', 'wala', 'muhimu', 'ile',
  'mpya', 'ambazo', 'dhidi', 'kwenda', 'sisi', 'kwani', 'jinsi', 'binafsi',
  'kutumia', 'mbili', 'mbali', 'kuu', 'mengine', 'mbele', 'namna', 'mengi',
  'upande'
]

// CommonJS export: expose the array above as this module's `words` property.
exports.words = words

2 changes: 1 addition & 1 deletion package-lock.json

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion package.json
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
{
"name": "stopword",
"version": "0.1.14",
"version": "0.1.15",
"description": "A module for node.js that takes in text and returns text that is stripped of stopwords. Has pre-defined stopword lists for 20 languages and also takes lists with custom stopwords as input.",
"main": "lib/stopword.js",
"scripts": {
Expand Down
7 changes: 7 additions & 0 deletions test/test.js
Original file line number Diff line number Diff line change
Expand Up @@ -145,6 +145,13 @@ describe('general stopwordiness:', function () {
const newString = sw.removeStopwords(oldString, sw.my)
newString.should.eql([''])
})

it('should remove swahili stopwords', function () {
  // Sample Swahili sentence, tokenised on single spaces.
  const sentence = 'kila mtu anaweza kuhariri makala yoyote kutoa makosa ya lugha kutohoa maneno na kuendeleza na kukuza makala kwa kuandika kwa ufupi au kwa urefu'
  const tokens = sentence.split(' ')

  // Tokens expected to remain once every entry of the Swahili list is removed.
  const expected = [
    'mtu', 'anaweza', 'kuhariri', 'makala', 'yoyote',
    'makosa', 'lugha', 'kutohoa', 'maneno', 'kuendeleza',
    'kukuza', 'makala', 'kuandika', 'ufupi', 'urefu'
  ]

  sw.removeStopwords(tokens, sw.sw).should.eql(expected)
})



// Right to Left languages
Expand Down

0 comments on commit e058faf

Please sign in to comment.