-
Notifications
You must be signed in to change notification settings - Fork 860
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
1 parent
b10efbe
commit c055e8f
Showing
12 changed files
with
619 additions
and
7 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,36 @@ | ||
/* | ||
Copyright (c) 2023, Hugo W.L. ter Doest | ||
Permission is hereby granted, free of charge, to any person obtaining a copy | ||
of this software and associated documentation files (the "Software"), to deal | ||
in the Software without restriction, including without limitation the rights | ||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell | ||
copies of the Software, and to permit persons to whom the Software is | ||
furnished to do so, subject to the following conditions: | ||
The above copyright notice and this permission notice shall be included in | ||
all copies or substantial portions of the Software. | ||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | ||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | ||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE | ||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER | ||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, | ||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN | ||
THE SOFTWARE. | ||
*/ | ||
|
||
'use strict' | ||
|
||
import { AggressiveTokenizerDe } from 'lib/natural' | ||
const tokenizer = new AggressiveTokenizerDe() | ||
|
||
describe('aggressive_tokenizer', function () { | ||
it('should tokenize strings with diacritics ä, ö and ü, and esszet ß', function () { | ||
expect(tokenizer.tokenize('Es werden nur Maßnahmen gefördert, die nicht aufgrund einer Rechtsvorschrift umgesetzt werden müssen.')).toEqual( | ||
['Es', 'werden', 'nur', 'Maßnahmen', 'gefördert', 'die', 'nicht', 'aufgrund', 'einer', | ||
'Rechtsvorschrift', 'umgesetzt', 'werden', 'müssen']) | ||
expect(tokenizer.tokenize('Anträge sind vor Beginn der jeweiligen Maßnahme zu stellen.')).toEqual( | ||
['Anträge', 'sind', 'vor', 'Beginn', 'der', 'jeweiligen', 'Maßnahme', 'zu', 'stellen']) | ||
}) | ||
}) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,39 @@ | ||
/* | ||
Copyright (c) 2011, Chris Umbel | ||
Permission is hereby granted, free of charge, to any person obtaining a copy | ||
of this software and associated documentation files (the "Software"), to deal | ||
in the Software without restriction, including without limitation the rights | ||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell | ||
copies of the Software, and to permit persons to whom the Software is | ||
furnished to do so, subject to the following conditions: | ||
The above copyright notice and this permission notice shall be included in | ||
all copies or substantial portions of the Software. | ||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | ||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | ||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE | ||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER | ||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, | ||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN | ||
THE SOFTWARE. | ||
*/ | ||
|
||
'use strict' | ||
|
||
import { AggressiveTokenizerEs } from 'lib/natural' | ||
const tokenizer = new AggressiveTokenizerEs() | ||
|
||
describe('aggressive_tokenizer_es', function () { | ||
it('should tokenize strings', function () { | ||
expect(tokenizer.tokenize('hola yo me llamo eduardo y esudié ingeniería')).toEqual(['hola', 'yo', 'me', 'llamo', 'eduardo', 'y', 'esudié', 'ingeniería']) | ||
}) | ||
|
||
/* | ||
it('should tokenize strings via attached string method', function() { | ||
tokenizer.attach(); | ||
expect('hola yo me llamo eduardo y esudié ingeniería'.tokenize()).toEqual(['hola', 'yo', 'me', 'llamo', 'eduardo', 'y', 'esudié', 'ingeniería']); | ||
}); | ||
*/ | ||
}) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,111 @@ | ||
/* | ||
Copyright (c) 2011, Chris Umbel | ||
Permission is hereby granted, free of charge, to any person obtaining a copy | ||
of this software and associated documentation files (the "Software"), to deal | ||
in the Software without restriction, including without limitation the rights | ||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell | ||
copies of the Software, and to permit persons to whom the Software is | ||
furnished to do so, subject to the following conditions: | ||
The above copyright notice and this permission notice shall be included in | ||
all copies or substantial portions of the Software. | ||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | ||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | ||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE | ||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER | ||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, | ||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN | ||
THE SOFTWARE. | ||
*/ | ||
|
||
'use strict' | ||
|
||
import { AggressiveTokenizerFr } from 'lib/natural' | ||
const tokenizer = new AggressiveTokenizerFr() | ||
|
||
const text = "Affectueusement surnommé « Gabo » dans toute l'Amérique latine, le Colombien Gabriel Garcia Marquez, prix Nobel de littérature 1982, l'un des plus grands écrivains du XXe siècle, est mort À son domicile de Mexico jeudi 17 avril. Il était âgé de 87 ans. Son Œuvre a été traduite dans toutes les langues ou presque, et vendue à quelque 50 millions d'exemplaires." | ||
|
||
// Expected token list for `text`, in order of appearance. Elided articles are | ||
// split at the apostrophe (l'Amérique -> 'l', 'Amérique'; d'exemplaires -> | ||
// 'd', 'exemplaires'), and the guillemets, commas and periods of `text` have | ||
// no corresponding entries. | ||
const tokenized = ['Affectueusement', | ||
'surnommé', | ||
'Gabo', | ||
'dans', | ||
'toute', | ||
'l', | ||
'Amérique', | ||
'latine', | ||
'le', | ||
'Colombien', | ||
'Gabriel', | ||
'Garcia', | ||
'Marquez', | ||
'prix', | ||
'Nobel', | ||
'de', | ||
'littérature', | ||
'1982', | ||
'l', | ||
'un', | ||
'des', | ||
'plus', | ||
'grands', | ||
'écrivains', | ||
'du', | ||
'XXe', | ||
'siècle', | ||
'est', | ||
'mort', | ||
'À', | ||
'son', | ||
'domicile', | ||
'de', | ||
'Mexico', | ||
'jeudi', | ||
'17', | ||
'avril', | ||
'Il', | ||
'était', | ||
'âgé', | ||
'de', | ||
'87', | ||
'ans', | ||
'Son', | ||
'Œuvre', | ||
'a', | ||
'été', | ||
'traduite', | ||
'dans', | ||
'toutes', | ||
'les', | ||
'langues', | ||
'ou', | ||
'presque', | ||
'et', | ||
'vendue', | ||
'à', | ||
'quelque', | ||
'50', | ||
'millions', | ||
'd', | ||
'exemplaires'] | ||
|
||
describe('aggressive_tokenizer_fr', function () { | ||
it('should tokenize strings', function () { | ||
expect(tokenizer.tokenize(text)).toEqual(tokenized) | ||
}) | ||
|
||
it('should handle hyphens in words correctly', function () { | ||
const sentence = 'Des sous-pages dans le sous-bois de la ville de Paris' | ||
const res = tokenizer.tokenize(sentence) | ||
const expectedRes = ['Des', 'sous-pages', 'dans', 'le', 'sous-bois', 'de', 'la', 'ville', 'de', 'Paris'] | ||
expect(res).toEqual(expectedRes) | ||
}) | ||
|
||
/* | ||
it('should tokenize strings via attached string method', function() { | ||
tokenizer.attach(); | ||
expect(text.tokenize()).toEqual(tokenized); | ||
}); | ||
*/ | ||
}) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,70 @@ | ||
/* | ||
file aggressive_tokenizer_hi_spec.js , located at spec\aggressive_tokenizer_hi_spec.js is licensed as follows: | ||
- (The MIT License) | ||
- Copyright (c) 2023 Mukesh Singh Bisht | ||
Permission is hereby granted, free of charge, to any person or entity obtaining a copy | ||
of file aggressive_tokenizer_hi_spec.js and its content(the "Software"), to deal in the | ||
Software without restriction, including without limitation the rights to use, copy, | ||
modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, | ||
and to permit persons to whom the Software is furnished to do so, subject to the following | ||
conditions: | ||
1. The above copyright notice and this permission notice shall be included in all copies | ||
or substantial portions of the Software. | ||
2. Proper credit must be given to the original author Mukesh Singh Bisht, along with the | ||
date of authorship specified as July 23, 2023, in any usage, distribution, or | ||
modification of the Software. | ||
THIS SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, | ||
INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR | ||
PURPOSE, AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE | ||
FOR ANY CLAIM, DAMAGES, OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT, OR | ||
OTHERWISE, ARISING FROM, OUT OF, OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER | ||
DEALINGS IN THE SOFTWARE. | ||
*/ | ||
|
||
'use strict' | ||
|
||
import { AggressiveTokenizerHi } from 'lib/natural' | ||
const tokenizer = new AggressiveTokenizerHi() | ||
|
||
describe('aggressive_tokenizer_hi', function () { | ||
it('should tokenize strings', function () { | ||
const string = 'स्वतंत्रता दिवस की हार्दिक शुभकामनाएं' | ||
const expectedArray = ['स्वतंत्रता', 'दिवस', 'की', 'हार्दिक', 'शुभकामनाएं'] | ||
expect(tokenizer.tokenize(string)).toEqual(expectedArray) | ||
}) | ||
it('should tokenize strings including english words', function () { | ||
const string = 'स्वतंत्रता दिवस की हार्दिक शुभकामनाएं congrats mukesh' | ||
const expectedArray = ['स्वतंत्रता', 'दिवस', 'की', 'हार्दिक', 'शुभकामनाएं', 'congrats', 'mukesh'] | ||
expect(tokenizer.tokenize(string)).toEqual(expectedArray) | ||
}) | ||
|
||
it('should swallow viram(stop) symbols', function () { | ||
const string = 'स्वतंत्रता दिवस की हार्दिक शुभकामनाएं । congrats mukesh' | ||
const expectedArray = ['स्वतंत्रता', 'दिवस', 'की', 'हार्दिक', 'शुभकामनाएं', 'congrats', 'mukesh'] | ||
expect(tokenizer.tokenize(string)).toEqual(expectedArray) | ||
}) | ||
it('should swallow dirgh viram(stop) symbols', function () { | ||
const string = 'राजा बाजीराव ॥ ' | ||
const expectedArray = ['राजा', 'बाजीराव'] | ||
expect(tokenizer.tokenize(string)).toEqual(expectedArray) | ||
}) | ||
it('should swallow trailing off symbols', function () { | ||
const string = 'राजा बाजीराव ...' | ||
const expectedArray = ['राजा', 'बाजीराव'] | ||
expect(tokenizer.tokenize(string)).toEqual(expectedArray) | ||
}) | ||
it('should swallow question mark symbols', function () { | ||
const string = 'राजा बाजीराव ?' | ||
const expectedArray = ['राजा', 'बाजीराव'] | ||
expect(tokenizer.tokenize(string)).toEqual(expectedArray) | ||
}) | ||
it('should swallow comma mark symbols', function () { | ||
const string = 'राजा, बाजीराव' | ||
const expectedArray = ['राजा', 'बाजीराव'] | ||
expect(tokenizer.tokenize(string)).toEqual(expectedArray) | ||
}) | ||
}) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,39 @@ | ||
/* | ||
Copyright (c) 2011, Chris Umbel, Martijn de Boer | ||
Permission is hereby granted, free of charge, to any person obtaining a copy | ||
of this software and associated documentation files (the "Software"), to deal | ||
in the Software without restriction, including without limitation the rights | ||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell | ||
copies of the Software, and to permit persons to whom the Software is | ||
furnished to do so, subject to the following conditions: | ||
The above copyright notice and this permission notice shall be included in | ||
all copies or substantial portions of the Software. | ||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | ||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | ||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE | ||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER | ||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, | ||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN | ||
THE SOFTWARE. | ||
*/ | ||
|
||
'use strict' | ||
|
||
import { AggressiveTokenizerNl } from 'lib/natural' | ||
const tokenizer = new AggressiveTokenizerNl() | ||
|
||
describe('aggressive_tokenizer_nl', function () { | ||
it('should tokenize strings', function () { | ||
expect(tokenizer.tokenize('\'s Morgens is het nog erg koud, vertelde de weerman over een van de radio\'s')).toEqual(['\'s', 'Morgens', 'is', 'het', 'nog', 'erg', 'koud', 'vertelde', 'de', 'weerman', 'over', 'een', 'van', 'de', 'radio\'s']) | ||
}) | ||
|
||
it('should handle hyphens in words correctly', function () { | ||
const sentence = 'clearing-systeem front-office-automatisering christelijk-historisch mond-op-mond, kant-en-klaar, kruidje-roer-me-niet, doe-het-zelver' | ||
const res = tokenizer.tokenize(sentence) | ||
const expectedRes = ['clearing-systeem', 'front-office-automatisering', 'christelijk-historisch', 'mond-op-mond', 'kant-en-klaar', 'kruidje-roer-me-niet', 'doe-het-zelver'] | ||
expect(res).toEqual(expectedRes) | ||
}) | ||
}) |
Oops, something went wrong.