Added spec for Brill POS trainer

NaturalNode · Mar 27, 2024 · c055e8f · c055e8f
1 parent b10efbe
commit c055e8f
Show file tree

Hide file tree

Showing 12 changed files with 619 additions and 7 deletions.
diff --git a/lib/natural/brill_pos_tagger/index.d.ts b/lib/natural/brill_pos_tagger/index.d.ts
@@ -33,7 +33,8 @@ declare interface RuleTemplatesItem {
   parameter2Values?: (sentence: Sentence, i: number) => string[]
 }
 
-export type RuleTemplates = Record<string, RuleTemplatesItem | undefined>
+export type RuleTemplates = Record<string, RuleTemplatesItem>
+export let ruleTemplates: RuleTemplates
 
 export class RuleTemplate {
   constructor (templateName: string, metadata: RuleTemplatesItem)
@@ -94,7 +95,7 @@ export class Lexicon {
 }
 
 declare class Corpus {
-  constructor (data: string | Corpus, typeOfCorpus: number, SentenceClass: typeof Sentence)
+  constructor (data: string | TaggedCorpus, typeOfCorpus: number, SentenceClass: typeof Sentence)
   private readonly wordCount: number
   private readonly sentences: Sentence[]
   private readonly tagFrequencies: Record<string, Record<string, number> | undefined>
@@ -117,6 +118,15 @@ declare interface BrillPOSTaggedWord {
   tag: string
 }
 
+declare interface BrillPOSTaggedSentence {
+  taggedWords: BrillPOSTaggedWord[]
+}
+
+declare interface TaggedCorpus {
+  wordCount: number
+  sentences: BrillPOSTaggedSentence[]
+}
+
 export class Sentence {
   constructor (data?: string[])
   taggedWords: BrillPOSTaggedWord[]
@@ -134,9 +144,6 @@ export class BrillPOSTagger {
 }
 
 export class BrillPOSTester {
-  constructor (lexicon: Lexicon, ruleSet: RuleSet)
-  private readonly lexicon: Lexicon
-  private readonly ruleSet: RuleSet
   test (corpus: Corpus, tagger: BrillPOSTagger): [number, number]
 }
 
@@ -161,7 +168,7 @@ export class BrillPOSTrainer {
   private scanForPositiveRules (): void
   private scanForSites (): void
   private neighbourhood (i: number, j: number): Array<[number, number]>
-  train (corpus: Corpus, templates: RuleTemplates, lexicon: Lexicon): RuleSet
+  train (corpus: Corpus, templates: RuleTemplate[], lexicon: Lexicon): RuleSet
   printRulesWithScores (): string
 }
 

diff --git a/lib/natural/brill_pos_tagger/index.js b/lib/natural/brill_pos_tagger/index.js
@@ -27,7 +27,7 @@ exports.BrillPOSTrainer = require('./lib/Brill_POS_Trainer')
 exports.BrillPOSTester = require('./lib/Brill_POS_Tester')
 exports.Lexicon = require('./lib/Lexicon')
 exports.RuleSet = require('./lib/RuleSet')
-exports.RuleTemplates = require('./lib/RuleTemplates')
+exports.ruleTemplates = require('./lib/RuleTemplates')
 exports.RuleTemplate = require('./lib/RuleTemplate')
 exports.Corpus = require('./lib/Corpus')
 exports.Sentence = require('./lib/Sentence')
diff --git a/ts_spec/aggressive_tokenizer_de_spec.ts b/ts_spec/aggressive_tokenizer_de_spec.ts
@@ -0,0 +1,36 @@
+/*
+Copyright (c) 2023, Hugo W.L. ter Doest
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+'use strict'
+
+import { AggressiveTokenizerDe } from 'lib/natural'
+const tokenizer = new AggressiveTokenizerDe()
+
+describe('aggressive_tokenizer', function () {
+  it('should tokenize strings with diacritics ä, ö and ü, and esszet ß', function () {
+    expect(tokenizer.tokenize('Es werden nur Maßnahmen gefördert, die nicht aufgrund einer Rechtsvorschrift umgesetzt werden müssen.')).toEqual(
+      ['Es', 'werden', 'nur', 'Maßnahmen', 'gefördert', 'die', 'nicht', 'aufgrund', 'einer',
+        'Rechtsvorschrift', 'umgesetzt', 'werden', 'müssen'])
+    expect(tokenizer.tokenize('Anträge sind vor Beginn der jeweiligen Maßnahme zu stellen.')).toEqual(
+      ['Anträge', 'sind', 'vor', 'Beginn', 'der', 'jeweiligen', 'Maßnahme', 'zu', 'stellen'])
+  })
+})
diff --git a/ts_spec/aggressive_tokenizer_es_spec.ts b/ts_spec/aggressive_tokenizer_es_spec.ts
@@ -0,0 +1,39 @@
+/*
+Copyright (c) 2011, Chris Umbel
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+'use strict'
+
+import { AggressiveTokenizerEs } from 'lib/natural'
+const tokenizer = new AggressiveTokenizerEs()
+
+describe('aggressive_tokenizer_es', function () {
+  it('should tokenize strings', function () {
+    expect(tokenizer.tokenize('hola yo me llamo eduardo y esudié ingeniería')).toEqual(['hola', 'yo', 'me', 'llamo', 'eduardo', 'y', 'esudié', 'ingeniería'])
+  })
+
+  /*
+  it('should tokenize strings via attached string method', function() {
+    tokenizer.attach();
+    expect('hola yo me llamo eduardo y esudié ingeniería'.tokenize()).toEqual(['hola', 'yo', 'me', 'llamo', 'eduardo', 'y', 'esudié', 'ingeniería']);
+  });
+  */
+})
diff --git a/ts_spec/aggressive_tokenizer_fr_spec.ts b/ts_spec/aggressive_tokenizer_fr_spec.ts
@@ -0,0 +1,111 @@
+/*
+Copyright (c) 2011, Chris Umbel
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+'use strict'
+
+import { AggressiveTokenizerFr } from 'lib/natural'
+const tokenizer = new AggressiveTokenizerFr()
+
+const text = "Affectueusement surnommé « Gabo » dans toute l'Amérique latine, le Colombien Gabriel Garcia Marquez, prix Nobel de littérature 1982, l'un des plus grands écrivains du XXe siècle, est mort À son domicile de Mexico jeudi 17 avril. Il était âgé de 87 ans. Son Œuvre a été traduite dans toutes les langues ou presque, et vendue à quelque 50 millions d'exemplaires."
+
+const tokenized = ['Affectueusement',
+  'surnommé',
+  'Gabo',
+  'dans',
+  'toute',
+  'l',
+  'Amérique',
+  'latine',
+  'le',
+  'Colombien',
+  'Gabriel',
+  'Garcia',
+  'Marquez',
+  'prix',
+  'Nobel',
+  'de',
+  'littérature',
+  '1982',
+  'l',
+  'un',
+  'des',
+  'plus',
+  'grands',
+  'écrivains',
+  'du',
+  'XXe',
+  'siècle',
+  'est',
+  'mort',
+  'À',
+  'son',
+  'domicile',
+  'de',
+  'Mexico',
+  'jeudi',
+  '17',
+  'avril',
+  'Il',
+  'était',
+  'âgé',
+  'de',
+  '87',
+  'ans',
+  'Son',
+  'Œuvre',
+  'a',
+  'été',
+  'traduite',
+  'dans',
+  'toutes',
+  'les',
+  'langues',
+  'ou',
+  'presque',
+  'et',
+  'vendue',
+  'à',
+  'quelque',
+  '50',
+  'millions',
+  'd',
+  'exemplaires']
+
+describe('aggressive_tokenizer_fr', function () {
+  it('should tokenize strings', function () {
+    expect(tokenizer.tokenize(text)).toEqual(tokenized)
+  })
+
+  it('should handle hyphens in words correctly', function () {
+    const sentence = 'Des sous-pages dans le sous-bois de la ville de Paris'
+    const res = tokenizer.tokenize(sentence)
+    const expectedRes = ['Des', 'sous-pages', 'dans', 'le', 'sous-bois', 'de', 'la', 'ville', 'de', 'Paris']
+    expect(res).toEqual(expectedRes)
+  })
+
+  /*
+  it('should tokenize strings via attached string method', function() {
+    tokenizer.attach();
+    expect(text.tokenize()).toEqual(tokenized);
+  });
+  */
+})
diff --git a/ts_spec/aggressive_tokenizer_hi_spec.ts b/ts_spec/aggressive_tokenizer_hi_spec.ts
@@ -0,0 +1,70 @@
+/*
+file aggressive_tokenizer_hi_spec.js , located at spec\aggressive_tokenizer_hi_spec.js is licensed as follows:
+- (The MIT License)
+- Copyright (c) 2023 Mukesh Singh Bisht
+
+Permission is hereby granted, free of charge, to any person or entity obtaining a copy
+of  file aggressive_tokenizer_hi_spec.js and its content(the "Software"), to deal in the
+Software without restriction, including without limitation the rights to use, copy,
+modify, merge, publish, distribute, sublicense, and/or sell copies of the Software,
+and to permit persons to whom the Software is furnished to do so, subject to the following
+conditions:
+
+1. The above copyright notice and this permission notice shall be included in all copies
+   or substantial portions of the Software.
+
+2. Proper credit must be given to the original author Mukesh Singh Bisht, along with the
+   date of authorship specified as July 23, 2023, in any usage, distribution, or
+   modification of the Software.
+
+THIS SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED,
+INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR
+PURPOSE, AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE
+FOR ANY CLAIM, DAMAGES, OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT, OR
+OTHERWISE, ARISING FROM, OUT OF, OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+DEALINGS IN THE SOFTWARE.
+*/
+
+'use strict'
+
+import { AggressiveTokenizerHi } from 'lib/natural'
+const tokenizer = new AggressiveTokenizerHi()
+
+describe('aggressive_tokenizer_hi', function () {
+  it('should tokenize strings', function () {
+    const string = 'स्वतंत्रता दिवस की हार्दिक शुभकामनाएं'
+    const expectedArray = ['स्वतंत्रता', 'दिवस', 'की', 'हार्दिक', 'शुभकामनाएं']
+    expect(tokenizer.tokenize(string)).toEqual(expectedArray)
+  })
+  it('should tokenize strings including english words', function () {
+    const string = 'स्वतंत्रता दिवस की हार्दिक शुभकामनाएं congrats mukesh'
+    const expectedArray = ['स्वतंत्रता', 'दिवस', 'की', 'हार्दिक', 'शुभकामनाएं', 'congrats', 'mukesh']
+    expect(tokenizer.tokenize(string)).toEqual(expectedArray)
+  })
+
+  it('should swallow viram(stop) symbols', function () {
+    const string = 'स्वतंत्रता दिवस की हार्दिक शुभकामनाएं । congrats mukesh'
+    const expectedArray = ['स्वतंत्रता', 'दिवस', 'की', 'हार्दिक', 'शुभकामनाएं', 'congrats', 'mukesh']
+    expect(tokenizer.tokenize(string)).toEqual(expectedArray)
+  })
+  it('should swallow dirgh viram(stop) symbols', function () {
+    const string = 'राजा बाजीराव ॥ '
+    const expectedArray = ['राजा', 'बाजीराव']
+    expect(tokenizer.tokenize(string)).toEqual(expectedArray)
+  })
+  it('should swallow trailing off symbols', function () {
+    const string = 'राजा बाजीराव ...'
+    const expectedArray = ['राजा', 'बाजीराव']
+    expect(tokenizer.tokenize(string)).toEqual(expectedArray)
+  })
+  it('should swallow question mark symbols', function () {
+    const string = 'राजा बाजीराव ?'
+    const expectedArray = ['राजा', 'बाजीराव']
+    expect(tokenizer.tokenize(string)).toEqual(expectedArray)
+  })
+  it('should swallow comma mark symbols', function () {
+    const string = 'राजा, बाजीराव'
+    const expectedArray = ['राजा', 'बाजीराव']
+    expect(tokenizer.tokenize(string)).toEqual(expectedArray)
+  })
+})
diff --git a/ts_spec/aggressive_tokenizer_nl_spec.ts b/ts_spec/aggressive_tokenizer_nl_spec.ts
@@ -0,0 +1,39 @@
+/*
+Copyright (c) 2011, Chris Umbel, Martijn de Boer
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+'use strict'
+
+import { AggressiveTokenizerNl } from 'lib/natural'
+const tokenizer = new AggressiveTokenizerNl()
+
+describe('aggressive_tokenizer_nl', function () {
+  it('should tokenize strings', function () {
+    expect(tokenizer.tokenize('\'s Morgens is het nog erg koud, vertelde de weerman over een van de radio\'s')).toEqual(['\'s', 'Morgens', 'is', 'het', 'nog', 'erg', 'koud', 'vertelde', 'de', 'weerman', 'over', 'een', 'van', 'de', 'radio\'s'])
+  })
+
+  it('should handle hyphens in words correctly', function () {
+    const sentence = 'clearing-systeem front-office-automatisering christelijk-historisch mond-op-mond, kant-en-klaar, kruidje-roer-me-niet, doe-het-zelver'
+    const res = tokenizer.tokenize(sentence)
+    const expectedRes = ['clearing-systeem', 'front-office-automatisering', 'christelijk-historisch', 'mond-op-mond', 'kant-en-klaar', 'kruidje-roer-me-niet', 'doe-het-zelver']
+    expect(res).toEqual(expectedRes)
+  })
+})