Skip to content

Commit

Permalink
Added spec for Brill POS trainer
Browse files Browse the repository at this point in the history
  • Loading branch information
Hugo-ter-Doest committed Mar 27, 2024
1 parent b10efbe commit c055e8f
Show file tree
Hide file tree
Showing 12 changed files with 619 additions and 7 deletions.
19 changes: 13 additions & 6 deletions lib/natural/brill_pos_tagger/index.d.ts
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,8 @@ declare interface RuleTemplatesItem {
parameter2Values?: (sentence: Sentence, i: number) => string[]
}

export type RuleTemplates = Record<string, RuleTemplatesItem | undefined>
export type RuleTemplates = Record<string, RuleTemplatesItem>
export let ruleTemplates: RuleTemplates

export class RuleTemplate {
constructor (templateName: string, metadata: RuleTemplatesItem)
Expand Down Expand Up @@ -94,7 +95,7 @@ export class Lexicon {
}

declare class Corpus {
constructor (data: string | Corpus, typeOfCorpus: number, SentenceClass: typeof Sentence)
constructor (data: string | TaggedCorpus, typeOfCorpus: number, SentenceClass: typeof Sentence)
private readonly wordCount: number
private readonly sentences: Sentence[]
private readonly tagFrequencies: Record<string, Record<string, number> | undefined>
Expand All @@ -117,6 +118,15 @@ declare interface BrillPOSTaggedWord {
tag: string
}

/** Plain-object shape of a tagged sentence: just the list of its tagged words. */
declare interface BrillPOSTaggedSentence {
/** The words of the sentence, each a BrillPOSTaggedWord. */
taggedWords: BrillPOSTaggedWord[]
}

/**
 * Plain-object shape of a tagged corpus. The Corpus constructor accepts this
 * as its `data` argument as an alternative to a string.
 */
declare interface TaggedCorpus {
/** Word count of the corpus — presumably the total over all sentences; TODO confirm. */
wordCount: number
/** The tagged sentences that make up the corpus. */
sentences: BrillPOSTaggedSentence[]
}

export class Sentence {
constructor (data?: string[])
taggedWords: BrillPOSTaggedWord[]
Expand All @@ -134,9 +144,6 @@ export class BrillPOSTagger {
}

export class BrillPOSTester {
constructor (lexicon: Lexicon, ruleSet: RuleSet)
private readonly lexicon: Lexicon
private readonly ruleSet: RuleSet
test (corpus: Corpus, tagger: BrillPOSTagger): [number, number]
}

Expand All @@ -161,7 +168,7 @@ export class BrillPOSTrainer {
private scanForPositiveRules (): void
private scanForSites (): void
private neighbourhood (i: number, j: number): Array<[number, number]>
train (corpus: Corpus, templates: RuleTemplates, lexicon: Lexicon): RuleSet
train (corpus: Corpus, templates: RuleTemplate[], lexicon: Lexicon): RuleSet
printRulesWithScores (): string
}

Expand Down
2 changes: 1 addition & 1 deletion lib/natural/brill_pos_tagger/index.js
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ exports.BrillPOSTrainer = require('./lib/Brill_POS_Trainer')
exports.BrillPOSTester = require('./lib/Brill_POS_Tester')
exports.Lexicon = require('./lib/Lexicon')
exports.RuleSet = require('./lib/RuleSet')
exports.RuleTemplates = require('./lib/RuleTemplates')
exports.ruleTemplates = require('./lib/RuleTemplates')
exports.RuleTemplate = require('./lib/RuleTemplate')
exports.Corpus = require('./lib/Corpus')
exports.Sentence = require('./lib/Sentence')
36 changes: 36 additions & 0 deletions ts_spec/aggressive_tokenizer_de_spec.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
/*
Copyright (c) 2023, Hugo W.L. ter Doest
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/

'use strict'

import { AggressiveTokenizerDe } from 'lib/natural'
const tokenizer = new AggressiveTokenizerDe()

// Suite for the German aggressive tokenizer. The suite name carries the `_de`
// language suffix so that it is distinguishable in test reports from the specs
// of the other language-specific tokenizers.
describe('aggressive_tokenizer_de', function () {
  // Umlauts (ä, ö, ü) and Eszett (ß) must stay inside their tokens, while
  // commas and sentence-final periods are dropped.
  it('should tokenize strings with diacritics ä, ö and ü, and eszett ß', function () {
    expect(tokenizer.tokenize('Es werden nur Maßnahmen gefördert, die nicht aufgrund einer Rechtsvorschrift umgesetzt werden müssen.')).toEqual(
      ['Es', 'werden', 'nur', 'Maßnahmen', 'gefördert', 'die', 'nicht', 'aufgrund', 'einer',
        'Rechtsvorschrift', 'umgesetzt', 'werden', 'müssen'])
    expect(tokenizer.tokenize('Anträge sind vor Beginn der jeweiligen Maßnahme zu stellen.')).toEqual(
      ['Anträge', 'sind', 'vor', 'Beginn', 'der', 'jeweiligen', 'Maßnahme', 'zu', 'stellen'])
  })
})
39 changes: 39 additions & 0 deletions ts_spec/aggressive_tokenizer_es_spec.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
/*
Copyright (c) 2011, Chris Umbel
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/

'use strict'

import { AggressiveTokenizerEs } from 'lib/natural'
const tokenizer = new AggressiveTokenizerEs()

// Suite for the Spanish aggressive tokenizer.
describe('aggressive_tokenizer_es', () => {
  // Words containing accented vowels (é, í) must come back as single tokens.
  it('should tokenize strings', () => {
    const sentence = 'hola yo me llamo eduardo y esudié ingeniería'
    const expectedTokens = ['hola', 'yo', 'me', 'llamo', 'eduardo', 'y', 'esudié', 'ingeniería']

    expect(tokenizer.tokenize(sentence)).toEqual(expectedTokens)
  })

  /*
  NOTE(review): disabled test kept for reference. It calls tokenizer.attach()
  and then invokes tokenize() directly on a string.

  it('should tokenize strings via attached string method', function() {
    tokenizer.attach();
    expect('hola yo me llamo eduardo y esudié ingeniería'.tokenize()).toEqual(['hola', 'yo', 'me', 'llamo', 'eduardo', 'y', 'esudié', 'ingeniería']);
  });
  */
})
111 changes: 111 additions & 0 deletions ts_spec/aggressive_tokenizer_fr_spec.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,111 @@
/*
Copyright (c) 2011, Chris Umbel
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/

'use strict'

import { AggressiveTokenizerFr } from 'lib/natural'
const tokenizer = new AggressiveTokenizerFr()

// Sample French paragraph used as tokenizer input. It contains elisions
// (l', d'), guillemets (« »), accented capitals (À, Œ), numbers and
// sentence punctuation.
const text = "Affectueusement surnommé « Gabo » dans toute l'Amérique latine, le Colombien Gabriel Garcia Marquez, prix Nobel de littérature 1982, l'un des plus grands écrivains du XXe siècle, est mort À son domicile de Mexico jeudi 17 avril. Il était âgé de 87 ans. Son Œuvre a été traduite dans toutes les langues ou presque, et vendue à quelque 50 millions d'exemplaires."

// Expected tokens for the sample French paragraph, grouped by clause.
// Elided articles appear as separate tokens ('l', 'd'); punctuation and
// guillemets are absent.
const tokenized = [
  // "Affectueusement surnommé « Gabo » dans toute l'Amérique latine,"
  'Affectueusement', 'surnommé', 'Gabo', 'dans', 'toute', 'l', 'Amérique', 'latine',
  // "le Colombien Gabriel Garcia Marquez,"
  'le', 'Colombien', 'Gabriel', 'Garcia', 'Marquez',
  // "prix Nobel de littérature 1982,"
  'prix', 'Nobel', 'de', 'littérature', '1982',
  // "l'un des plus grands écrivains du XXe siècle,"
  'l', 'un', 'des', 'plus', 'grands', 'écrivains', 'du', 'XXe', 'siècle',
  // "est mort À son domicile de Mexico jeudi 17 avril."
  'est', 'mort', 'À', 'son', 'domicile', 'de', 'Mexico', 'jeudi', '17', 'avril',
  // "Il était âgé de 87 ans."
  'Il', 'était', 'âgé', 'de', '87', 'ans',
  // "Son Œuvre a été traduite dans toutes les langues ou presque,"
  'Son', 'Œuvre', 'a', 'été', 'traduite', 'dans', 'toutes', 'les', 'langues', 'ou', 'presque',
  // "et vendue à quelque 50 millions d'exemplaires."
  'et', 'vendue', 'à', 'quelque', '50', 'millions', 'd', 'exemplaires'
]

// Suite for the French aggressive tokenizer.
describe('aggressive_tokenizer_fr', () => {
  // The sample paragraph must be split into exactly the expected token list.
  it('should tokenize strings', () => {
    const actualTokens = tokenizer.tokenize(text)
    expect(actualTokens).toEqual(tokenized)
  })

  // Hyphenated compounds (sous-pages, sous-bois) must be kept as single tokens.
  it('should handle hyphens in words correctly', () => {
    const expectedTokens = ['Des', 'sous-pages', 'dans', 'le', 'sous-bois', 'de', 'la', 'ville', 'de', 'Paris']
    const actualTokens = tokenizer.tokenize('Des sous-pages dans le sous-bois de la ville de Paris')
    expect(actualTokens).toEqual(expectedTokens)
  })

  /*
  NOTE(review): disabled test kept for reference. It calls tokenizer.attach()
  and then invokes tokenize() directly on a string.

  it('should tokenize strings via attached string method', function() {
    tokenizer.attach();
    expect(text.tokenize()).toEqual(tokenized);
  });
  */
})
70 changes: 70 additions & 0 deletions ts_spec/aggressive_tokenizer_hi_spec.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,70 @@
/*
file aggressive_tokenizer_hi_spec.js , located at spec\aggressive_tokenizer_hi_spec.js is licensed as follows:
- (The MIT License)
- Copyright (c) 2023 Mukesh Singh Bisht
Permission is hereby granted, free of charge, to any person or entity obtaining a copy
of file aggressive_tokenizer_hi_spec.js and its content(the "Software"), to deal in the
Software without restriction, including without limitation the rights to use, copy,
modify, merge, publish, distribute, sublicense, and/or sell copies of the Software,
and to permit persons to whom the Software is furnished to do so, subject to the following
conditions:
1. The above copyright notice and this permission notice shall be included in all copies
or substantial portions of the Software.
2. Proper credit must be given to the original author Mukesh Singh Bisht, along with the
date of authorship specified as July 23, 2023, in any usage, distribution, or
modification of the Software.
THIS SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED,
INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR
PURPOSE, AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE
FOR ANY CLAIM, DAMAGES, OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT, OR
OTHERWISE, ARISING FROM, OUT OF, OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
DEALINGS IN THE SOFTWARE.
*/

'use strict'

import { AggressiveTokenizerHi } from 'lib/natural'
const tokenizer = new AggressiveTokenizerHi()

// Suite for the Hindi aggressive tokenizer, written as a table of cases:
// every row registers one spec that tokenizes `input` and compares the result
// with `expected`. Rows are registered in the order listed.
describe('aggressive_tokenizer_hi', () => {
  const cases = [
    {
      title: 'should tokenize strings',
      input: 'स्वतंत्रता दिवस की हार्दिक शुभकामनाएं',
      expected: ['स्वतंत्रता', 'दिवस', 'की', 'हार्दिक', 'शुभकामनाएं']
    },
    {
      title: 'should tokenize strings including english words',
      input: 'स्वतंत्रता दिवस की हार्दिक शुभकामनाएं congrats mukesh',
      expected: ['स्वतंत्रता', 'दिवस', 'की', 'हार्दिक', 'शुभकामनाएं', 'congrats', 'mukesh']
    },
    {
      title: 'should swallow viram(stop) symbols',
      input: 'स्वतंत्रता दिवस की हार्दिक शुभकामनाएं । congrats mukesh',
      expected: ['स्वतंत्रता', 'दिवस', 'की', 'हार्दिक', 'शुभकामनाएं', 'congrats', 'mukesh']
    },
    {
      title: 'should swallow dirgh viram(stop) symbols',
      input: 'राजा बाजीराव ॥ ',
      expected: ['राजा', 'बाजीराव']
    },
    {
      title: 'should swallow trailing off symbols',
      input: 'राजा बाजीराव ...',
      expected: ['राजा', 'बाजीराव']
    },
    {
      title: 'should swallow question mark symbols',
      input: 'राजा बाजीराव ?',
      expected: ['राजा', 'बाजीराव']
    },
    {
      title: 'should swallow comma mark symbols',
      input: 'राजा, बाजीराव',
      expected: ['राजा', 'बाजीराव']
    }
  ]

  for (const { title, input, expected } of cases) {
    it(title, () => {
      expect(tokenizer.tokenize(input)).toEqual(expected)
    })
  }
})
39 changes: 39 additions & 0 deletions ts_spec/aggressive_tokenizer_nl_spec.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
/*
Copyright (c) 2011, Chris Umbel, Martijn de Boer
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/

'use strict'

import { AggressiveTokenizerNl } from 'lib/natural'
const tokenizer = new AggressiveTokenizerNl()

// Suite for the Dutch aggressive tokenizer.
describe('aggressive_tokenizer_nl', () => {
  // Apostrophes that belong to a word ('s, radio's) stay attached to it,
  // while the comma in the sentence is dropped.
  it('should tokenize strings', () => {
    const sentence = '\'s Morgens is het nog erg koud, vertelde de weerman over een van de radio\'s'
    const expectedTokens = ['\'s', 'Morgens', 'is', 'het', 'nog', 'erg', 'koud', 'vertelde', 'de', 'weerman', 'over', 'een', 'van', 'de', 'radio\'s']

    expect(tokenizer.tokenize(sentence)).toEqual(expectedTokens)
  })

  // Hyphenated compounds, including multi-hyphen ones, must remain single tokens.
  it('should handle hyphens in words correctly', () => {
    const compounds = 'clearing-systeem front-office-automatisering christelijk-historisch mond-op-mond, kant-en-klaar, kruidje-roer-me-niet, doe-het-zelver'
    const expectedTokens = ['clearing-systeem', 'front-office-automatisering', 'christelijk-historisch', 'mond-op-mond', 'kant-en-klaar', 'kruidje-roer-me-niet', 'doe-het-zelver']

    const actualTokens = tokenizer.tokenize(compounds)
    expect(actualTokens).toEqual(expectedTokens)
  })
})
Loading

0 comments on commit c055e8f

Please sign in to comment.