Skip to content

Commit

Permalink
Fixes issue #689 (#705)
Browse files Browse the repository at this point in the history
* Fixed issue #689

* Fixed indentation

* Trailing whitespace
  • Loading branch information
Hugo-ter-Doest authored Nov 26, 2023
1 parent d41aca5 commit 1b830b1
Show file tree
Hide file tree
Showing 2 changed files with 16 additions and 2 deletions.
3 changes: 2 additions & 1 deletion lib/natural/tokenizers/sentence_tokenizer.js
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,8 @@ util.inherits(SentenceTokenizer, Tokenizer)

SentenceTokenizer.prototype.tokenize = function (text) {
// Break the string up into sentences based on punctuation and quotation marks
let tokens = text.match(/(?<=\s+|^)["'‘“'"[({⟨]?(.*?[.?!…]|[^.?!…]+)(\s[.?!…])*["'’”'"\])}⟩]?(?=\s+|$)/g)
// let tokens = text.match(/(?<=\s+|^)["'‘“'"[({⟨]?.*?[.?!…](\s[.?!…])*["'’”'"\])}⟩]?(?=\s+|$)/g)
let tokens = text.match(/(?<=\s+|^)["'‘“'"[({⟨]?(.*?[.?!…]|.+)(\s[.?!…])*["'’”'"\])}⟩]?(?=\s+|$)/g)

DEBUG && console.log('SentenceTokenizer.tokenize: ' + tokens)

Expand Down
15 changes: 14 additions & 1 deletion spec/sentence_tokenizer_spec.js
Original file line number Diff line number Diff line change
Expand Up @@ -159,18 +159,31 @@ describe('sentence_tokenizer', function () {
])
})

it('should handle text with the ellipsis symbol … and it should handle last sentence without punctuation (issue #648)', function () {
it('Should handle text with the ellipsis symbol … (issue #648)', function () {
expect(
tokenizer.tokenize('We’re heading for a catastrophic global temperature rise… Fires are blazing from the Amazon to the Arctic.')
).toEqual([
'We’re heading for a catastrophic global temperature rise…',
'Fires are blazing from the Amazon to the Arctic.'
])
})
// Issue #648: the final sentence must still be returned as a token even when
// it has no terminating punctuation mark.
it('It should handle last sentence without punctuation (issue #648)', function () {
  const input = 'We’re heading for a catastrophic global temperature rise. Fires are blazing from the Amazon to the Arctic'
  const expectedSentences = [
    'We’re heading for a catastrophic global temperature rise.',
    'Fires are blazing from the Amazon to the Arctic'
  ]

  expect(tokenizer.tokenize(input)).toEqual(expectedSentences)
})
// Regression test for issue #689: an input whose two sentences are separated
// by a line break must be split into exactly two sentence tokens.
it('It should handle the example from issue #689 correctly', function () {
// Multi-line template literal; trim() removes the leading and trailing
// newline so the input begins and ends with sentence text.
const testInput = `
This is some test content.
We're trying to figure out variations in versions of the package.
`.trim()
// Each sentence is expected as its own token, keeping its final period.
expect(tokenizer.tokenize(testInput)).toEqual([
'This is some test content.',
'We\'re trying to figure out variations in versions of the package.'
])
})
})

0 comments on commit 1b830b1

Please sign in to comment.