diff --git a/lib/natural/tokenizers/sentence_tokenizer.js b/lib/natural/tokenizers/sentence_tokenizer.js
index 1aaa7773f..80ec831c6 100644
--- a/lib/natural/tokenizers/sentence_tokenizer.js
+++ b/lib/natural/tokenizers/sentence_tokenizer.js
@@ -34,7 +34,8 @@
 util.inherits(SentenceTokenizer, Tokenizer)
 
 SentenceTokenizer.prototype.tokenize = function (text) {
   // break string up in to sentences based on punctation and quotation marks
-  let tokens = text.match(/(?<=\s+|^)["'‘“'"[({⟨]?(.*?[.?!…]|[^.?!…]+)(\s[.?!…])*["'’”'"\])}⟩]?(?=\s+|$)/g)
+  // let tokens = text.match(/(?<=\s+|^)["'‘“'"[({⟨]?.*?[.?!…](\s[.?!…])*["'’”'"\])}⟩]?(?=\s+|$)/g)
+  let tokens = text.match(/(?<=\s+|^)["'‘“'"[({⟨]?(.*?[.?!…]|.+)(\s[.?!…])*["'’”'"\])}⟩]?(?=\s+|$)/g)
 
   DEBUG && console.log('SentenceTokenizer.tokenize: ' + tokens)
diff --git a/spec/sentence_tokenizer_spec.js b/spec/sentence_tokenizer_spec.js
index 330743f1b..1508c84f7 100644
--- a/spec/sentence_tokenizer_spec.js
+++ b/spec/sentence_tokenizer_spec.js
@@ -159,13 +159,15 @@ describe('sentence_tokenizer', function () {
     ])
   })
 
-  it('should handle text with the ellipsis symbol … and it should handle last sentence without punctuation (issue #648)', function () {
+  it('should handle text with the ellipsis symbol … (issue #648)', function () {
     expect(
       tokenizer.tokenize('We’re heading for a catastrophic global temperature rise… Fires are blazing from the Amazon to the Arctic.')
     ).toEqual([
       'We’re heading for a catastrophic global temperature rise…',
       'Fires are blazing from the Amazon to the Arctic.'
     ])
+  })
+  it('should handle last sentence without punctuation (issue #648)', function () {
     expect(
       tokenizer.tokenize('We’re heading for a catastrophic global temperature rise. Fires are blazing from the Amazon to the Arctic')
     ).toEqual([
@@ -173,4 +175,15 @@ describe('sentence_tokenizer', function () {
       'Fires are blazing from the Amazon to the Arctic'
     ])
   })
+  it('should handle the example from issue #689 correctly', function () {
+    const testInput = `
+      This is some test content.
+
+      We're trying to figure out variations in versions of the package.
+    `.trim()
+    expect(tokenizer.tokenize(testInput)).toEqual([
+      'This is some test content.',
+      'We\'re trying to figure out variations in versions of the package.'
+    ])
+  })
 })