glossary is a JavaScript module that extracts keywords from text (aka "term extraction" or "auto tagging"). It takes a string of text and returns an array of terms that are relevant to the content:
var glossary = require("glossary");
var keywords = glossary.extract("Her cake shop is the best in the business");
console.log(keywords) // ["cake", "shop", "cake shop", "business"]
glossary
is standalone and uses part-of-speech analysis to extract the relevant terms.
Use blacklist
to remove unwanted terms from any extraction:
var glossary = require("glossary")({
blacklist: ["library", "script", "api", "function"]
});
var keywords = glossary.extract("JavaScript color conversion library");
console.log(keywords); // ["color", "conversion"]
Use regExs
to remove anything that matches the given regular expressions (for example @mentions and #hashtags) from any extraction:
var glossary = require("glossary")({
regExs: [/(^|\s)@(\w+)/g , /(^|\s)#(\w+)/g]
});
var keywords = glossary.extract("#JavaScript color conversion @library");
console.log(keywords); // ["color", "conversion"]
Use minFreq
to limit the terms to only those that occur with a certain frequency:
var glossary = require("glossary")({ minFreq: 2 });
var keywords = glossary.extract("Kasey's pears are the best pears in Canada");
console.log(keywords); // ["pears"]
Use collapse
to remove terms that are sub-terms of other terms:
var glossary = require("glossary")({ collapse: true });
var keywords = glossary.extract("The Middle East crisis is getting worse");
console.log(keywords); // ["Middle East crisis"]
Use verbose
to also get the count of each term:
var glossary = require("./glossary")({ verbose: true });
var keywords = glossary.extract("The pears from the farm are good");
console.log(keywords); // [ { word: 'pears', count: 1 }, { word: 'farm', count: 1 } ]
You can also pass options as a second argument to the extract function:
var glossary = require("./glossary");
var keywords = glossary.extract("The pears from the farm are good",{ verbose: true });
console.log(keywords); // [ { word: 'pears', count: 1 }, { word: 'farm', count: 1 } ]
This is just a small extension: it adds language detection via node-language-detect and filters out some stopwords (from ranks.nl) to improve results for non-English text.
var keywords = require("./keywords");
var terms = keywords.extract("Die letzten paar Auswertungen der Evaluationen finden statt und werden eingearbeitet");
console.log(terms); // { terms: [ 'Auswertungen','Evaluationen','Die letzten paar Auswertungen der Evaluationen finden statt', 'eingearbeitet' ], language: 'german' }
//instead of [ 'Die', 'letzten', 'paar', 'Auswertungen', 'der', 'Evaluationen', 'finden', 'statt', 'Die letzten paar Auswertungen der Evaluationen finden statt', 'werden', 'eingearbeitet' ]
glossary
uses jspos for POS tagging. It's inspired by the Python module topia.termextract.