From 8524b24b73230e9d18d7194b9589a346052cefc3 Mon Sep 17 00:00:00 2001 From: anwala Date: Sun, 27 Feb 2022 14:42:09 -0500 Subject: [PATCH 01/11] implementing support for stdin raw text and url --- setup.py | 5 +-- sumgram/__init__.py | 2 +- sumgram/sumgram.py | 83 ++++++++++++++++++++++++--------------------- sumgram/util.py | 34 ++++++++++++++++++- 4 files changed, 82 insertions(+), 42 deletions(-) diff --git a/setup.py b/setup.py index e554d46..7eb8d25 100644 --- a/setup.py +++ b/setup.py @@ -23,8 +23,9 @@ ], install_requires=[ 'numpy', - 'requests==2.22.0', - 'sklearn==0.0' + 'requests', + 'sklearn', + 'NwalaTextUtils==0.0.5' ], entry_points={'console_scripts': ['sumgram = sumgram.sumgram:main']} ) diff --git a/sumgram/__init__.py b/sumgram/__init__.py index f12684f..1f356cc 100644 --- a/sumgram/__init__.py +++ b/sumgram/__init__.py @@ -1 +1 @@ -__version__ = '0.0.19' +__version__ = '1.0.0' diff --git a/sumgram/sumgram.py b/sumgram/sumgram.py index 4e971f9..6492f2a 100755 --- a/sumgram/sumgram.py +++ b/sumgram/sumgram.py @@ -1076,9 +1076,9 @@ def print_top_ngrams(n, top_ngrams, top_sumgram_count, params=None): if( params['base_ngram_ansi_color'] == '' ): - print( '{:^6} {:<{mw}} {:^6} {:<7} {:<30}'.format('rank', 'sumgram', tf_or_df, tf_or_df + '-Rate', 'Base ngram', mw=mw)) + print( '{:^6} {:^6} {:<7} {:<30} {:<{mw}}'.format('Rank', tf_or_df, tf_or_df + '-Rate', 'Base ngram', 'Sumgram', mw=mw)) else: - print( '{:^6} {:<{mw}} {:^6} {:<7} {:<30}'.format('rank', getColorTxt('sumgram', default_color), tf_or_df, tf_or_df + '-Rate', 'Base ngram', mw=mw)) + print( '{:^6} {:^6} {:<7} {:<30} {:<{mw}}'.format('Rank', tf_or_df, tf_or_df + '-Rate', 'Base ngram', getColorTxt('Sumgram', default_color), mw=mw)) for i in range(top_sumgram_count): @@ -1110,8 +1110,7 @@ def print_top_ngrams(n, top_ngrams, top_sumgram_count, params=None): elif( params['base_ngram_ansi_color'] != '' ): ngram_txt = getColorTxt(ngram_txt, default_color) - - print( "{:^6} {:<{mw}} {:^6} {:^7} 
{:<30}".format(i+1, ngram_txt, ngram['term_freq'], "{:.2f}".format(ngram['term_rate']), base_ngram, mw=mw)) + print( "{:^6} {:^6} {:^7} {:<30} {:<{mw}}".format(i+1, ngram['term_freq'], "{:.2f}".format(ngram['term_rate']), base_ngram, ngram_txt, mw=mw)) if( len(last_ngram) != 0 ): if( params['min_df'] != 1 ): @@ -1156,7 +1155,7 @@ def extract_top_ngrams(doc_lst, doc_dct_lst, n, params): bif_stopwords = bifurcate_stopwords( params['add_stopwords'] ) stopwords = getStopwordsSet() | bif_stopwords['unigrams'] min_df = params['min_df'] - #print('min_df', min_df, type(min_df)) + try: if( isinstance(min_df, str) ): if( min_df.find('.') == -1 ): @@ -1166,8 +1165,8 @@ def extract_top_ngrams(doc_lst, doc_dct_lst, n, params): except: min_df = 1 + params['min_df'] = min_df - count_vectorizer = CountVectorizer(stop_words=stopwords, token_pattern=params['token_pattern'], ngram_range=(n, n), binary=binary_tf_flag, min_df=min_df) logger.debug('\tfit transfrom - start') @@ -1183,7 +1182,14 @@ def extract_top_ngrams(doc_lst, doc_dct_lst, n, params): logger.debug('\tfit transfrom - end') #every entry in list top_ngrams is of type: (a, b), a: term, b: term position in TF matrix - top_ngrams = count_vectorizer.get_feature_names() + try: + top_ngrams = count_vectorizer.get_feature_names_out() + except AttributeError: + top_ngrams = count_vectorizer.get_feature_names() + except: + genericErrorInfo() + return [] + filtered_top_ngrams = {} total_freq = 0 @@ -1238,24 +1244,6 @@ def extract_top_ngrams(doc_lst, doc_dct_lst, n, params): return filtered_top_ngrams -def get_user_stopwords(sep_stopwords, sep=','): - - if( isinstance(sep_stopwords, str) ): - - sep_stopwords = sep_stopwords.strip() - if( sep_stopwords == '' ): - return set() - - add_stopwords = sep_stopwords.split(sep) - add_stopwords = set( [s.strip().lower() for s in add_stopwords] ) - return add_stopwords - - elif( isinstance(sep_stopwords, list) ): - #assumes user has already separated the stopwords - return set(sep_stopwords) - 
else: - return set() - def update_doc_indx(report, doc_id_new_doc_indx_map): if( len(doc_id_new_doc_indx_map) == 0 ): @@ -1283,6 +1271,13 @@ def update_doc_indx(report, doc_id_new_doc_indx_map): if( doc_id in doc_id_new_doc_indx_map ): report['top_sumgrams'][i][opt][j]['doc_indx'] = doc_id_new_doc_indx_map[doc_id] +def get_user_stopwords(add_stopwords): + + all_stopwords = [] + for s in add_stopwords: + all_stopwords += s['text'].split() + + return all_stopwords def get_top_sumgrams(doc_dct_lst, n=2, params=None): @@ -1302,10 +1297,8 @@ def get_top_sumgrams(doc_dct_lst, n=2, params=None): n = 1 params = get_default_args(params) - params.setdefault('stopwords_sep', ',') - params['state'] = {} - params['add_stopwords'] = get_user_stopwords( params['add_stopwords'], params['stopwords_sep'] ) + params['add_stopwords'] = set([ s.strip().lower() for s in params['add_stopwords'] if s.strip() != '' ]) params.setdefault('binary_tf_flag', True)#Multiple occurrence of term T in a document counts as 1, TF = total number of times term appears in collection nlp_addr = 'http://' + params['corenlp_host'] + ':' + params['corenlp_port'] @@ -1465,26 +1458,27 @@ def get_args(): parser.add_argument('path', nargs='+', help='Folder path containing input documents or path to single file or multiple files') parser.add_argument('-d', '--print-details', help='Print detailed output', action='store_true') - parser.add_argument('-n', '--base-ngram', help='The base n (integer) for generating top sumgrams, if n = 2, bigrams would be the base ngram', type=int, default=2) parser.add_argument('-m', '--max-ngram', help='The maximum length of sumgram generated', type=int, default=10) + parser.add_argument('-n', '--base-ngram', help='The base n (integer) for generating top sumgrams, if n = 2, bigrams would be the base ngram', type=int, default=2) parser.add_argument('-o', '--output', help='Output file') parser.add_argument('-s', '--sentences-rank-count', help='The count of top ranked sentences to 
generate', type=int, default=10) parser.add_argument('-t', '--top-sumgram-count', help='The count of top sumgrams to generate', type=int, default=10) - parser.add_argument('--add-stopwords', help='Comma-separated list of additional stopwords. To change delimiter use --stopwords-sep', default='') + parser.add_argument('--add-stopwords', nargs='+', help='Single or multiple additional stopwords', default=[]) + parser.add_argument('--boilerplate-rm-method', help='Method to apply for removing HTML boilerplate', choices=['boilerpy3.DefaultExtractor', 'boilerpy3.ArticleExtractor', 'boilerpy3.ArticleSentencesExtractor', 'boilerpy3.LargestContentExtractor', 'boilerpy3.CanolaExtractor', 'boilerpy3.KeepEverythingExtractor', 'boilerpy3.NumWordsRulesExtractor', 'nltk'], default='boilerpy3.ArticleExtractor') parser.add_argument('--collocations-pattern', help='User-defined regex rule to extract collocations for pos_glue_split_ngrams', default='') parser.add_argument('--corenlp-host', help='Stanford CoreNLP Server host (needed for decent sentence tokenizer)', default='localhost') parser.add_argument('--corenlp-port', help='Stanford CoreNLP Server port (needed for decent sentence tokenizer)', default='9000') parser.add_argument('--corenlp-max-sentence-words', help='Stanford CoreNLP maximum words per sentence', default=100) - parser.add_argument('--max-file-depth', help='When reading files recursively from directory stop at the specified path depth. 0 means no restriction', type=int, default=1) parser.add_argument('--include-postings', help='Include inverted index of term document mappings', action='store_true')#default is false except not included, in which case it's true + parser.add_argument('--max-file-depth', help='When reading files recursively from directory stop at the specified path depth. 
0 means no restriction', type=int, default=1) parser.add_argument('--log-file', help='Log output filename', default='') parser.add_argument('--log-format', help='Log print format, see: https://docs.python.org/3/howto/logging-cookbook.html', default='') parser.add_argument('--log-level', help='Log level', choices=['critical', 'error', 'warning', 'info', 'debug', 'notset'], default='info') - parser.add_argument('--mvg-window-min-proper-noun-rate', help='Mininum rate threshold (larger, stricter) to consider a multi-word proper noun a candidate to replace an ngram', type=float, default=0.5) parser.add_argument('--min-df', help='See min_df in https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html', default=0.01) + parser.add_argument('--mvg-window-min-proper-noun-rate', help='Mininum rate threshold (larger, stricter) to consider a multi-word proper noun a candidate to replace an ngram', type=float, default=0.5) parser.add_argument('--ngram-printing-mw', help='Mininum width for printing ngrams', type=int, default=50) parser.add_argument('--base-ngram-ansi-color', help='Highlight (color code format - XXm, e.g., 91m) base ngram when printing top ngrams, set to empty string to switch off color', default='91m') @@ -1502,7 +1496,6 @@ def get_args(): parser.add_argument('--sentence-pattern', help='For sentence ranking: Regex string that specifies tokens for sentence tokenization', default='[.?!][ \n]|\n+') parser.add_argument('--sentence-tokenizer', help='For sentence ranking: Method for segmenting sentences', choices=['ssplit', 'regex'], default='regex') parser.add_argument('--shift', help='Factor to shift top ngram calculation', type=int, default=0) - parser.add_argument('--stopwords-sep', help='Delimiter of stopwords list, comma is default', default=',') parser.add_argument('--token-pattern', help='Regex string that specifies tokens for document tokenization', default=r'(?u)\b[a-zA-Z\'\’-]+[a-zA-Z]+\b|\d+[.,]?\d*') 
parser.add_argument('--title', help='Text label to be used as a heading when printing top sumgrams', default='') parser.add_argument('--thread-count', help='Maximum number of threads to use for parallel operations like segmenting sentences', type=int, default=5) @@ -1628,22 +1621,37 @@ def main(): return parser = get_args() - args = parser.parse_args() params = vars(args) set_log_defaults(params) set_logger_dets( params['log_dets'] ) - - doc_lst = readTextFromFilesRecursive(args.path, addDetails=True, maxDepth=params['max_file_depth']) + + doc_lst = generic_txt_extrator(args.path, max_file_depth=params['max_file_depth'], boilerplate_rm_method=params['boilerplate_rm_method']) + params['add_stopwords'] = get_user_stopwords( generic_txt_extrator(params['add_stopwords']) ) + + ''' + To do: + readme re-arrange + issues + + Done: + URL, Raw text, file + Fixed: sklearn, even stopword, get_feature_names() + Removed --stopwords-sep, nargs for add_stopwords, stopwords file + test in docker env + boilerplate rm other extractors + ''' params['referrer'] = 'main' proc_req(doc_lst, params) + if __name__ == 'sumgram.sumgram': from sumgram.util import dumpJsonToFile from sumgram.util import getColorTxt from sumgram.util import getStopwordsSet from sumgram.util import genericErrorInfo + from sumgram.util import generic_txt_extrator from sumgram.util import isMatchInOrder from sumgram.util import nlpIsServerOn from sumgram.util import nlpSentenceAnnotate @@ -1651,7 +1659,6 @@ def main(): from sumgram.util import overlapFor2Sets from sumgram.util import parallelTask from sumgram.util import phraseTokenizer - from sumgram.util import readTextFromFilesRecursive from sumgram.util import rmStopwords from sumgram.util import sortDctByKey else: @@ -1659,6 +1666,7 @@ def main(): from util import getColorTxt from util import getStopwordsSet from util import genericErrorInfo + from util import generic_txt_extrator from util import isMatchInOrder from util import nlpIsServerOn from util import 
nlpSentenceAnnotate @@ -1666,7 +1674,6 @@ def main(): from util import overlapFor2Sets from util import parallelTask from util import phraseTokenizer - from util import readTextFromFilesRecursive from util import rmStopwords from util import sortDctByKey diff --git a/sumgram/util.py b/sumgram/util.py index 03e4c69..ce1c8b8 100644 --- a/sumgram/util.py +++ b/sumgram/util.py @@ -9,6 +9,7 @@ from subprocess import check_output, CalledProcessError from multiprocessing import Pool +from NwalaTextUtils.textutils import parallelGetTxtFrmURIs logger = logging.getLogger('sumGram.sumgram') @@ -117,7 +118,7 @@ def getStopwordsDict(): "enough": True, "etc": True, "etc.": True, - "even though": True, + "even": True, "ever": True, "every": True, "everyone": True, @@ -474,6 +475,37 @@ def readTextFromFilesRecursive(files, addDetails=True, curDepth=0, maxDepth=0): result += readTextFromFilesRecursive(secondLevelFiles, addDetails=addDetails, curDepth=curDepth+1, maxDepth=maxDepth) return result + +#max_file_depth means no restriction +def generic_txt_extrator(sources, max_file_depth=0, boilerplate_rm_method='boilerpy3.ArticleExtractor'): + + urls = [] + doc_lst = [] + for txt_src in sources: + + #txt_src can be a url, file, or raw text + txt_src = txt_src.strip() + if( txt_src == '' ): + continue + + if( txt_src.startswith('http://') or txt_src.startswith('https://') ): + urls.append(txt_src) + continue + + txt_file = readTextFromFilesRecursive([txt_src], addDetails=True, maxDepth=max_file_depth) + doc_lst += [{'text': txt_src}] if len(txt_file) == 0 else txt_file + + if( len(urls) != 0 ): + logger.info('\nDereferencing {} URL(s) - start'.format(len(urls))) + + plain_text = parallelGetTxtFrmURIs(urls, boilerplateRmMethod=boilerplate_rm_method) + for i in range(len(plain_text)): + plain_text[i]['text'] = plain_text[i]['title'] + ' ' + plain_text[i]['text'] + + doc_lst += plain_text + logger.info('Dereferencing {} URL(s) - end'.format(len(urls))) + + return doc_lst #nlp server - 
start def nlpIsServerOn(addr='http://localhost:9000'): From 2a875fb0e53579486f92762d5e2153dd852507a0 Mon Sep 17 00:00:00 2001 From: anwala Date: Sun, 27 Feb 2022 14:42:54 -0500 Subject: [PATCH 02/11] implementing support for stdin raw text and url --- sumgram/sumgram.py | 12 ------------ 1 file changed, 12 deletions(-) diff --git a/sumgram/sumgram.py b/sumgram/sumgram.py index 6492f2a..019b500 100755 --- a/sumgram/sumgram.py +++ b/sumgram/sumgram.py @@ -1630,18 +1630,6 @@ def main(): doc_lst = generic_txt_extrator(args.path, max_file_depth=params['max_file_depth'], boilerplate_rm_method=params['boilerplate_rm_method']) params['add_stopwords'] = get_user_stopwords( generic_txt_extrator(params['add_stopwords']) ) - ''' - To do: - readme re-arrange - issues - - Done: - URL, Raw text, file - Fixed: sklearn, even stopword, get_feature_names() - Removed --stopwords-sep, nargs for add_stopwords, stopwords file - test in docker env - boilerplate rm other extractors - ''' params['referrer'] = 'main' proc_req(doc_lst, params) From 51ae92dca44dfb8b1f5d02c24cf94dea81aa37ac Mon Sep 17 00:00:00 2001 From: anwala Date: Sun, 27 Feb 2022 19:04:55 -0500 Subject: [PATCH 03/11] implementing support for reading from stdin --- sumgram/sumgram.py | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/sumgram/sumgram.py b/sumgram/sumgram.py index 019b500..6e70c17 100755 --- a/sumgram/sumgram.py +++ b/sumgram/sumgram.py @@ -1455,7 +1455,7 @@ def get_top_sumgrams(doc_dct_lst, n=2, params=None): def get_args(): parser = argparse.ArgumentParser(formatter_class=lambda prog: argparse.HelpFormatter(prog, max_help_position=30)) - parser.add_argument('path', nargs='+', help='Folder path containing input documents or path to single file or multiple files') + parser.add_argument('path', nargs='*', help='File(s) or path to file(s) or URL(s)') parser.add_argument('-d', '--print-details', help='Print detailed output', action='store_true') parser.add_argument('-m', 
'--max-ngram', help='The maximum length of sumgram generated', type=int, default=10) @@ -1624,12 +1624,21 @@ def main(): args = parser.parse_args() params = vars(args) + doc_lst = [] set_log_defaults(params) set_logger_dets( params['log_dets'] ) - doc_lst = generic_txt_extrator(args.path, max_file_depth=params['max_file_depth'], boilerplate_rm_method=params['boilerplate_rm_method']) - params['add_stopwords'] = get_user_stopwords( generic_txt_extrator(params['add_stopwords']) ) + if( len(args.path) == 0 ): + try: + fileobj = sys.stdin + with fileobj: + doc_lst = [{'text': fileobj.read()}] + except: + genericErrorInfo() + else: + doc_lst = generic_txt_extrator(args.path, max_file_depth=params['max_file_depth'], boilerplate_rm_method=params['boilerplate_rm_method']) + params['add_stopwords'] = get_user_stopwords( generic_txt_extrator(params['add_stopwords']) ) params['referrer'] = 'main' proc_req(doc_lst, params) From 0bfc38250bde622874facfd8df87f3253d1874cb Mon Sep 17 00:00:00 2001 From: anwala Date: Sun, 27 Feb 2022 19:14:38 -0500 Subject: [PATCH 04/11] implementing support for reading from stdin --- sumgram/sumgram.py | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/sumgram/sumgram.py b/sumgram/sumgram.py index 6e70c17..3e09bc3 100755 --- a/sumgram/sumgram.py +++ b/sumgram/sumgram.py @@ -1613,12 +1613,10 @@ def set_log_defaults(params): def main(): - if( len(sys.argv) > 1 ): - if( sys.argv[1] == '-v' or sys.argv[1] == '--version' ): - - from sumgram import __version__ - print(__version__) - return + if( len(sys.argv) > 1 and (sys.argv[1] == '-v' or sys.argv[1] == '--version') ): + from sumgram import __version__ + print(__version__) + return parser = get_args() args = parser.parse_args() @@ -1628,7 +1626,7 @@ def main(): set_log_defaults(params) set_logger_dets( params['log_dets'] ) - if( len(args.path) == 0 ): + if( len(sys.argv) > 1 and (sys.argv[1] == '-') ): try: fileobj = sys.stdin with fileobj: From 
496a56a838352171d427f5e709a3dd0808bcc60e Mon Sep 17 00:00:00 2001 From: anwala Date: Sun, 27 Feb 2022 20:34:06 -0500 Subject: [PATCH 05/11] fixing tests and stdin/help conflict --- sumgram/sumgram.py | 8 ++++++-- sumgram/util.py | 3 +-- tests/unit/test_sumgram.py | 2 +- 3 files changed, 8 insertions(+), 5 deletions(-) diff --git a/sumgram/sumgram.py b/sumgram/sumgram.py index 3e09bc3..794c70a 100755 --- a/sumgram/sumgram.py +++ b/sumgram/sumgram.py @@ -1621,16 +1621,20 @@ def main(): parser = get_args() args = parser.parse_args() params = vars(args) + + if( len(sys.argv) == 1 ): + parser.print_help() + return doc_lst = [] set_log_defaults(params) set_logger_dets( params['log_dets'] ) - if( len(sys.argv) > 1 and (sys.argv[1] == '-') ): + if( len(sys.argv) > 1 and (sys.argv[-1] == '-') ): try: fileobj = sys.stdin with fileobj: - doc_lst = [{'text': fileobj.read()}] + doc_lst = [{'text': fileobj.read()}] except: genericErrorInfo() else: diff --git a/sumgram/util.py b/sumgram/util.py index ce1c8b8..e771659 100644 --- a/sumgram/util.py +++ b/sumgram/util.py @@ -492,8 +492,7 @@ def generic_txt_extrator(sources, max_file_depth=0, boilerplate_rm_method='boile urls.append(txt_src) continue - txt_file = readTextFromFilesRecursive([txt_src], addDetails=True, maxDepth=max_file_depth) - doc_lst += [{'text': txt_src}] if len(txt_file) == 0 else txt_file + doc_lst = readTextFromFilesRecursive([txt_src], addDetails=True, maxDepth=max_file_depth) if( len(urls) != 0 ): logger.info('\nDereferencing {} URL(s) - start'.format(len(urls))) diff --git a/tests/unit/test_sumgram.py b/tests/unit/test_sumgram.py index 42ae7ae..55ded5b 100644 --- a/tests/unit/test_sumgram.py +++ b/tests/unit/test_sumgram.py @@ -29,7 +29,7 @@ def test_multiple_opts(self): ] params = { 'top_sumgram_count': 10, - 'add_stopwords': 'image', + 'add_stopwords': ['image'], 'no_rank_docs': True, 'no_rank_sentences': True, 'title': 'Top sumgrams for Hurricane Harvey text collection' From 
b39a83adf4f334f228e816afbad6f48b5b2be6ac Mon Sep 17 00:00:00 2001 From: anwala Date: Sat, 12 Mar 2022 11:58:00 -0500 Subject: [PATCH 06/11] patching text extraction and improving reading from stdin --- sumgram/sumgram.py | 14 +++++++++----- sumgram/util.py | 4 ++-- 2 files changed, 11 insertions(+), 7 deletions(-) diff --git a/sumgram/sumgram.py b/sumgram/sumgram.py index 794c70a..f09f556 100755 --- a/sumgram/sumgram.py +++ b/sumgram/sumgram.py @@ -1053,6 +1053,7 @@ def print_top_ngrams(n, top_ngrams, top_sumgram_count, params=None): params.setdefault('ngram_printing_mw', 50) params.setdefault('title', '') + doc_len = params.get('doc_len', None) default_color = '49m' tf_or_df = '' @@ -1064,7 +1065,7 @@ def print_top_ngrams(n, top_ngrams, top_sumgram_count, params=None): mw = params['ngram_printing_mw'] ngram_count = len(top_ngrams) - print('\nSummary for ' + str(ngram_count) + ' top sumgrams (base n: ' + str(n) + '): ') + print('\nSummary for {} top sumgrams (base n: {}, docs: {:,}):'.format(ngram_count, n, doc_len)) if( params['title'] != '' ): print( params['title']) @@ -1298,6 +1299,7 @@ def get_top_sumgrams(doc_dct_lst, n=2, params=None): params = get_default_args(params) params['state'] = {} + params['doc_len'] = len(doc_dct_lst) params['add_stopwords'] = set([ s.strip().lower() for s in params['add_stopwords'] if s.strip() != '' ]) params.setdefault('binary_tf_flag', True)#Multiple occurrence of term T in a document counts as 1, TF = total number of times term appears in collection nlp_addr = 'http://' + params['corenlp_host'] + ':' + params['corenlp_port'] @@ -1351,8 +1353,10 @@ def get_top_sumgrams(doc_dct_lst, n=2, params=None): if( 'sentences' in doc_dct_lst[i] ): del doc_dct_lst[i]['sentences'] - #main algorithm step 1 - end + multi_word_proper_nouns = rank_proper_nouns(multi_word_proper_nouns) + #main algorithm step 1 - end + logger.debug('\tsentence segmentation - end') logger.debug('\tshift: ' + str(params['shift'])) @@ -1632,11 +1636,11 @@ def 
main(): if( len(sys.argv) > 1 and (sys.argv[-1] == '-') ): try: - fileobj = sys.stdin - with fileobj: - doc_lst = [{'text': fileobj.read()}] + doc_lst = [{'text': line} for line in sys.stdin] except: genericErrorInfo() + + params['add_stopwords'] = params['add_stopwords'][:-1] if (len(params['add_stopwords']) != 0 and params['add_stopwords'][-1].strip() == '-') else params['add_stopwords'] else: doc_lst = generic_txt_extrator(args.path, max_file_depth=params['max_file_depth'], boilerplate_rm_method=params['boilerplate_rm_method']) diff --git a/sumgram/util.py b/sumgram/util.py index e771659..48d8c87 100644 --- a/sumgram/util.py +++ b/sumgram/util.py @@ -441,7 +441,6 @@ def readTextFromFilesRecursive(files, addDetails=True, curDepth=0, maxDepth=0): for f in files: f = f.strip() - if( f.endswith('.tar') or f.endswith('.tar.gz') ): result += readTextFromTar(f, addDetails=addDetails) @@ -492,7 +491,8 @@ def generic_txt_extrator(sources, max_file_depth=0, boilerplate_rm_method='boile urls.append(txt_src) continue - doc_lst = readTextFromFilesRecursive([txt_src], addDetails=True, maxDepth=max_file_depth) + txt_file = readTextFromFilesRecursive([txt_src], addDetails=True, maxDepth=max_file_depth) + doc_lst += [{'text': txt_src}] if len(txt_file) == 0 else txt_file if( len(urls) != 0 ): logger.info('\nDereferencing {} URL(s) - start'.format(len(urls))) From 5d0af63b8e8992b00bd89cb5062111f484d9eedc Mon Sep 17 00:00:00 2001 From: anwala Date: Sun, 13 Mar 2022 13:52:28 -0400 Subject: [PATCH 07/11] patching separating stopwords added via file and cmd line with --add-stopword, & implemeniting --no-default-stopwords --- sumgram/sumgram.py | 28 +++++++++++++++++++++++----- 1 file changed, 23 insertions(+), 5 deletions(-) diff --git a/sumgram/sumgram.py b/sumgram/sumgram.py index f09f556..49a1750 100755 --- a/sumgram/sumgram.py +++ b/sumgram/sumgram.py @@ -952,6 +952,7 @@ def rm_empty_and_stopword_ngrams(top_ngrams, k, stopwords): if( match_flag is True ): continue + #check if 
top_ngrams[i]['ngram'] has stopword, if so skip - end final_top_ngrams.append( top_ngrams[i] ) @@ -1154,9 +1155,9 @@ def extract_top_ngrams(doc_lst, doc_dct_lst, n, params): binary_tf_flag = True bif_stopwords = bifurcate_stopwords( params['add_stopwords'] ) - stopwords = getStopwordsSet() | bif_stopwords['unigrams'] - min_df = params['min_df'] - + stopwords = bif_stopwords['unigrams'] | params['add_stopwords_file'] if params['no_default_stopwords'] is True else getStopwordsSet() | bif_stopwords['unigrams'] | params['add_stopwords_file'] + min_df = params['min_df'] + try: if( isinstance(min_df, str) ): if( min_df.find('.') == -1 ): @@ -1301,6 +1302,7 @@ def get_top_sumgrams(doc_dct_lst, n=2, params=None): params['state'] = {} params['doc_len'] = len(doc_dct_lst) params['add_stopwords'] = set([ s.strip().lower() for s in params['add_stopwords'] if s.strip() != '' ]) + params['add_stopwords_file'] = set([ s.strip().lower() for s in params['add_stopwords_file'] if s.strip() != '' ]) params.setdefault('binary_tf_flag', True)#Multiple occurrence of term T in a document counts as 1, TF = total number of times term appears in collection nlp_addr = 'http://' + params['corenlp_host'] + ':' + params['corenlp_port'] @@ -1425,6 +1427,7 @@ def get_top_sumgrams(doc_dct_lst, n=2, params=None): top_ngrams = rm_empty_and_stopword_ngrams( top_ngrams, params['top_sumgram_count'] * 2, params['add_stopwords'] ) + doc_id_new_doc_indx_map = {} if( params['no_rank_docs'] == False ): report['ranked_docs'], doc_id_new_doc_indx_map = get_ranked_docs( top_ngrams, doc_dct_lst ) @@ -1486,6 +1489,7 @@ def get_args(): parser.add_argument('--ngram-printing-mw', help='Mininum width for printing ngrams', type=int, default=50) parser.add_argument('--base-ngram-ansi-color', help='Highlight (color code format - XXm, e.g., 91m) base ngram when printing top ngrams, set to empty string to switch off color', default='91m') + parser.add_argument('--no-default-stopwords', help='Do not use default English 
stopwords list (default is False)', action='store_true') parser.add_argument('--no-mvg-window-glue-split-ngrams', help='Do not glue split top ngrams with Moving Window method (default is False)', action='store_true') parser.add_argument('--no-parent-sentences', help='Do not include sentences that mention top ngrams in top ngrams payload (default is False)', action='store_true') parser.add_argument('--no-pos-glue-split-ngrams', help='Do not glue split top ngrams with POS method (default is False)', action='store_true') @@ -1644,10 +1648,24 @@ def main(): else: doc_lst = generic_txt_extrator(args.path, max_file_depth=params['max_file_depth'], boilerplate_rm_method=params['boilerplate_rm_method']) - params['add_stopwords'] = get_user_stopwords( generic_txt_extrator(params['add_stopwords']) ) + ''' + add_stopwords: + * unigrams in add_stopwords are used to complemented stopwords in getStopwordsSet() to build initial top n-ngram, see: CountVectorizer(stop_words=stopwords,...) + * n-ngrams in add_stopwords are used in removing sumgrams that have n-ngrams, see: rm_empty_and_stopword_ngrams() + ''' + add_stopwords = [] + params['add_stopwords_file'] = [] + for st in generic_txt_extrator(params['add_stopwords']): + + if( 'filename' in st ): + params['add_stopwords_file'].append(st) + else: + add_stopwords.append(st) + + params['add_stopwords'] = get_user_stopwords(add_stopwords) + params['add_stopwords_file'] = get_user_stopwords(params['add_stopwords_file']) params['referrer'] = 'main' proc_req(doc_lst, params) - if __name__ == 'sumgram.sumgram': from sumgram.util import dumpJsonToFile From a8f51b939f25e90761a7d6df107f1cacadadcc1d Mon Sep 17 00:00:00 2001 From: anwala Date: Mon, 14 Mar 2022 13:08:16 -0400 Subject: [PATCH 08/11] updating readme --- README.md | 180 +++++++++++++++++---------------------------- sumgram/sumgram.py | 2 +- 2 files changed, 68 insertions(+), 114 deletions(-) diff --git a/README.md b/README.md index 2898ed8..d0a6e1d 100644 --- a/README.md +++ 
b/README.md @@ -9,31 +9,6 @@ From Fig. 1, the six-gram `"centers for disease control and prevention"` was spl *Fig. 2: Comparison of top 20 (first column) bigrams, top 20 (second column) six-grams, and top 20 (third column) sumgrams (conjoined ngrams) generated by sumgram for a collection of documents about [Hurricane Harvey](https://en.wikipedia.org/wiki/Hurricane_Harvey). Proper nouns of more than two words (e.g., `"federal emergency management agency"`) are split when generating bigrams, sumgram strives to remedy this. Generating six-grams surfaces non-salient six-grams.* hurricane harvey ngrams vs sumgrams -## Counting Term Frequencies -It is important to note that because sumgram was designed to generate top ngrams (summary) in a collection of text documents, it uses document frequencies (if the collection has more than one document). For example, consider the document frequency of `"ebola virus"` in the following collection of three documents, -``` -Collection of 3 documents: - -Document 1: "ebola virus" occurs 50 times -Document 2: "ebola virus" occurs 15 times -Document 3: "ebola virus" occurs 5 times - -According to sumgram, the document frequency (DF) of "ebola virus" is 3 NOT 70 -``` -According to sumgram, the document frequency of the term `"ebola virus"` is 3 NOT 70! Since the goal is to summarize the collection, documents are given a single vote for a single term, so as not to favor any (e.g., long) document or any term (e.g., very popular term within a few documents). However, if the collection contains a single document, term frequencies are used: -``` -Collection of 1 document: -Document 1: "ebola virus" occurs 50 times -According to sumgram, the TF of "ebola virus" is 50 NOT 1 -``` -Irrespective of the case (single-document or multi-document collection) sumgram uses `term_freq` to count the frequency of terms. -## Additional Features -In addition to generating top sumgrams, sumgram ranks sentences and documents. 
-### Ranking documents (`--no-rank-docs` to switch off) -`get_ranked_docs()` ranks documents by giving credit to documents that have highly ranked terms in the ranked list of ngrams. A document's score is awarded by accumulating the points awarded by the position of terms in the ranked list of ngrams. Please note that documents without terms in ranked list of ngrams are not awarded points. Therefore, some documents may not be ranked because they performed poorly - did not have any term in the ranked list of ngrams. - -### Ranking sentences (`--no-rank-sentences` to switch off) -`rank_sents_frm_top_ranked_docs()` ranks sentences in the top ranked documents exclusively, and gives credit to sentences with a high average overlap between the sentence tokens and the tokens in the top ngrams. For all sentences in a top ranked documents, a sentence's score (average overlap) is measured by calculating the average overlap between the terms in the top ngrams and the given sentence. This accounts for how many different tokens in the top ngrams that are present in a sentence. ## Installation Just type @@ -54,51 +29,16 @@ $ cd ..; rm -rf sumgram; $ docker run --rm -it -v "$PWD":/data/ wsdl/sumgram ``` OR install/run from Dockerhub: coming soon -## Recommended Requirement and Performance Considerations -### Recommended Requirement -For the best results, we recommend [installing and running Stanford CoreNLP Server](https://ws-dl.blogspot.com/2018/03/2018-03-04-installing-stanford-corenlp.html) for two reasons. -First, the "pos" in [`pos_glue_split_ngrams`](#pos_glue_split_ngrams) stands for Parts Of Speech (POS). This algorithm needs a POS annotator in order to "glue" split ngrams, hence the need for Stanford CoreNLP server. However, if you do not install Stanford CoreNLP Server, sumgram is robust enough to attempt to glue split ngrams with the second algorithm [`mvg_window_glue_split_ngrams`](#mvg_window_glue_split_ngrams). 
- -Second, as part of ranking sentences, sumgram needs to segment the sentences in the documents. Stanford CoreNLP's [`ssplit`](https://stanfordnlp.github.io/CoreNLP/ssplit.html) annotator splits sentences after tokenization, and exploits the decisions of the tokenizer. Probabilitic methods (such as `ssplit`) for segmenting sentences often outperform rule-based methods that use regular expressions to define sentence boundaries. If you do not install Stanford CoreNLP however, sumgram will adopt a regular expression rule (`[.?!][ \n]|\n+`) to mark sentence boundaries. This rule can be passed (```--sentence-pattern``` - command line, ```sentence_pattern``` - python) as an argument to sumgram. - -### Performance Considerations - `ssplit` and Named Entity Recognition -`pos_glue_split_ngrams` imposes additional runtime overhead on sumgram. You may choose to force sumgram to avoid using the ssplit annotator (implicitly switching off `pos_glue_split_ngrams`) by setting `--sentence-tokenizer=regex` (Python: ```params['sentence_tokenizer'] = 'regex'```). Please note that the command line argument `--no-pos-glue-split-ngrams` does not switch off Stanford CoreNLP's ssplit, it merely avoids the use of the `pos_glue_split_ngrams`. - -We considered leveraging Stanford CoreNLP's [Named Entity Annotator](https://stanfordnlp.github.io/CoreNLP/ner.html) as a means to find additional multi-word proper nouns in order to conjoin split ngrams. With a Named Entity Recognition (NER) system, one could easily label a text collection with entity labels (e.g., `PERSON`, `LOCATION`, and `ORGANIZATION`), and instruct the ngram generator to avoid splitting ngrams that have those labels, as a means to remedy the split ngrams problem. However, we decided not to apply NER to resolve split ngrams because NER would impose additional performance overhead upon sumgram. It was important to keep sumgrams as lightweight as possible without compromising the quality of results. 
There are some phrases such as `"direct contact with"` and `"health care workers,"` that sumgram could generate unlike NER. However, NER unlike sumgrams provides the benefit of labeling ngrams (e.g, "CDC" - Organization) although with additional performance cost. Even though we recommend using sumgrams with the POS and `ssplit` annotators from the Stanford CoreNLP suite, and even though they impose additional overhead (while producing better conjoined ngrams and sentence segmentation), sumgrams does not require them to work, and empirical evaluation of sumgrams generated without POS or `ssplit` have been satisfactory. - -### Performance Considerations - size of output -By default `--no-parent-sentences` is switched off, this means that the sentences that mention the top sumgrams are included in the final dictionary output of sumgram (output of `get_top_sumgrams()`), thus increasing the size of the output. To avoid this, include the `--no-parent-sentences` option. - -### Performance Considerations - size of vocabulary (manipulating [`min_df`](https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html)) -See [sklearn.feature_extraction.text.CountVectorizer](https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html) for more details about `min_df`. - -Sumgram begins by counting the document frequencies (DF: number of documents that include a term) for each term (ngram) in the vocabulary. The size of the vocabulary significantly affects the runtime of sumgram. The larger the vocabulary, the longer the runtime, and vice versa. Fortunately, we can take advantage of the fact that sumgram favors ngrams with `DF > 1`, to remove terms that occur once or a "few" times. The challenge is that the definition of "few" is subjective. `min_df` defines what we consider as few: when building the vocabulary, terms with document frequencies less than `min_df` (`DF < min_df`) are removed. 
`min_df` influences the size of the vocabulary by eliminating terms with `DF < min_df`. See the following example about how `min_df` affects the size of the vocabulary. -``` -Document count: 20 -Previous vocabulary size: 9,252 -New vocabulary size: 9,252 -Last term DF rate: 0.0500 -Result: Vocabulary size does not shrink since last DF > min_df (0.05 > 0.01). Therefore, no term removed. - -Document count: 593 -Previous vocabulary size: 127,169 -New vocabulary size: 1,321 -Last term DF rate: 0.0101 -Result: Vocabulary size shrinks by ~98%, as a result 125,848 (with DF < 0.01) terms removed. -``` -These results suggest that if your collection consists of thousands of documents, you might need to increase the `min_df` (default is: 0.01) threshold. When sumgram is run from the command-line, the last line indicates the rank of the ngram with the least DF (last ngram). For example, the following line -``` -last ngram with min_df = 0.01 (index/DF/DF-Rate): release transcript (1321/6/0.0101) -``` -indicates that the last ngram ("release transcript" - 1,321st ngram) occurred in 6 documents (DF = 6, DF-Rate = 0.0101). So terms with `DF < 0.01` were discard from the vocabulary. Therefore, the user could increase `min_df` if a DF of 6 is still considered small. In contrast, the user could decrease `min_df` if a DF of 5 is considered big. ## Usage ### Basic usage: -* `$ sumgram path/to/collection/of/text/files/` +* `$ sumgram path/to/collection/of/text/files/`, e.g., sumgram [tests/unit/sample_cols/harvey](tests/unit/sample_cols/harvey) -* `$ sumgram single_file.txt` -eg. 
sumgram [tests/unit/sample_cols/harvey/single_file.txt](tests/unit/sample_cols/harvey/08803837d3fc3c13dd29d3181d7e9cb2.txt) -* `$ sumgram path/to/collection/ file2.txt file3.txt` +* `$ sumgram single_file.txt`, +e.g., sumgram [tests/unit/sample_cols/harvey/single_file.txt](tests/unit/sample_cols/harvey/08803837d3fc3c13dd29d3181d7e9cb2.txt) +* `$ sumgram https://www.example.com/news/article-1.html https://www.example.com/news/article-2.html` +* `$ sumgram path/to/collection/ file2.txt file3.txt https://www.example.com/news/article-1.html` +* `$ cat path/to/collection/of/text/files/*.txt | sumgram -` ### Python script usage: [Command line options](#full-usage) may be activated by setting the argument in the `params` dictionary passed as an argument to `get_top_sumgrams()`. To set a command line argument, consider the following transformation example: @@ -260,55 +200,69 @@ sumgram -t 20 -o harvey_sumgrams.json --pretty-print cols/harvey/ - **cur_pos_sequence** (array[string]): [Part of Speech labels](https://www.ling.upenn.edu/courses/Fall_2003/ling001/penn_treebank_pos.html) of `cur_ngram` - **params** (objects) -### Full usage +## Counting Term Frequencies +It is important to note that because sumgram was designed to generate top ngrams (summary) in a collection of text documents, it uses document frequencies (if the collection has more than one document). For example, consider the document frequency of `"ebola virus"` in the following collection of three documents, +``` +Collection of 3 documents: + +Document 1: "ebola virus" occurs 50 times +Document 2: "ebola virus" occurs 15 times +Document 3: "ebola virus" occurs 5 times + +According to sumgram, the document frequency (DF) of "ebola virus" is 3 NOT 70 +``` +According to sumgram, the document frequency of the term `"ebola virus"` is 3 NOT 70! 
Since the goal is to summarize the collection, documents are given a single vote for a single term, so as not to favor any (e.g., long) document or any term (e.g., very popular term within a few documents). However, if the collection contains a single document, term frequencies are used: ``` -sumgram [options] path/to/collection/of/text/files/ - -Options: --n=2 The base n (int) for generating top sumgrams, if n = 2, bigrams become the base ngram - --d, --print-details Print details --m, --max-ngram=10 The maximum length of sumgram generated --o, --output Output file --s, --sentences-rank-count=10 The count of top ranked sentences to generate --t, --top-sumgram-count=10 The count of top sumgrams to generate - ---add-stopwords Comma-separated list of addition stopwords ---base-ngram-ansi-color='91m' Highlight (color code format - XXm, e.g., 91m) base ngram when printing top ngrams, set to empty string to switch off color ---collocations-pattern User-defined regex rule to extract collocations for pos_glue_split_ngrams ---corenlp-host=localhost Stanford CoreNLP Server host (needed for decent sentence tokenizer) ---corenlp-port=9000 Stanford CoreNLP Server port (needed for decent sentence tokenizer) ---corenlp-max-sentence-words=100 Stanford CoreNLP maximum words per sentence ---include-postings=False Include inverted index of term document mappings - ---log-file Log output filename ---log-format Log print format, see: https://docs.python.org/3/howto/logging-cookbook.html ---log-level=info Log level from OPTIONS: {critical, error, warning, info, debug, notset} - ---max-file-depth When reading files recursively from directory stop at the specified path depth. 
0 means no restriction ---mvg-window-min-proper-noun-rate=0.5 Mininum rate threshold (larger, stricter) to consider a multi-word proper noun a candidate to replace an ngram ---min-df=0.01 See min_df in https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html ---ngram-printing-mw=50 Mininum width for printing ngrams - ---no-mvg-window-glue-split-ngrams=False Do not glue split top ngrams with Moving Window method (default is False) ---no-parent-sentences Do not include sentences that mention top ngrams in top ngrams payload (default is False) ---no-pos-glue-split-ngrams=False Do not glue split top ngrams with POS method (default is True) ---no-rank-docs=False Do not rank documents flag ---no-rank-sentences=False Do not rank sentences flag - ---parallel-readtext Read input files in parallel ---pos-glue-split-ngrams-coeff=0.5 Coeff. ([0, 1]) for permitting matched ngram replacement by pos_glue_split_ngrams(), bigger means stricter ---pretty-print=False Pretty print JSON output ---rm-subset-top-ngrams-coeff=0.5 Coeff. ([0, 1]) for permitting matched ngram replacement by rm_subset_top_ngrams(), bigger means stricter - ---sentence-pattern='[.?!][ \n]|\n+' For sentence ranking: Regex string that specifies tokens for sentence tokenization ---sentence-tokenizer=ssplit For sentence ranking: Method for segmenting sentences, options: {ssplit, regex} ---shift=0 Factor to shift top ngram calculation ---token-pattern Regex string that specifies tokens for document tokenization. 
Default = '\b[a-zA-Z\'\’-]+[a-zA-Z]+\b|\d+[.,]?\d*' ---title Text label to be used as a heading when printing top sumgrams ---thread-count=5 Maximum number of threads to use for parallel operations like segmenting sentences ---update-rate=50 Print 1 message per update-rate for long-running tasks +Collection of 1 document: +Document 1: "ebola virus" occurs 50 times +According to sumgram, the TF of "ebola virus" is 50 NOT 1 ``` +Irrespective of the case (single-document or multi-document collection) sumgram uses `term_freq` to count the frequency of terms. +## Additional Features +In addition to generating top sumgrams, sumgram ranks sentences and documents. +### Ranking documents (`--no-rank-docs` to switch off) +`get_ranked_docs()` ranks documents by giving credit to documents that have highly ranked terms in the ranked list of ngrams. A document's score is awarded by accumulating the points awarded by the position of terms in the ranked list of ngrams. Please note that documents without terms in ranked list of ngrams are not awarded points. Therefore, some documents may not be ranked because they performed poorly - did not have any term in the ranked list of ngrams. + +### Ranking sentences (`--no-rank-sentences` to switch off) +`rank_sents_frm_top_ranked_docs()` ranks sentences in the top ranked documents exclusively, and gives credit to sentences with a high average overlap between the sentence tokens and the tokens in the top ngrams. For all sentences in a top ranked documents, a sentence's score (average overlap) is measured by calculating the average overlap between the terms in the top ngrams and the given sentence. This accounts for how many different tokens in the top ngrams that are present in a sentence. 
+

## Recommended Requirement and Performance Considerations
### Recommended Requirement
For the best results, we recommend [installing and running Stanford CoreNLP Server](https://ws-dl.blogspot.com/2018/03/2018-03-04-installing-stanford-corenlp.html) for two reasons.
First, the "pos" in [`pos_glue_split_ngrams`](#pos_glue_split_ngrams) stands for Parts Of Speech (POS). This algorithm needs a POS annotator in order to "glue" split ngrams, hence the need for Stanford CoreNLP server. However, if you do not install Stanford CoreNLP Server, sumgram is robust enough to attempt to glue split ngrams with the second algorithm [`mvg_window_glue_split_ngrams`](#mvg_window_glue_split_ngrams).

Second, as part of ranking sentences, sumgram needs to segment the sentences in the documents. Stanford CoreNLP's [`ssplit`](https://stanfordnlp.github.io/CoreNLP/ssplit.html) annotator splits sentences after tokenization, and exploits the decisions of the tokenizer. Probabilistic methods (such as `ssplit`) for segmenting sentences often outperform rule-based methods that use regular expressions to define sentence boundaries. If you do not install Stanford CoreNLP however, sumgram will adopt a regular expression rule (`[.?!][ \n]|\n+`) to mark sentence boundaries. This rule can be passed (```--sentence-pattern``` - command line, ```sentence_pattern``` - python) as an argument to sumgram.

### Performance Considerations - `ssplit` and Named Entity Recognition
`pos_glue_split_ngrams` imposes additional runtime overhead on sumgram. You may choose to force sumgram to avoid using the ssplit annotator (implicitly switching off `pos_glue_split_ngrams`) by setting `--sentence-tokenizer=regex` (Python: ```params['sentence_tokenizer'] = 'regex'```). Please note that the command line argument `--no-pos-glue-split-ngrams` does not switch off Stanford CoreNLP's ssplit, it merely avoids the use of the `pos_glue_split_ngrams`. 
+

We considered leveraging Stanford CoreNLP's [Named Entity Annotator](https://stanfordnlp.github.io/CoreNLP/ner.html) as a means to find additional multi-word proper nouns in order to conjoin split ngrams. With a Named Entity Recognition (NER) system, one could easily label a text collection with entity labels (e.g., `PERSON`, `LOCATION`, and `ORGANIZATION`), and instruct the ngram generator to avoid splitting ngrams that have those labels, as a means to remedy the split ngrams problem. However, we decided not to apply NER to resolve split ngrams because NER would impose additional performance overhead upon sumgram. It was important to keep sumgrams as lightweight as possible without compromising the quality of results. There are some phrases such as `"direct contact with"` and `"health care workers,"` that sumgram could generate unlike NER. However, NER unlike sumgrams provides the benefit of labeling ngrams (e.g., "CDC" - Organization) although with additional performance cost. Even though we recommend using sumgrams with the POS and `ssplit` annotators from the Stanford CoreNLP suite, and even though they impose additional overhead (while producing better conjoined ngrams and sentence segmentation), sumgrams does not require them to work, and empirical evaluation of sumgrams generated without POS or `ssplit` has been satisfactory.

### Performance Considerations - size of output
By default `--no-parent-sentences` is switched off, this means that the sentences that mention the top sumgrams are included in the final dictionary output of sumgram (output of `get_top_sumgrams()`), thus increasing the size of the output. To avoid this, include the `--no-parent-sentences` option. 
+ +### Performance Considerations - size of vocabulary (manipulating [`min_df`](https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html)) +See [sklearn.feature_extraction.text.CountVectorizer](https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html) for more details about `min_df`. + +Sumgram begins by counting the document frequencies (DF: number of documents that include a term) for each term (ngram) in the vocabulary. The size of the vocabulary significantly affects the runtime of sumgram. The larger the vocabulary, the longer the runtime, and vice versa. Fortunately, we can take advantage of the fact that sumgram favors ngrams with `DF > 1`, to remove terms that occur once or a "few" times. The challenge is that the definition of "few" is subjective. `min_df` defines what we consider as few: when building the vocabulary, terms with document frequencies less than `min_df` (`DF < min_df`) are removed. `min_df` influences the size of the vocabulary by eliminating terms with `DF < min_df`. See the following example about how `min_df` affects the size of the vocabulary. +``` +Document count: 20 +Previous vocabulary size: 9,252 +New vocabulary size: 9,252 +Last term DF rate: 0.0500 +Result: Vocabulary size does not shrink since last DF > min_df (0.05 > 0.01). Therefore, no term removed. + +Document count: 593 +Previous vocabulary size: 127,169 +New vocabulary size: 1,321 +Last term DF rate: 0.0101 +Result: Vocabulary size shrinks by ~98%, as a result 125,848 (with DF < 0.01) terms removed. +``` +These results suggest that if your collection consists of thousands of documents, you might need to increase the `min_df` (default is: 0.01) threshold. When sumgram is run from the command-line, the last line indicates the rank of the ngram with the least DF (last ngram). 
For example, the following line +``` +last ngram with min_df = 0.01 (index/DF/DF-Rate): release transcript (1321/6/0.0101) +``` +indicates that the last ngram ("release transcript" - 1,321st ngram) occurred in 6 documents (DF = 6, DF-Rate = 0.0101). So terms with `DF < 0.01` were discard from the vocabulary. Therefore, the user could increase `min_df` if a DF of 6 is still considered small. In contrast, the user could decrease `min_df` if a DF of 5 is considered big. ### Algorithms for detecting and gluing split Multi-Word Proper Noun (MWPN) ngrams diff --git a/sumgram/sumgram.py b/sumgram/sumgram.py index 49a1750..6e5793b 100755 --- a/sumgram/sumgram.py +++ b/sumgram/sumgram.py @@ -1471,7 +1471,7 @@ def get_args(): parser.add_argument('-s', '--sentences-rank-count', help='The count of top ranked sentences to generate', type=int, default=10) parser.add_argument('-t', '--top-sumgram-count', help='The count of top sumgrams to generate', type=int, default=10) - parser.add_argument('--add-stopwords', nargs='+', help='Single or multiple additional stopwords', default=[]) + parser.add_argument('--add-stopwords', nargs='+', help='Single or multiple additional stopwords or path to stopwords file (one stopword per line)', default=[]) parser.add_argument('--boilerplate-rm-method', help='Method to apply for removing HTML boilerplate', choices=['boilerpy3.DefaultExtractor', 'boilerpy3.ArticleExtractor', 'boilerpy3.ArticleSentencesExtractor', 'boilerpy3.LargestContentExtractor', 'boilerpy3.CanolaExtractor', 'boilerpy3.KeepEverythingExtractor', 'boilerpy3.NumWordsRulesExtractor', 'nltk'], default='boilerpy3.ArticleExtractor') parser.add_argument('--collocations-pattern', help='User-defined regex rule to extract collocations for pos_glue_split_ngrams', default='') parser.add_argument('--corenlp-host', help='Stanford CoreNLP Server host (needed for decent sentence tokenizer)', default='localhost') From 1d7f60ef1b385704d9298cafc87780235026f6d3 Mon Sep 17 00:00:00 2001 From: anwala 
Date: Mon, 14 Mar 2022 13:30:17 -0400 Subject: [PATCH 09/11] patching key add_stopwords_file error when get_top_sumgrams() called from script --- sumgram/sumgram.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/sumgram/sumgram.py b/sumgram/sumgram.py index 6e5793b..07a2b94 100755 --- a/sumgram/sumgram.py +++ b/sumgram/sumgram.py @@ -70,6 +70,7 @@ def fmt_posting(doc_dets): def fmt_report(ngram_lst, params): params['add_stopwords'] = list( params['add_stopwords'] ) + params['add_stopwords_file'] = list( params['add_stopwords_file'] ) for i in range(len(ngram_lst)): @@ -1523,6 +1524,7 @@ def get_default_args(user_params): user_params[val.dest] = val.default del user_params['help'] + user_params['add_stopwords_file'] = [] return user_params From 321920676dbb3856295bedc48ecd90c5173ff1dc Mon Sep 17 00:00:00 2001 From: anwala Date: Mon, 14 Mar 2022 13:33:20 -0400 Subject: [PATCH 10/11] upgrading to v1.0.1 --- sumgram/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sumgram/__init__.py b/sumgram/__init__.py index 1f356cc..cd7ca49 100644 --- a/sumgram/__init__.py +++ b/sumgram/__init__.py @@ -1 +1 @@ -__version__ = '1.0.0' +__version__ = '1.0.1' From 714aa436f526d30ab412316ef699a7ac2207a69b Mon Sep 17 00:00:00 2001 From: anwala Date: Mon, 14 Mar 2022 13:57:01 -0400 Subject: [PATCH 11/11] modifyingg add_stopwords usage in python script (comma-delimited string to list) --- README.md | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index d0a6e1d..13857d8 100644 --- a/README.md +++ b/README.md @@ -60,12 +60,11 @@ doc_lst = [ ] ''' - Use 'add_stopwords' to include additional stopwords not included in stopwords list (https://github.com/oduwsdl/sumgram/blob/0224fc9d54034a25e296dd1c43c09c76244fc3c2/sumgram/util.py#L31) - 'add_stopwords' expects a comma-separated string of stopwords, e.g., "image, photo, image of" + Use 'add_stopwords' to include list of additional stopwords not included in stopwords 
list (https://github.com/oduwsdl/sumgram/blob/0224fc9d54034a25e296dd1c43c09c76244fc3c2/sumgram/util.py#L31) ''' params = { 'top_sumgram_count': 10, - 'add_stopwords': 'image', + 'add_stopwords': ['image'], 'no_rank_sentences': True, 'title': 'Top sumgrams for Hurricane Harvey text collection' }