Merge pull request #31 from oduwsdl/stdin-ent-url
Implementing support for stdin input, URLs, and a user-supplied stopwords file
anwala authored Mar 14, 2022
2 parents c584e7e + 714aa43 commit 3963338
Showing 6 changed files with 185 additions and 170 deletions.
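For context, after this change add_stopwords is passed as a list (of words, or paths to a stopwords file) rather than a comma-separated string, both on the command line and through the library API. A minimal sketch of the updated library call, using hypothetical input documents:

from sumgram.sumgram import get_top_sumgrams

# hypothetical input documents; any list of {'text': ...} dicts works
doc_dct_lst = [
    {'text': 'Hurricane Harvey flooded large parts of Houston.'},
    {'text': 'Residents evacuated as Hurricane Harvey made landfall in Texas.'}
]

params = {
    'top_sumgram_count': 10,
    'add_stopwords': ['image'],  # now a list, no longer a comma-separated string
    'no_rank_docs': True,
    'no_rank_sentences': True
}

report = get_top_sumgrams(doc_dct_lst, n=2, params=params)
for sumgram in report.get('top_sumgrams', []):
    print(sumgram['ngram'], sumgram['term_freq'])
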
185 changes: 69 additions & 116 deletions README.md

Large diffs are not rendered by default.

5 changes: 3 additions & 2 deletions setup.py
@@ -23,8 +23,9 @@
],
install_requires=[
'numpy',
'requests==2.22.0',
'sklearn==0.0'
'requests',
'sklearn',
'NwalaTextUtils==0.0.5'
],
entry_points={'console_scripts': ['sumgram = sumgram.sumgram:main']}
)
2 changes: 1 addition & 1 deletion sumgram/__init__.py
@@ -1 +1 @@
__version__ = '0.0.19'
__version__ = '1.0.1'
126 changes: 78 additions & 48 deletions sumgram/sumgram.py
@@ -70,6 +70,7 @@ def fmt_posting(doc_dets):
def fmt_report(ngram_lst, params):

params['add_stopwords'] = list( params['add_stopwords'] )
params['add_stopwords_file'] = list( params['add_stopwords_file'] )

for i in range(len(ngram_lst)):

@@ -952,6 +953,7 @@ def rm_empty_and_stopword_ngrams(top_ngrams, k, stopwords):

if( match_flag is True ):
continue

#check if top_ngrams[i]['ngram'] has stopword, if so skip - end

final_top_ngrams.append( top_ngrams[i] )
@@ -1053,6 +1055,7 @@ def print_top_ngrams(n, top_ngrams, top_sumgram_count, params=None):

params.setdefault('ngram_printing_mw', 50)
params.setdefault('title', '')
doc_len = params.get('doc_len', None)
default_color = '49m'
tf_or_df = ''

@@ -1064,7 +1067,7 @@ def print_top_ngrams(n, top_ngrams, top_sumgram_count, params=None):
mw = params['ngram_printing_mw']
ngram_count = len(top_ngrams)

print('\nSummary for ' + str(ngram_count) + ' top sumgrams (base n: ' + str(n) + '): ')
print('\nSummary for {} top sumgrams (base n: {}, docs: {:,}):'.format(ngram_count, n, doc_len))
if( params['title'] != '' ):
print( params['title'])

@@ -1076,9 +1079,9 @@ def print_top_ngrams(n, top_ngrams, top_sumgram_count, params=None):


if( params['base_ngram_ansi_color'] == '' ):
print( '{:^6} {:<{mw}} {:^6} {:<7} {:<30}'.format('rank', 'sumgram', tf_or_df, tf_or_df + '-Rate', 'Base ngram', mw=mw))
print( '{:^6} {:^6} {:<7} {:<30} {:<{mw}}'.format('Rank', tf_or_df, tf_or_df + '-Rate', 'Base ngram', 'Sumgram', mw=mw))
else:
print( '{:^6} {:<{mw}} {:^6} {:<7} {:<30}'.format('rank', getColorTxt('sumgram', default_color), tf_or_df, tf_or_df + '-Rate', 'Base ngram', mw=mw))
print( '{:^6} {:^6} {:<7} {:<30} {:<{mw}}'.format('Rank', tf_or_df, tf_or_df + '-Rate', 'Base ngram', getColorTxt('Sumgram', default_color), mw=mw))

for i in range(top_sumgram_count):

@@ -1110,8 +1113,7 @@ def print_top_ngrams(n, top_ngrams, top_sumgram_count, params=None):
elif( params['base_ngram_ansi_color'] != '' ):
ngram_txt = getColorTxt(ngram_txt, default_color)


print( "{:^6} {:<{mw}} {:^6} {:^7} {:<30}".format(i+1, ngram_txt, ngram['term_freq'], "{:.2f}".format(ngram['term_rate']), base_ngram, mw=mw))
print( "{:^6} {:^6} {:^7} {:<30} {:<{mw}}".format(i+1, ngram['term_freq'], "{:.2f}".format(ngram['term_rate']), base_ngram, ngram_txt, mw=mw))

if( len(last_ngram) != 0 ):
if( params['min_df'] != 1 ):
@@ -1154,9 +1156,9 @@ def extract_top_ngrams(doc_lst, doc_dct_lst, n, params):
binary_tf_flag = True

bif_stopwords = bifurcate_stopwords( params['add_stopwords'] )
stopwords = getStopwordsSet() | bif_stopwords['unigrams']
min_df = params['min_df']
#print('min_df', min_df, type(min_df))
stopwords = bif_stopwords['unigrams'] | params['add_stopwords_file'] if params['no_default_stopwords'] is True else getStopwordsSet() | bif_stopwords['unigrams'] | params['add_stopwords_file']
min_df = params['min_df']

try:
if( isinstance(min_df, str) ):
if( min_df.find('.') == -1 ):
@@ -1166,8 +1168,8 @@ def extract_top_ngrams(doc_lst, doc_dct_lst, n, params):
except:
min_df = 1


params['min_df'] = min_df

count_vectorizer = CountVectorizer(stop_words=stopwords, token_pattern=params['token_pattern'], ngram_range=(n, n), binary=binary_tf_flag, min_df=min_df)

logger.debug('\tfit transform - start')
@@ -1183,7 +1185,14 @@ def extract_top_ngrams(doc_lst, doc_dct_lst, n, params):
logger.debug('\tfit transform - end')

#every entry in list top_ngrams is of type: (a, b), a: term, b: term position in TF matrix
top_ngrams = count_vectorizer.get_feature_names()
try:
top_ngrams = count_vectorizer.get_feature_names_out()
except AttributeError:
top_ngrams = count_vectorizer.get_feature_names()
except:
genericErrorInfo()
return []

filtered_top_ngrams = {}
total_freq = 0

@@ -1238,24 +1247,6 @@ def extract_top_ngrams(doc_lst, doc_dct_lst, n, params):

return filtered_top_ngrams

def get_user_stopwords(sep_stopwords, sep=','):

if( isinstance(sep_stopwords, str) ):

sep_stopwords = sep_stopwords.strip()
if( sep_stopwords == '' ):
return set()

add_stopwords = sep_stopwords.split(sep)
add_stopwords = set( [s.strip().lower() for s in add_stopwords] )
return add_stopwords

elif( isinstance(sep_stopwords, list) ):
#assumes user has already separated the stopwords
return set(sep_stopwords)
else:
return set()

def update_doc_indx(report, doc_id_new_doc_indx_map):

if( len(doc_id_new_doc_indx_map) == 0 ):
@@ -1283,6 +1274,13 @@ def update_doc_indx(report, doc_id_new_doc_indx_map):
if( doc_id in doc_id_new_doc_indx_map ):
report['top_sumgrams'][i][opt][j]['doc_indx'] = doc_id_new_doc_indx_map[doc_id]

def get_user_stopwords(add_stopwords):

all_stopwords = []
for s in add_stopwords:
all_stopwords += s['text'].split()

return all_stopwords

def get_top_sumgrams(doc_dct_lst, n=2, params=None):

@@ -1302,10 +1300,10 @@ def get_top_sumgrams(doc_dct_lst, n=2, params=None):
n = 1

params = get_default_args(params)
params.setdefault('stopwords_sep', ',')

params['state'] = {}
params['add_stopwords'] = get_user_stopwords( params['add_stopwords'], params['stopwords_sep'] )
params['doc_len'] = len(doc_dct_lst)
params['add_stopwords'] = set([ s.strip().lower() for s in params['add_stopwords'] if s.strip() != '' ])
params['add_stopwords_file'] = set([ s.strip().lower() for s in params['add_stopwords_file'] if s.strip() != '' ])
params.setdefault('binary_tf_flag', True)#Multiple occurrence of term T in a document counts as 1, TF = total number of times term appears in collection
nlp_addr = 'http://' + params['corenlp_host'] + ':' + params['corenlp_port']

@@ -1358,8 +1356,10 @@ def get_top_sumgrams(doc_dct_lst, n=2, params=None):

if( 'sentences' in doc_dct_lst[i] ):
del doc_dct_lst[i]['sentences']
#main algorithm step 1 - end

multi_word_proper_nouns = rank_proper_nouns(multi_word_proper_nouns)
#main algorithm step 1 - end


logger.debug('\tsentence segmentation - end')
logger.debug('\tshift: ' + str(params['shift']))
@@ -1428,6 +1428,7 @@ def get_top_sumgrams(doc_dct_lst, n=2, params=None):


top_ngrams = rm_empty_and_stopword_ngrams( top_ngrams, params['top_sumgram_count'] * 2, params['add_stopwords'] )

doc_id_new_doc_indx_map = {}
if( params['no_rank_docs'] == False ):
report['ranked_docs'], doc_id_new_doc_indx_map = get_ranked_docs( top_ngrams, doc_dct_lst )
@@ -1462,32 +1463,34 @@ def get_top_sumgrams(doc_dct_lst, n=2, params=None):
def get_args():

parser = argparse.ArgumentParser(formatter_class=lambda prog: argparse.HelpFormatter(prog, max_help_position=30))
parser.add_argument('path', nargs='+', help='Folder path containing input documents or path to single file or multiple files')
parser.add_argument('path', nargs='*', help='File(s) or path to file(s) or URL(s)')

parser.add_argument('-d', '--print-details', help='Print detailed output', action='store_true')
parser.add_argument('-n', '--base-ngram', help='The base n (integer) for generating top sumgrams, if n = 2, bigrams would be the base ngram', type=int, default=2)
parser.add_argument('-m', '--max-ngram', help='The maximum length of sumgram generated', type=int, default=10)
parser.add_argument('-n', '--base-ngram', help='The base n (integer) for generating top sumgrams, if n = 2, bigrams would be the base ngram', type=int, default=2)
parser.add_argument('-o', '--output', help='Output file')
parser.add_argument('-s', '--sentences-rank-count', help='The count of top ranked sentences to generate', type=int, default=10)
parser.add_argument('-t', '--top-sumgram-count', help='The count of top sumgrams to generate', type=int, default=10)

parser.add_argument('--add-stopwords', help='Comma-separated list of additional stopwords. To change delimiter use --stopwords-sep', default='')
parser.add_argument('--add-stopwords', nargs='+', help='Single or multiple additional stopwords or path to stopwords file (one stopword per line)', default=[])
parser.add_argument('--boilerplate-rm-method', help='Method to apply for removing HTML boilerplate', choices=['boilerpy3.DefaultExtractor', 'boilerpy3.ArticleExtractor', 'boilerpy3.ArticleSentencesExtractor', 'boilerpy3.LargestContentExtractor', 'boilerpy3.CanolaExtractor', 'boilerpy3.KeepEverythingExtractor', 'boilerpy3.NumWordsRulesExtractor', 'nltk'], default='boilerpy3.ArticleExtractor')
parser.add_argument('--collocations-pattern', help='User-defined regex rule to extract collocations for pos_glue_split_ngrams', default='')
parser.add_argument('--corenlp-host', help='Stanford CoreNLP Server host (needed for decent sentence tokenizer)', default='localhost')
parser.add_argument('--corenlp-port', help='Stanford CoreNLP Server port (needed for decent sentence tokenizer)', default='9000')
parser.add_argument('--corenlp-max-sentence-words', help='Stanford CoreNLP maximum words per sentence', default=100)
parser.add_argument('--max-file-depth', help='When reading files recursively from directory stop at the specified path depth. 0 means no restriction', type=int, default=1)
parser.add_argument('--include-postings', help='Include inverted index of term document mappings', action='store_true')#default is False unless the flag is supplied
parser.add_argument('--max-file-depth', help='When reading files recursively from directory stop at the specified path depth. 0 means no restriction', type=int, default=1)

parser.add_argument('--log-file', help='Log output filename', default='')
parser.add_argument('--log-format', help='Log print format, see: https://docs.python.org/3/howto/logging-cookbook.html', default='')
parser.add_argument('--log-level', help='Log level', choices=['critical', 'error', 'warning', 'info', 'debug', 'notset'], default='info')

parser.add_argument('--mvg-window-min-proper-noun-rate', help='Minimum rate threshold (larger, stricter) to consider a multi-word proper noun a candidate to replace an ngram', type=float, default=0.5)
parser.add_argument('--min-df', help='See min_df in https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html', default=0.01)
parser.add_argument('--mvg-window-min-proper-noun-rate', help='Minimum rate threshold (larger, stricter) to consider a multi-word proper noun a candidate to replace an ngram', type=float, default=0.5)
parser.add_argument('--ngram-printing-mw', help='Minimum width for printing ngrams', type=int, default=50)

parser.add_argument('--base-ngram-ansi-color', help='Highlight (color code format - XXm, e.g., 91m) base ngram when printing top ngrams, set to empty string to switch off color', default='91m')
parser.add_argument('--no-default-stopwords', help='Do not use default English stopwords list (default is False)', action='store_true')
parser.add_argument('--no-mvg-window-glue-split-ngrams', help='Do not glue split top ngrams with Moving Window method (default is False)', action='store_true')
parser.add_argument('--no-parent-sentences', help='Do not include sentences that mention top ngrams in top ngrams payload (default is False)', action='store_true')
parser.add_argument('--no-pos-glue-split-ngrams', help='Do not glue split top ngrams with POS method (default is False)', action='store_true')
@@ -1502,7 +1505,6 @@ def get_args():
parser.add_argument('--sentence-pattern', help='For sentence ranking: Regex string that specifies tokens for sentence tokenization', default='[.?!][ \n]|\n+')
parser.add_argument('--sentence-tokenizer', help='For sentence ranking: Method for segmenting sentences', choices=['ssplit', 'regex'], default='regex')
parser.add_argument('--shift', help='Factor to shift top ngram calculation', type=int, default=0)
parser.add_argument('--stopwords-sep', help='Delimiter of stopwords list, comma is default', default=',')
parser.add_argument('--token-pattern', help='Regex string that specifies tokens for document tokenization', default=r'(?u)\b[a-zA-Z\'\’-]+[a-zA-Z]+\b|\d+[.,]?\d*')
parser.add_argument('--title', help='Text label to be used as a heading when printing top sumgrams', default='')
parser.add_argument('--thread-count', help='Maximum number of threads to use for parallel operations like segmenting sentences', type=int, default=5)
@@ -1522,6 +1524,7 @@ def get_default_args(user_params):
user_params[val.dest] = val.default

del user_params['help']
user_params['add_stopwords_file'] = []
return user_params


@@ -1620,22 +1623,49 @@ def set_log_defaults(params):

def main():

if( len(sys.argv) > 1 ):
if( sys.argv[1] == '-v' or sys.argv[1] == '--version' ):

from sumgram import __version__
print(__version__)
return
if( len(sys.argv) > 1 and (sys.argv[1] == '-v' or sys.argv[1] == '--version') ):
from sumgram import __version__
print(__version__)
return

parser = get_args()

args = parser.parse_args()
params = vars(args)

if( len(sys.argv) == 1 ):
parser.print_help()
return

doc_lst = []
set_log_defaults(params)
set_logger_dets( params['log_dets'] )

if( len(sys.argv) > 1 and (sys.argv[-1] == '-') ):
try:
doc_lst = [{'text': line} for line in sys.stdin]
except:
genericErrorInfo()

doc_lst = readTextFromFilesRecursive(args.path, addDetails=True, maxDepth=params['max_file_depth'])
params['add_stopwords'] = params['add_stopwords'][:-1] if (len(params['add_stopwords']) != 0 and params['add_stopwords'][-1].strip() == '-') else params['add_stopwords']
else:
doc_lst = generic_txt_extrator(args.path, max_file_depth=params['max_file_depth'], boilerplate_rm_method=params['boilerplate_rm_method'])

'''
add_stopwords:
* unigrams in add_stopwords complement the stopwords from getStopwordsSet() when building the initial top n-grams, see: CountVectorizer(stop_words=stopwords, ...)
* n-grams in add_stopwords are used to remove sumgrams that contain them, see: rm_empty_and_stopword_ngrams()
'''
add_stopwords = []
params['add_stopwords_file'] = []
for st in generic_txt_extrator(params['add_stopwords']):

if( 'filename' in st ):
params['add_stopwords_file'].append(st)
else:
add_stopwords.append(st)

params['add_stopwords'] = get_user_stopwords(add_stopwords)
params['add_stopwords_file'] = get_user_stopwords(params['add_stopwords_file'])
params['referrer'] = 'main'
proc_req(doc_lst, params)

@@ -1644,29 +1674,29 @@ def main():
from sumgram.util import getColorTxt
from sumgram.util import getStopwordsSet
from sumgram.util import genericErrorInfo
from sumgram.util import generic_txt_extrator
from sumgram.util import isMatchInOrder
from sumgram.util import nlpIsServerOn
from sumgram.util import nlpSentenceAnnotate
from sumgram.util import nlpServerStartStop
from sumgram.util import overlapFor2Sets
from sumgram.util import parallelTask
from sumgram.util import phraseTokenizer
from sumgram.util import readTextFromFilesRecursive
from sumgram.util import rmStopwords
from sumgram.util import sortDctByKey
else:
from util import dumpJsonToFile
from util import getColorTxt
from util import getStopwordsSet
from util import genericErrorInfo
from util import generic_txt_extrator
from util import isMatchInOrder
from util import nlpIsServerOn
from util import nlpSentenceAnnotate
from util import nlpServerStartStop
from util import overlapFor2Sets
from util import parallelTask
from util import phraseTokenizer
from util import readTextFromFilesRecursive
from util import rmStopwords
from util import sortDctByKey

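The try/except around get_feature_names_out() in extract_top_ngrams above keeps the code working across scikit-learn releases: get_feature_names_out() was introduced in scikit-learn 1.0, and the older get_feature_names() has since been removed. A minimal, self-contained sketch of the same fallback pattern (example texts hypothetical):

from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer(ngram_range=(2, 2), binary=True)
vectorizer.fit(['hurricane harvey made landfall', 'hurricane harvey flooded houston'])

try:
    top_ngrams = vectorizer.get_feature_names_out()  # scikit-learn >= 1.0
except AttributeError:
    top_ngrams = vectorizer.get_feature_names()      # older scikit-learn releases

print(list(top_ngrams))
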
35 changes: 33 additions & 2 deletions sumgram/util.py
@@ -9,6 +9,7 @@

from subprocess import check_output, CalledProcessError
from multiprocessing import Pool
from NwalaTextUtils.textutils import parallelGetTxtFrmURIs

logger = logging.getLogger('sumGram.sumgram')

@@ -117,7 +118,7 @@ def getStopwordsDict():
"enough": True,
"etc": True,
"etc.": True,
"even though": True,
"even": True,
"ever": True,
"every": True,
"everyone": True,
@@ -440,7 +441,6 @@ def readTextFromFilesRecursive(files, addDetails=True, curDepth=0, maxDepth=0):
for f in files:

f = f.strip()

if( f.endswith('.tar') or f.endswith('.tar.gz') ):
result += readTextFromTar(f, addDetails=addDetails)

@@ -474,6 +474,37 @@ def readTextFromFilesRecursive(files, addDetails=True, curDepth=0, maxDepth=0):
result += readTextFromFilesRecursive(secondLevelFiles, addDetails=addDetails, curDepth=curDepth+1, maxDepth=maxDepth)

return result

#max_file_depth = 0 means no restriction
def generic_txt_extrator(sources, max_file_depth=0, boilerplate_rm_method='boilerpy3.ArticleExtractor'):

urls = []
doc_lst = []
for txt_src in sources:

#txt_src can be a url, file, or raw text
txt_src = txt_src.strip()
if( txt_src == '' ):
continue

if( txt_src.startswith('http://') or txt_src.startswith('https://') ):
urls.append(txt_src)
continue

txt_file = readTextFromFilesRecursive([txt_src], addDetails=True, maxDepth=max_file_depth)
doc_lst += [{'text': txt_src}] if len(txt_file) == 0 else txt_file

if( len(urls) != 0 ):
logger.info('\nDereferencing {} URL(s) - start'.format(len(urls)))

plain_text = parallelGetTxtFrmURIs(urls, boilerplateRmMethod=boilerplate_rm_method)
for i in range(len(plain_text)):
plain_text[i]['text'] = plain_text[i]['title'] + ' ' + plain_text[i]['text']

doc_lst += plain_text
logger.info('Dereferencing {} URL(s) - end'.format(len(urls)))

return doc_lst
#nlp server - start

def nlpIsServerOn(addr='http://localhost:9000'):
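A minimal sketch (URL and file path hypothetical) of how the new generic_txt_extrator lets URLs, local files, and raw text be mixed in a single run before summarization:

from sumgram.util import generic_txt_extrator
from sumgram.sumgram import get_top_sumgrams

# sources may be URLs, file or directory paths, or raw text; URLs are
# dereferenced in parallel and HTML boilerplate is removed
sources = [
    'https://example.com/harvey-report.html',  # hypothetical URL
    'collection/doc1.txt',                     # hypothetical local file
    'Raw text passed directly.'
]

doc_lst = generic_txt_extrator(sources, max_file_depth=1, boilerplate_rm_method='boilerpy3.ArticleExtractor')
report = get_top_sumgrams(doc_lst, n=2, params={'top_sumgram_count': 10})
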
2 changes: 1 addition & 1 deletion tests/unit/test_sumgram.py
@@ -29,7 +29,7 @@ def test_multiple_opts(self):
]
params = {
'top_sumgram_count': 10,
'add_stopwords': 'image',
'add_stopwords': ['image'],
'no_rank_docs': True,
'no_rank_sentences': True,
'title': 'Top sumgrams for Hurricane Harvey text collection'
