From f1f1cfb35a9b723d45127236988f0e32932cbe3f Mon Sep 17 00:00:00 2001 From: Ankush-Chander Date: Sat, 4 Jun 2022 11:22:35 +0530 Subject: [PATCH 1/2] make biasedrank case agnostic. --- pytextrank/biasedrank.py | 4 ++-- tests/test_biasedrank.py | 25 +++++++++++++++++++++++++ 2 files changed, 27 insertions(+), 2 deletions(-) diff --git a/pytextrank/biasedrank.py b/pytextrank/biasedrank.py index 0135882..b6fa2f3 100644 --- a/pytextrank/biasedrank.py +++ b/pytextrank/biasedrank.py @@ -76,10 +76,10 @@ def _get_node_bias ( returns: bias to apply for the *node weight* """ - if token.text in self.focus_tokens: + if token.text.lower() in self.focus_tokens: return self.node_bias - if token.lemma_ in self.focus_tokens: + if token.lemma_.lower() in self.focus_tokens: return self.node_bias return self.default_bias diff --git a/tests/test_biasedrank.py b/tests/test_biasedrank.py index f6ee60d..f7785a7 100644 --- a/tests/test_biasedrank.py +++ b/tests/test_biasedrank.py @@ -31,6 +31,31 @@ def test_default_biased_rank (doc: Doc): assert tuple(p.text for p in phrases) == tuple(p.text for p in comparison_phrases) +def test_focus_biased_rank (doc: Doc): + """ +Biasedrank should lead to different results from base textrank when focus is provided. + """ + # given + biased_rank = BiasedTextRankFactory() + base_text_rank = BaseTextRankFactory() + + # when + processed_doc = base_text_rank(doc) + phrases = processed_doc._.phrases + + comparison_doc = biased_rank(doc) + tr = comparison_doc._.textrank + tr.change_focus( + "Manchester United", + bias=10.0, + default_bias=0.0) + comparison_phrases = comparison_doc._.phrases + + # then + assert tuple(p.text for p in phrases[:3]) != tuple(p.text for p in comparison_phrases[:3]) + + + def test_biased_rank (long_doc: Doc): """ Rank phrases close to 'focus' higher. From 8c2f47ff09f01741bd0e7a8c44c1336f2e3bb16d Mon Sep 17 00:00:00 2001 From: Ankush-Chander Date: Sat, 4 Jun 2022 12:03:10 +0530 Subject: [PATCH 2/2] add documentation for biased textrank. --- examples/sample.ipynb | 2213 +++++++++++++++++++++-------------------- 1 file changed, 1153 insertions(+), 1060 deletions(-) diff --git a/examples/sample.ipynb b/examples/sample.ipynb index 849bed8..70ed68c 100644 --- a/examples/sample.ipynb +++ b/examples/sample.ipynb @@ -97,8 +97,8 @@ "name": "stdout", "output_type": "stream", "text": [ - "\u001b[1m\n", - "============================= Pipeline Overview =============================\u001b[0m\n", + "\u001B[1m\n", + "============================= Pipeline Overview =============================\u001B[0m\n", "\n", "# Component Assigns Requires Scores Retokenizes\n", "- --------------- ------------------- -------- ---------------- -----------\n", @@ -124,7 +124,7 @@ " \n", "6 textrank False \n", "\n", - "\u001b[38;5;2m✔ No problems found.\u001b[0m\n" + "\u001B[38;5;2m✔ No problems found.\u001B[0m\n" ] }, { @@ -173,16 +173,16 @@ " 'lemmatizer': [],\n", " 'ner': [],\n", " 'textrank': []},\n", - " 'attrs': {'token.ent_iob': {'assigns': ['ner'], 'requires': []},\n", + " 'attrs': {'token.dep': {'assigns': ['parser'], 'requires': []},\n", " 'token.ent_type': {'assigns': ['ner'], 'requires': []},\n", - " 'token.tag': {'assigns': ['tagger'], 'requires': []},\n", - " 'token.dep': {'assigns': ['parser'], 'requires': []},\n", " 'token.head': {'assigns': ['parser'], 'requires': []},\n", - " 'doc.sents': {'assigns': ['parser'], 'requires': []},\n", - " 'doc.ents': {'assigns': ['ner'], 'requires': []},\n", + " 'token.is_sent_start': {'assigns': ['parser'], 'requires': []},\n", + " 'token.ent_iob': {'assigns': ['ner'], 'requires': []},\n", " 'doc.tensor': {'assigns': ['tok2vec'], 'requires': []},\n", + " 'doc.ents': {'assigns': ['ner'], 'requires': []},\n", + " 'doc.sents': {'assigns': ['parser'], 'requires': []},\n", " 'token.lemma': {'assigns': ['lemmatizer'], 'requires': []},\n", - " 'token.is_sent_start': {'assigns': ['parser'], 'requires': []}}}" + " 'token.tag': {'assigns': ['tagger'], 'requires': []}}}" ] }, "execution_count": 4, @@ -276,7 +276,7 @@ "name": "stderr", "output_type": "stream", "text": [ - "ic| tr.elapsed_time: 5.707979202270508\n" + "ic| tr.elapsed_time: 7.581949234008789\n" ] } ], @@ -306,103 +306,99 @@ "name": "stderr", "output_type": "stream", "text": [ - "ic| phrase.rank: 0.17054248030845812\n", + "ic| phrase.rank: 0.18359439311764025\n", " phrase.count: 1\n", " phrase.text: 'mixed types'\n", "ic| phrase.chunks: [mixed types]\n", - "ic| phrase.rank: 0.15757771579579002\n", - " phrase.count: 1\n", - " phrase.text: 'minimal generating sets'\n", - "ic| phrase.chunks: [minimal generating sets]\n", - "ic| phrase.rank: 0.1573942320091846\n", + "ic| phrase.rank: 0.1784796193107821\n", " phrase.count: 3\n", " phrase.text: 'systems'\n", "ic| phrase.chunks: [systems, systems, systems]\n", - "ic| phrase.rank: 0.14894241299658317\n", + "ic| phrase.rank: 0.15037838042245094\n", + " phrase.count: 1\n", + " phrase.text: 'minimal generating sets'\n", + "ic| phrase.chunks: [minimal generating sets]\n", + "ic| phrase.rank: 0.14740065982407313\n", " phrase.count: 1\n", " phrase.text: 'nonstrict inequations'\n", "ic| phrase.chunks: [nonstrict inequations]\n", - "ic| phrase.rank: 0.14039169904589088\n", + "ic| phrase.rank: 0.13946027725597837\n", " phrase.count: 1\n", " phrase.text: 'strict inequations'\n", "ic| phrase.chunks: [strict inequations]\n", - "ic| phrase.rank: 0.11698198658021898\n", - " phrase.count: 1\n", - " phrase.text: 'natural numbers'\n", - "ic| phrase.chunks: [natural numbers]\n", - "ic| phrase.rank: 0.11559770516796158\n", + "ic| phrase.rank: 0.1195023546245721\n", " phrase.count: 1\n", " phrase.text: 'linear Diophantine equations'\n", "ic| phrase.chunks: [linear Diophantine equations]\n", - "ic| phrase.rank: 0.11407086615794945\n", + "ic| phrase.rank: 0.11450088293222845\n", + " phrase.count: 1\n", + " phrase.text: 'natural numbers'\n", + "ic| phrase.chunks: [natural numbers]\n", + "ic| phrase.rank: 0.10780718173686318\n", " phrase.count: 3\n", " phrase.text: 'solutions'\n", "ic| phrase.chunks: [solutions, solutions, solutions]\n", - "ic| phrase.rank: 0.10165710454752863\n", + "ic| phrase.rank: 0.10529828014583348\n", " phrase.count: 1\n", " phrase.text: 'linear constraints'\n", "ic| phrase.chunks: [linear constraints]\n", - "ic| phrase.rank: 0.09237587396226833\n", + "ic| phrase.rank: 0.1036960590708142\n", + " phrase.count: 1\n", + " phrase.text: 'all the considered types systems'\n", + "ic| phrase.chunks: [all the considered types systems]\n", + "ic| phrase.rank: 0.08812713074893187\n", " phrase.count: 1\n", " phrase.text: 'a minimal supporting set'\n", "ic| phrase.chunks: [a minimal supporting set]\n", - "ic| phrase.rank: 0.08845296671843554\n", + "ic| phrase.rank: 0.08243620500315359\n", " phrase.count: 1\n", - " phrase.text: 'all the considered types systems'\n", - "ic| phrase.chunks: [all the considered types systems]\n", - "ic| phrase.rank: 0.08294839224739124\n", + " phrase.text: 'a system'\n", + "ic| phrase.chunks: [a system]\n", + "ic| phrase.rank: 0.07944607954086784\n", " phrase.count: 1\n", " phrase.text: 'a minimal set'\n", "ic| phrase.chunks: [a minimal set]\n", - "ic| phrase.rank: 0.08107274369298882\n", + "ic| phrase.rank: 0.0763527926213032\n", " phrase.count: 1\n", " phrase.text: 'algorithms'\n", "ic| phrase.chunks: [algorithms]\n", - "ic| phrase.rank: 0.07429406639612553\n", - " phrase.count: 1\n", - " phrase.text: 'construction'\n", - "ic| phrase.chunks: [construction]\n", - "ic| phrase.rank: 0.07269728177551771\n", + "ic| phrase.rank: 0.07593126037016427\n", " phrase.count: 1\n", - " phrase.text: 'a system'\n", - "ic| phrase.chunks: [a system]\n", - "ic| phrase.rank: 0.07130948853545689\n", + " phrase.text: 'all types'\n", + "ic| phrase.chunks: [all types]\n", + "ic| phrase.rank: 0.07309361902551355\n", " phrase.count: 1\n", " phrase.text: 'Diophantine'\n", "ic| phrase.chunks: [Diophantine]\n", - "ic| phrase.rank: 0.07034880604533804\n", - " phrase.count: 1\n", - " phrase.text: 'all types'\n", - "ic| phrase.chunks: [all types]\n", - "ic| phrase.rank: 0.06480303503167001\n", + "ic| phrase.rank: 0.0702090100898443\n", " phrase.count: 1\n", - " phrase.text: 'Upper bounds'\n", - "ic| phrase.chunks: [Upper bounds]\n", - "ic| phrase.rank: 0.05969087234318076\n", + " phrase.text: 'construction'\n", + "ic| phrase.chunks: [construction]\n", + "ic| phrase.rank: 0.05800111772673988\n", " phrase.count: 1\n", " phrase.text: 'the set'\n", "ic| phrase.chunks: [the set]\n", - "ic| phrase.rank: 0.05837512270115124\n", + "ic| phrase.rank: 0.054251394765316464\n", " phrase.count: 1\n", " phrase.text: 'components'\n", "ic| phrase.chunks: [components]\n", - "ic| phrase.rank: 0.048602276273752514\n", + "ic| phrase.rank: 0.04516904342912139\n", " phrase.count: 1\n", " phrase.text: 'Compatibility'\n", "ic| phrase.chunks: [Compatibility]\n", - "ic| phrase.rank: 0.048602276273752514\n", + "ic| phrase.rank: 0.04516904342912139\n", " phrase.count: 1\n", " phrase.text: 'compatibility'\n", "ic| phrase.chunks: [compatibility]\n", - "ic| phrase.rank: 0.0472624878442175\n", + "ic| phrase.rank: 0.04435648606848154\n", " phrase.count: 1\n", " phrase.text: 'the corresponding algorithms'\n", "ic| phrase.chunks: [the corresponding algorithms]\n", - "ic| phrase.rank: 0.04548690742119631\n", + "ic| phrase.rank: 0.042273783712246285\n", " phrase.count: 1\n", " phrase.text: 'Criteria'\n", "ic| phrase.chunks: [Criteria]\n", - "ic| phrase.rank: 0.021009502595385022\n", + "ic| phrase.rank: 0.01952542432474353\n", " phrase.count: 1\n", " phrase.text: 'These criteria'\n", "ic| phrase.chunks: [These criteria]\n" @@ -418,7 +414,11 @@ { "cell_type": "markdown", "id": "medium-vertex", - "metadata": {}, + "metadata": { + "pycharm": { + "name": "#%% md\n" + } + }, "source": [ "## Stop Words\n", "\n", @@ -430,6 +430,9 @@ "execution_count": 9, "id": "separated-mambo", "metadata": { + "pycharm": { + "name": "#%%\n" + }, "scrolled": true, "tags": [] }, @@ -438,16 +441,16 @@ "name": "stderr", "output_type": "stream", "text": [ - "ic| phrase: Phrase(text='words', chunks=[words, words], count=2, rank=0.15746606699141763)\n", - "ic| phrase: Phrase(text='sentences', chunks=[sentences], count=1, rank=0.12965916420829138)\n", - "ic| phrase: Phrase(text='Mihalcea et al', chunks=[Mihalcea et al], count=1, rank=0.10571655249620954)\n", - "ic| phrase: Phrase(text='the remaining words', chunks=[the remaining words], count=1, rank=0.09329379463860477)\n", - "ic| phrase: Phrase(text='gensim implements TextRank', chunks=[gensim implements TextRank], count=1, rank=0.08981955768260336)\n", - "ic| phrase: Phrase(text='text summarization', chunks=[text summarization], count=1, rank=0.0843351188899575)\n", - "ic| phrase: Phrase(text='ranking webpages', chunks=[ranking webpages], count=1, rank=0.07936404910104827)\n", - "ic| phrase: Phrase(text='Okapi BM25 function', chunks=[Okapi BM25 function], count=1, rank=0.07400094270083186)\n", - "ic| phrase: Phrase(text='every other sentence', chunks=[every other sentence], count=1, rank=0.07073416034725326)\n", - "ic| phrase: Phrase(text='original TextRank', chunks=[original TextRank], count=1, rank=0.06710956557420322)\n" + "ic| phrase: Phrase(text='words', chunks=[words, words], count=2, rank=0.16137018222637944)\n", + "ic| phrase: Phrase(text='sentences', chunks=[sentences], count=1, rank=0.13367291641220508)\n", + "ic| phrase: Phrase(text='Mihalcea et al', chunks=[Mihalcea et al], count=1, rank=0.1095023226326187)\n", + "ic| phrase: Phrase(text='et al', chunks=[et al], count=1, rank=0.10745197034799042)\n", + "ic| phrase: Phrase(text='Barrios et al', chunks=[Barrios et al], count=1, rank=0.10502825160040344)\n", + "ic| phrase: Phrase(text='the remaining words', chunks=[the remaining words], count=1, rank=0.09559863808781449)\n", + "ic| phrase: Phrase(text='gensim implements TextRank', chunks=[gensim implements TextRank], count=1, rank=0.09162794519014893)\n", + "ic| phrase: Phrase(text='text summarization', chunks=[text summarization], count=1, rank=0.08555365347028678)\n", + "ic| phrase: Phrase(text='ranking webpages', chunks=[ranking webpages], count=1, rank=0.07894442579092492)\n", + "ic| phrase: Phrase(text='algorithm', chunks=[algorithm], count=1, rank=0.07747520663125698)\n" ] } ], @@ -481,16 +484,16 @@ "name": "stderr", "output_type": "stream", "text": [ - "ic| phrase: Phrase(text='sentences', chunks=[sentences], count=1, rank=0.14407775200046075)\n", - "ic| phrase: Phrase(text='Mihalcea et al', chunks=[Mihalcea et al], count=1, rank=0.11286475216345385)\n", - "ic| phrase: Phrase(text='gensim implements TextRank', chunks=[gensim implements TextRank], count=1, rank=0.09589788430130489)\n", - "ic| phrase: Phrase(text='text summarization', chunks=[text summarization], count=1, rank=0.09004754289053603)\n", - "ic| phrase: Phrase(text='ranking webpages', chunks=[ranking webpages], count=1, rank=0.08473538778364878)\n", - "ic| phrase: Phrase(text='every other sentence', chunks=[every other sentence], count=1, rank=0.07909136977858265)\n", - "ic| phrase: Phrase(text='Okapi BM25 function', chunks=[Okapi BM25 function], count=1, rank=0.07900911166567022)\n", - "ic| phrase: Phrase(text='original TextRank', chunks=[original TextRank], count=1, rank=0.07165073049436399)\n", - "ic| phrase: Phrase(text='TextRank', chunks=[TextRank, TextRank, TextRank, TextRank], count=4, rank=0.06888311869751775)\n", - "ic| phrase: Phrase(text='every sentence', chunks=[every sentence], count=1, rank=0.06654666312136172)\n" + "ic| phrase: Phrase(text='sentences', chunks=[sentences], count=1, rank=0.1490464677880926)\n", + "ic| phrase: Phrase(text='Mihalcea et al', chunks=[Mihalcea et al], count=1, rank=0.117318519527749)\n", + "ic| phrase: Phrase(text='et al', chunks=[et al], count=1, rank=0.11512161354108796)\n", + "ic| phrase: Phrase(text='Barrios et al', chunks=[Barrios et al], count=1, rank=0.11252482346188267)\n", + "ic| phrase: Phrase(text='gensim implements TextRank', chunks=[gensim implements TextRank], count=1, rank=0.09816426515530181)\n", + "ic| phrase: Phrase(text='text summarization', chunks=[text summarization], count=1, rank=0.09165889278462461)\n", + "ic| phrase: Phrase(text='ranking webpages', chunks=[ranking webpages], count=1, rank=0.08457790386936588)\n", + "ic| phrase: Phrase(text='algorithm', chunks=[algorithm], count=1, rank=0.08300479194058319)\n", + "ic| phrase: Phrase(text='every other sentence', chunks=[every other sentence], count=1, rank=0.08179233228776425)\n", + "ic| phrase: Phrase(text='Okapi BM25 function', chunks=[Okapi BM25 function], count=1, rank=0.07919192237459494)\n" ] } ], @@ -532,16 +535,16 @@ "name": "stderr", "output_type": "stream", "text": [ - "ic| phrase: Phrase(text='sentences', chunks=[sentences, the sentences], count=2, rank=0.14407775200046075)\n", - "ic| phrase: Phrase(text='Mihalcea et al', chunks=[Mihalcea et al], count=1, rank=0.11286475216345385)\n", - "ic| phrase: Phrase(text='gensim implements TextRank', chunks=[gensim implements TextRank], count=1, rank=0.09589788430130489)\n", - "ic| phrase: Phrase(text='text summarization', chunks=[text summarization], count=1, rank=0.09004754289053603)\n", - "ic| phrase: Phrase(text='ranking webpages', chunks=[ranking webpages], count=1, rank=0.08473538778364878)\n", - "ic| phrase: Phrase(text='sentence', chunks=[every sentence, every other sentence], count=2, rank=0.07909136977858265)\n", - "ic| phrase: Phrase(text='Okapi BM25 function', chunks=[Okapi BM25 function], count=1, rank=0.07900911166567022)\n", - "ic| phrase: Phrase(text='original TextRank', chunks=[original TextRank], count=1, rank=0.07165073049436399)\n", - "ic| phrase: Phrase(text='TextRank', chunks=[TextRank, TextRank, TextRank, TextRank], count=4, rank=0.06888311869751775)\n", - "ic| phrase: Phrase(text='two sentences', chunks=[the two sentences, two sentences], count=2, rank=0.06654666312136172)\n" + "ic| phrase: Phrase(text='sentences', chunks=[sentences, the sentences], count=2, rank=0.1490464677880926)\n", + "ic| phrase: Phrase(text='Mihalcea et al', chunks=[Mihalcea et al], count=1, rank=0.117318519527749)\n", + "ic| phrase: Phrase(text='et al', chunks=[et al], count=1, rank=0.11512161354108796)\n", + "ic| phrase: Phrase(text='Barrios et al', chunks=[Barrios et al], count=1, rank=0.11252482346188267)\n", + "ic| phrase: Phrase(text='gensim implements TextRank', chunks=[gensim implements TextRank], count=1, rank=0.09816426515530181)\n", + "ic| phrase: Phrase(text='text summarization', chunks=[text summarization], count=1, rank=0.09165889278462461)\n", + "ic| phrase: Phrase(text='ranking webpages', chunks=[ranking webpages], count=1, rank=0.08457790386936588)\n", + "ic| phrase: Phrase(text='algorithm', chunks=[algorithm], count=1, rank=0.08300479194058319)\n", + "ic| phrase: Phrase(text='sentence', chunks=[every sentence, every other sentence], count=2, rank=0.08179233228776425)\n", + "ic| phrase: Phrase(text='Okapi BM25 function', chunks=[Okapi BM25 function], count=1, rank=0.07919192237459494)\n" ] } ], @@ -610,16 +613,16 @@ "name": "stderr", "output_type": "stream", "text": [ - "ic| phrase: Phrase(text='sentences', chunks=[sentences, the sentences], count=2, rank=0.14407775200046075)\n", - "ic| phrase: Phrase(text='Mihalcea et al', chunks=[Mihalcea et al], count=1, rank=0.11286475216345385)\n", - "ic| phrase: Phrase(text='gensim implements TextRank', chunks=[gensim implements TextRank], count=1, rank=0.09589788430130489)\n", - "ic| phrase: Phrase(text='text summarization', chunks=[text summarization], count=1, rank=0.09004754289053603)\n", - "ic| phrase: Phrase(text='ranking webpages', chunks=[ranking webpages], count=1, rank=0.08473538778364878)\n", - "ic| phrase: Phrase(text='other sentence', chunks=[every other sentence], count=1, rank=0.07909136977858265)\n", - "ic| phrase: Phrase(text='Okapi BM25 function', chunks=[Okapi BM25 function], count=1, rank=0.07900911166567022)\n", - "ic| phrase: Phrase(text='original TextRank', chunks=[original TextRank], count=1, rank=0.07165073049436399)\n", - "ic| phrase: Phrase(text='TextRank', chunks=[TextRank, TextRank, TextRank, TextRank], count=4, rank=0.06888311869751775)\n", - "ic| phrase: Phrase(text='sentence', chunks=[every sentence], count=1, rank=0.06654666312136172)\n" + "ic| phrase: Phrase(text='sentences', chunks=[sentences, the sentences], count=2, rank=0.1490464677880926)\n", + "ic| phrase: Phrase(text='Mihalcea et al', chunks=[Mihalcea et al], count=1, rank=0.117318519527749)\n", + "ic| phrase: Phrase(text='et al', chunks=[et al], count=1, rank=0.11512161354108796)\n", + "ic| phrase: Phrase(text='Barrios et al', chunks=[Barrios et al], count=1, rank=0.11252482346188267)\n", + "ic| phrase: Phrase(text='gensim implements TextRank', chunks=[gensim implements TextRank], count=1, rank=0.09816426515530181)\n", + "ic| phrase: Phrase(text='text summarization', chunks=[text summarization], count=1, rank=0.09165889278462461)\n", + "ic| phrase: Phrase(text='ranking webpages', chunks=[ranking webpages], count=1, rank=0.08457790386936588)\n", + "ic| phrase: Phrase(text='algorithm', chunks=[algorithm], count=1, rank=0.08300479194058319)\n", + "ic| phrase: Phrase(text='other sentence', chunks=[every other sentence], count=1, rank=0.08179233228776425)\n", + "ic| phrase: Phrase(text='Okapi BM25 function', chunks=[Okapi BM25 function], count=1, rank=0.07919192237459494)\n" ] } ], @@ -668,15 +671,15 @@ "name": "stderr", "output_type": "stream", "text": [ - "ic| phrase: Phrase(text='sentences', chunks=[sentences], count=1, rank=0.14407775200046075)\n", - "ic| phrase: Phrase(text='gensim implements TextRank', chunks=[gensim implements TextRank], count=1, rank=0.09589788430130489)\n", - "ic| phrase: Phrase(text='text summarization', chunks=[text summarization], count=1, rank=0.09004754289053603)\n", - "ic| phrase: Phrase(text='ranking webpages', chunks=[ranking webpages], count=1, rank=0.08473538778364878)\n", - "ic| phrase: Phrase(text='every other sentence', chunks=[every other sentence], count=1, rank=0.07909136977858265)\n", - "ic| phrase: Phrase(text='Okapi BM25 function', chunks=[Okapi BM25 function], count=1, rank=0.07900911166567022)\n", - "ic| phrase: Phrase(text='original TextRank', chunks=[original TextRank], count=1, rank=0.07165073049436399)\n", - "ic| phrase: Phrase(text='every sentence', chunks=[every sentence], count=1, rank=0.06654666312136172)\n", - "ic| phrase: Phrase(text='the sentences', chunks=[the sentences], count=1, rank=0.06654666312136172)\n" + "ic| phrase: Phrase(text='sentences', chunks=[sentences], count=1, rank=0.1490464677880926)\n", + "ic| phrase: Phrase(text='gensim implements TextRank', chunks=[gensim implements TextRank], count=1, rank=0.09816426515530181)\n", + "ic| phrase: Phrase(text='text summarization', chunks=[text summarization], count=1, rank=0.09165889278462461)\n", + "ic| phrase: Phrase(text='ranking webpages', chunks=[ranking webpages], count=1, rank=0.08457790386936588)\n", + "ic| phrase: Phrase(text='algorithm', chunks=[algorithm], count=1, rank=0.08300479194058319)\n", + "ic| phrase: Phrase(text='every other sentence', chunks=[every other sentence], count=1, rank=0.08179233228776425)\n", + "ic| phrase: Phrase(text='Okapi BM25 function', chunks=[Okapi BM25 function], count=1, rank=0.07919192237459494)\n", + "ic| phrase: Phrase(text='original TextRank', chunks=[original TextRank], count=1, rank=0.07346227481329015)\n", + "ic| phrase: Phrase(text='TextRank', chunks=[TextRank], count=1, rank=0.07058237377923389)\n" ] } ], @@ -722,7 +725,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "-rw-r--r-- 1 paco staff 17K Mar 6 14:39 lemma_graph.dot\n" + "-rw-rw-r-- 1 ankushchander ankushchander 17K Jun 4 11:56 lemma_graph.dot\r\n" ] } ], @@ -742,7 +745,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "Requirement already satisfied: graphviz in /Users/paco/src/pytextrank/venv/lib/python3.7/site-packages (0.17)\n" + "Requirement already satisfied: graphviz in /home/ankushchander/workplace/.virtualenv/pytextrank/lib/python3.8/site-packages (0.19.1)\r\n" ] } ], @@ -778,1656 +781,1621 @@ "\n", "\n", - "\n", - "\n", - "\n", - "\n", - "\n", + "\n", + "\n", + "\n", + "%3\n", + "\n", "\n", "\n", "('quick', 'ADJ')\n", - "\n", - "('quick', 'ADJ') (0.0041)\n", + "\n", + "('quick', 'ADJ') (0.0044)\n", "\n", "\n", "\n", "('description', 'NOUN')\n", - "\n", - "('description', 'NOUN') (0.0053)\n", + "\n", + "('description', 'NOUN') (0.0056)\n", "\n", "\n", "\n", "('quick', 'ADJ')->('description', 'NOUN')\n", - "\n", - "\n", + "\n", + "\n", "\n", "\n", "\n", "('popular', 'ADJ')\n", - "\n", - "('popular', 'ADJ') (0.0095)\n", + "\n", + "('popular', 'ADJ') (0.0100)\n", "\n", "\n", "\n", "('quick', 'ADJ')->('popular', 'ADJ')\n", - "\n", - "\n", + "\n", + "\n", "\n", "\n", "\n", "('algorithm', 'NOUN')\n", - "\n", - "('algorithm', 'NOUN') (0.0114)\n", + "\n", + "('algorithm', 'NOUN') (0.0276)\n", "\n", "\n", "\n", "('quick', 'ADJ')->('algorithm', 'NOUN')\n", - "\n", - "\n", + "\n", + "\n", "\n", "\n", "\n", "('description', 'NOUN')->('popular', 'ADJ')\n", - "\n", - "\n", + "\n", + "\n", "\n", "\n", "\n", "('description', 'NOUN')->('algorithm', 'NOUN')\n", - "\n", - "\n", + "\n", + "\n", "\n", "\n", "\n", "('implementation', 'NOUN')\n", - "\n", - "('implementation', 'NOUN') (0.0094)\n", + "\n", + "('implementation', 'NOUN') (0.0097)\n", "\n", "\n", "\n", "('description', 'NOUN')->('implementation', 'NOUN')\n", - "\n", - "\n", + "\n", + "\n", "\n", "\n", "\n", "('popular', 'ADJ')->('algorithm', 'NOUN')\n", - "\n", - "\n", + "\n", + "\n", "\n", "\n", "\n", "('popular', 'ADJ')->('implementation', 'NOUN')\n", - "\n", - "\n", + "\n", + "\n", "\n", "\n", "\n", "('text', 'NOUN')\n", - "\n", - "('text', 'NOUN') (0.0151)\n", + "\n", + "('text', 'NOUN') (0.0157)\n", "\n", "\n", "\n", "('popular', 'ADJ')->('text', 'NOUN')\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "('algorithm', 'PROPN')\n", - "\n", - "('algorithm', 'PROPN') (0.0193)\n", - "\n", - "\n", - "\n", - "('popular', 'ADJ')->('algorithm', 'PROPN')\n", - "\n", - "\n", + "\n", + "\n", "\n", "\n", - "\n", + "\n", "('PageRank', 'PROPN')\n", - "\n", - "('PageRank', 'PROPN') (0.0149)\n", + "\n", + "('PageRank', 'PROPN') (0.0157)\n", "\n", "\n", "\n", "('popular', 'ADJ')->('PageRank', 'PROPN')\n", - "\n", - "\n", + "\n", + "\n", "\n", "\n", - "\n", + "\n", "('Google', 'PROPN')\n", - "\n", - "('Google', 'PROPN') (0.0103)\n", + "\n", + "('Google', 'PROPN') (0.0103)\n", "\n", "\n", - "\n", + "\n", "('popular', 'ADJ')->('Google', 'PROPN')\n", - "\n", - "\n", + "\n", + "\n", "\n", "\n", - "\n", + "\n", "('algorithm', 'NOUN')->('implementation', 'NOUN')\n", - "\n", - "\n", + "\n", + "\n", "\n", "\n", "\n", "('algorithm', 'NOUN')->('text', 'NOUN')\n", - "\n", - "\n", + "\n", + "\n", "\n", "\n", "\n", "('summarization', 'NOUN')\n", - "\n", - "('summarization', 'NOUN') (0.0214)\n", + "\n", + "('summarization', 'NOUN') (0.0221)\n", "\n", "\n", - "\n", + "\n", "('algorithm', 'NOUN')->('summarization', 'NOUN')\n", - "\n", - "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "('base', 'VERB')\n", + "\n", + "('base', 'VERB') (0.0124)\n", + "\n", + "\n", + "\n", + "('algorithm', 'NOUN')->('base', 'VERB')\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "('weight', 'VERB')\n", + "\n", + "('weight', 'VERB') (0.0144)\n", + "\n", + "\n", + "\n", + "('algorithm', 'NOUN')->('weight', 'VERB')\n", + "\n", + "\n", "\n", "\n", - "\n", + "\n", "('graph', 'NOUN')\n", - "\n", - "('graph', 'NOUN') (0.0110)\n", + "\n", + "('graph', 'NOUN') (0.0213)\n", "\n", "\n", - "\n", + "\n", "('algorithm', 'NOUN')->('graph', 'NOUN')\n", - "\n", - "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "('algorithm', 'NOUN')->('Google', 'PROPN')\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "('use', 'VERB')\n", + "\n", + "('use', 'VERB') (0.0146)\n", + "\n", + "\n", + "\n", + "('algorithm', 'NOUN')->('use', 'VERB')\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "('rank', 'VERB')\n", + "\n", + "('rank', 'VERB') (0.0121)\n", + "\n", + "\n", + "\n", + "('algorithm', 'NOUN')->('rank', 'VERB')\n", + "\n", + "\n", "\n", "\n", - "\n", + "\n", "('implementation', 'NOUN')->('text', 'NOUN')\n", - "\n", - "\n", + "\n", + "\n", "\n", "\n", - "\n", + "\n", "('implementation', 'NOUN')->('summarization', 'NOUN')\n", - "\n", - "\n", + "\n", + "\n", "\n", "\n", "\n", "('exist', 'VERB')\n", - "\n", - "('exist', 'VERB') (0.0120)\n", + "\n", + "('exist', 'VERB') (0.0125)\n", "\n", "\n", - "\n", + "\n", "('implementation', 'NOUN')->('exist', 'VERB')\n", - "\n", - "\n", + "\n", + "\n", "\n", "\n", - "\n", + "\n", "('text', 'NOUN')->('summarization', 'NOUN')\n", - "\n", - "\n", + "\n", + "\n", "\n", "\n", - "\n", + "\n", "('text', 'NOUN')->('exist', 'VERB')\n", - "\n", - "\n", + "\n", + "\n", "\n", "\n", "\n", "('today', 'NOUN')\n", - "\n", - "('today', 'NOUN') (0.0127)\n", + "\n", + "('today', 'NOUN') (0.0132)\n", "\n", "\n", - "\n", + "\n", "('text', 'NOUN')->('today', 'NOUN')\n", - "\n", - "\n", + "\n", + "\n", "\n", "\n", - "\n", + "\n", "('remove', 'VERB')\n", - "\n", - "('remove', 'VERB') (0.0097)\n", + "\n", + "('remove', 'VERB') (0.0102)\n", "\n", "\n", - "\n", + "\n", "('text', 'NOUN')->('remove', 'VERB')\n", - "\n", - "\n", + "\n", + "\n", "\n", "\n", - "\n", + "\n", "('stop', 'VERB')\n", - "\n", - "('stop', 'VERB') (0.0109)\n", + "\n", + "('stop', 'VERB') (0.0115)\n", "\n", "\n", - "\n", + "\n", "('text', 'NOUN')->('stop', 'VERB')\n", - "\n", - "\n", + "\n", + "\n", "\n", "\n", - "\n", + "\n", "('stem', 'VERB')\n", - "\n", - "('stem', 'VERB') (0.0137)\n", + "\n", + "('stem', 'VERB') (0.0144)\n", "\n", "\n", - "\n", + "\n", "('text', 'NOUN')->('stem', 'VERB')\n", - "\n", - "\n", + "\n", + "\n", "\n", "\n", - "\n", + "\n", "('summarization', 'NOUN')->('summarization', 'NOUN')\n", - "\n", - "\n", + "\n", + "\n", "\n", "\n", - "\n", + "\n", "('summarization', 'NOUN')->('exist', 'VERB')\n", - "\n", - "\n", + "\n", + "\n", "\n", "\n", - "\n", + "\n", "('summarization', 'NOUN')->('today', 'NOUN')\n", - "\n", - "\n", + "\n", + "\n", "\n", "\n", "\n", "('module', 'NOUN')\n", - "\n", - "('module', 'NOUN') (0.0141)\n", + "\n", + "('module', 'NOUN') (0.0148)\n", "\n", "\n", - "\n", + "\n", "('summarization', 'NOUN')->('module', 'NOUN')\n", - "\n", - "\n", + "\n", + "\n", "\n", "\n", "\n", "('gensim', 'NOUN')\n", - "\n", - "('gensim', 'NOUN') (0.0147)\n", + "\n", + "('gensim', 'NOUN') (0.0154)\n", "\n", "\n", - "\n", + "\n", "('summarization', 'NOUN')->('gensim', 'NOUN')\n", - "\n", - "\n", + "\n", + "\n", "\n", "\n", "\n", "('implement', 'NOUN')\n", - "\n", - "('implement', 'NOUN') (0.0153)\n", + "\n", + "('implement', 'NOUN') (0.0160)\n", "\n", "\n", - "\n", + "\n", "('summarization', 'NOUN')->('implement', 'NOUN')\n", - "\n", - "\n", + "\n", + "\n", "\n", "\n", - "\n", + "\n", "('exist', 'VERB')->('summarization', 'NOUN')\n", - "\n", - "\n", + "\n", + "\n", "\n", "\n", - "\n", + "\n", "('exist', 'VERB')->('today', 'NOUN')\n", - "\n", - "\n", + "\n", + "\n", "\n", "\n", - "\n", + "\n", "('exist', 'VERB')->('module', 'NOUN')\n", - "\n", - "\n", + "\n", + "\n", "\n", "\n", - "\n", + "\n", "('today', 'NOUN')->('summarization', 'NOUN')\n", - "\n", - "\n", + "\n", + "\n", "\n", "\n", - "\n", + "\n", "('today', 'NOUN')->('module', 'NOUN')\n", - "\n", - "\n", + "\n", + "\n", "\n", "\n", - "\n", + "\n", "('today', 'NOUN')->('gensim', 'NOUN')\n", - "\n", - "\n", + "\n", + "\n", "\n", "\n", - "\n", + "\n", "('module', 'NOUN')->('gensim', 'NOUN')\n", - "\n", - "\n", + "\n", + "\n", "\n", "\n", - "\n", + "\n", "('module', 'NOUN')->('implement', 'NOUN')\n", - "\n", - "\n", + "\n", + "\n", "\n", "\n", "\n", "('TextRank', 'PROPN')\n", - "\n", - "('TextRank', 'PROPN') (0.0190)\n", + "\n", + "('TextRank', 'PROPN') (0.0199)\n", "\n", "\n", - "\n", + "\n", "('module', 'NOUN')->('TextRank', 'PROPN')\n", - "\n", - "\n", + "\n", + "\n", "\n", "\n", - "\n", + "\n", "('gensim', 'NOUN')->('implement', 'NOUN')\n", - "\n", - "\n", + "\n", + "\n", "\n", "\n", - "\n", + "\n", "('gensim', 'NOUN')->('TextRank', 'PROPN')\n", - "\n", - "\n", + "\n", + "\n", "\n", "\n", "\n", "('unsupervised', 'ADJ')\n", - "\n", - "('unsupervised', 'ADJ') (0.0144)\n", + "\n", + "('unsupervised', 'ADJ') (0.0148)\n", "\n", "\n", - "\n", + "\n", "('gensim', 'NOUN')->('unsupervised', 'ADJ')\n", - "\n", - "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "('implement', 'NOUN')->('algorithm', 'NOUN')\n", + "\n", + "\n", "\n", "\n", - "\n", + "\n", "('implement', 'NOUN')->('TextRank', 'PROPN')\n", - "\n", - "\n", + "\n", + "\n", "\n", "\n", - "\n", + "\n", "('implement', 'NOUN')->('unsupervised', 'ADJ')\n", - "\n", - "\n", + "\n", + "\n", "\n", - "\n", - "\n", - "('implement', 'NOUN')->('algorithm', 'PROPN')\n", - "\n", - "\n", + "\n", + "\n", + "('TextRank', 'PROPN')->('algorithm', 'NOUN')\n", + "\n", + "\n", "\n", "\n", - "\n", + "\n", "('TextRank', 'PROPN')->('unsupervised', 'ADJ')\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "('TextRank', 'PROPN')->('algorithm', 'PROPN')\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "('base', 'VERB')\n", - "\n", - "('base', 'VERB') (0.0127)\n", + "\n", + "\n", "\n", "\n", - "\n", + "\n", "('TextRank', 'PROPN')->('base', 'VERB')\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "('use', 'VERB')\n", - "\n", - "('use', 'VERB') (0.0148)\n", + "\n", + "\n", "\n", "\n", - "\n", + "\n", "('TextRank', 'PROPN')->('use', 'VERB')\n", - "\n", - "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "('work', 'VERB')\n", + "\n", + "('work', 'VERB') (0.0059)\n", + "\n", + "\n", + "\n", + "('TextRank', 'PROPN')->('work', 'VERB')\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "('follow', 'VERB')\n", + "\n", + "('follow', 'VERB') (0.0109)\n", + "\n", + "\n", + "\n", + "('TextRank', 'PROPN')->('follow', 'VERB')\n", + "\n", + "\n", "\n", "\n", - "\n", + "\n", "('sentence', 'NOUN')\n", - "\n", - "('sentence', 'NOUN') (0.0830)\n", + "\n", + "('sentence', 'NOUN') (0.0889)\n", "\n", "\n", - "\n", + "\n", "('TextRank', 'PROPN')->('sentence', 'NOUN')\n", - "\n", - "\n", + "\n", + "\n", "\n", "\n", - "\n", + "\n", "('edge', 'NOUN')\n", - "\n", - "('edge', 'NOUN') (0.0399)\n", + "\n", + "('edge', 'NOUN') (0.0421)\n", "\n", "\n", - "\n", + "\n", "('TextRank', 'PROPN')->('edge', 'NOUN')\n", - "\n", - "\n", + "\n", + "\n", "\n", "\n", - "\n", + "\n", "('weight', 'NOUN')\n", - "\n", - "('weight', 'NOUN') (0.0071)\n", + "\n", + "('weight', 'NOUN') (0.0071)\n", "\n", "\n", - "\n", + "\n", "('TextRank', 'PROPN')->('weight', 'NOUN')\n", - "\n", - "\n", + "\n", + "\n", "\n", "\n", - "\n", + "\n", "('Okapi', 'PROPN')\n", - "\n", - "('Okapi', 'PROPN') (0.0096)\n", + "\n", + "('Okapi', 'PROPN') (0.0096)\n", "\n", "\n", - "\n", + "\n", "('TextRank', 'PROPN')->('Okapi', 'PROPN')\n", - "\n", - "\n", + "\n", + "\n", "\n", "\n", - "\n", + "\n", "('BM25', 'PROPN')\n", - "\n", - "('BM25', 'PROPN') (0.0112)\n", + "\n", + "('BM25', 'PROPN') (0.0111)\n", "\n", "\n", - "\n", + "\n", "('TextRank', 'PROPN')->('BM25', 'PROPN')\n", - "\n", - "\n", + "\n", + "\n", "\n", - "\n", - "\n", - "('unsupervised', 'ADJ')->('algorithm', 'PROPN')\n", - "\n", - "\n", + "\n", + "\n", + "('unsupervised', 'ADJ')->('algorithm', 'NOUN')\n", + "\n", + "\n", "\n", "\n", - "\n", + "\n", "('unsupervised', 'ADJ')->('base', 'VERB')\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "('weight', 'VERB')\n", - "\n", - "('weight', 'VERB') (0.0146)\n", + "\n", + "\n", "\n", "\n", - "\n", - "('unsupervised', 'ADJ')->('weight', 'VERB')\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "('algorithm', 'PROPN')->('base', 'VERB')\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "('algorithm', 'PROPN')->('weight', 'VERB')\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "('graphs', 'NOUN')\n", - "\n", - "('graphs', 'NOUN') (0.0146)\n", - "\n", - "\n", - "\n", - "('algorithm', 'PROPN')->('graphs', 'NOUN')\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "('algorithm', 'PROPN')->('Google', 'PROPN')\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "('algorithm', 'PROPN')->('use', 'VERB')\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "('rank', 'VERB')\n", - "\n", - "('rank', 'VERB') (0.0123)\n", - "\n", - "\n", "\n", - "('algorithm', 'PROPN')->('rank', 'VERB')\n", - "\n", - "\n", + "('unsupervised', 'ADJ')->('weight', 'VERB')\n", + "\n", + "\n", "\n", "\n", "\n", "('base', 'VERB')->('weight', 'VERB')\n", - "\n", - "\n", + "\n", + "\n", "\n", - "\n", + "\n", "\n", - "('base', 'VERB')->('graphs', 'NOUN')\n", - "\n", - "\n", + "('base', 'VERB')->('graph', 'NOUN')\n", + "\n", + "\n", "\n", "\n", - "\n", + "\n", "('paper', 'NOUN')\n", - "\n", - "('paper', 'NOUN') (0.0172)\n", + "\n", + "('paper', 'NOUN') (0.0162)\n", "\n", "\n", "\n", "('base', 'VERB')->('paper', 'NOUN')\n", - "\n", - "\n", + "\n", + "\n", "\n", - "\n", + "\n", "\n", - "('weight', 'VERB')->('graphs', 'NOUN')\n", - "\n", - "\n", + "('weight', 'VERB')->('graph', 'NOUN')\n", + "\n", + "\n", "\n", "\n", "\n", "('weight', 'VERB')->('paper', 'NOUN')\n", - "\n", - "\n", + "\n", + "\n", "\n", "\n", - "\n", + "\n", "('Mihalcea', 'PROPN')\n", - "\n", - "('Mihalcea', 'PROPN') (0.0148)\n", + "\n", + "('Mihalcea', 'PROPN') (0.0138)\n", "\n", "\n", "\n", "('weight', 'VERB')->('Mihalcea', 'PROPN')\n", - "\n", - "\n", + "\n", + "\n", "\n", - "\n", + "\n", "\n", - "('graphs', 'NOUN')->('paper', 'NOUN')\n", - "\n", - "\n", + "('graph', 'NOUN')->('paper', 'NOUN')\n", + "\n", + "\n", "\n", - "\n", + "\n", "\n", - "('graphs', 'NOUN')->('Mihalcea', 'PROPN')\n", - "\n", - "\n", + "('graph', 'NOUN')->('Mihalcea', 'PROPN')\n", + "\n", + "\n", "\n", "\n", - "\n", + "\n", "('et', 'PROPN')\n", - "\n", - "('et', 'PROPN') (0.0228)\n", + "\n", + "('et', 'PROPN') (0.0224)\n", "\n", - "\n", + "\n", "\n", - "('graphs', 'NOUN')->('et', 'PROPN')\n", - "\n", - "\n", + "('graph', 'NOUN')->('et', 'PROPN')\n", + "\n", + "\n", "\n", - "\n", + "\n", + "\n", + "('vertex', 'NOUN')\n", + "\n", + "('vertex', 'NOUN') (0.0086)\n", + "\n", + "\n", "\n", + "('graph', 'NOUN')->('vertex', 'NOUN')\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "('be', 'VERB')\n", + "\n", + "('be', 'VERB') (0.0123)\n", + "\n", + "\n", + "\n", + "('graph', 'NOUN')->('be', 'VERB')\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "('graph', 'NOUN')->('sentence', 'NOUN')\n", + "\n", + "\n", + "\n", + "\n", + "\n", "('paper', 'NOUN')->('Mihalcea', 'PROPN')\n", - "\n", - "\n", + "\n", + "\n", "\n", "\n", - "\n", + "\n", "('paper', 'NOUN')->('et', 'PROPN')\n", - "\n", - "\n", + "\n", + "\n", "\n", "\n", - "\n", + "\n", "('al', 'PROPN')\n", - "\n", - "('al', 'PROPN') (0.0304)\n", + "\n", + "('al', 'PROPN') (0.0372)\n", "\n", "\n", - "\n", + "\n", "('paper', 'NOUN')->('al', 'PROPN')\n", - "\n", - "\n", + "\n", + "\n", "\n", "\n", - "\n", + "\n", "('Barrios', 'PROPN')\n", - "\n", - "('Barrios', 'PROPN') (0.0077)\n", + "\n", + "('Barrios', 'PROPN') (0.0079)\n", "\n", "\n", - "\n", + "\n", "('paper', 'NOUN')->('Barrios', 'PROPN')\n", - "\n", - "\n", + "\n", + "\n", "\n", "\n", - "\n", + "\n", "('Mihalcea', 'PROPN')->('et', 'PROPN')\n", - "\n", - "\n", + "\n", + "\n", "\n", "\n", - "\n", + "\n", "('Mihalcea', 'PROPN')->('al', 'PROPN')\n", - "\n", - "\n", + "\n", + "\n", "\n", "\n", - "\n", + "\n", "('et', 'PROPN')->('al', 'PROPN')\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "('.', 'PROPN')\n", - "\n", - "('.', 'PROPN') (0.0386)\n", - "\n", - "\n", - "\n", - "('et', 'PROPN')->('.', 'PROPN')\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "('al', 'PROPN')->('.', 'PROPN')\n", - "\n", - "\n", + "\n", + "\n", "\n", "\n", - "\n", + "\n", "('add', 'VERB')\n", - "\n", - "('add', 'VERB') (0.0041)\n", + "\n", + "('add', 'VERB') (0.0044)\n", "\n", "\n", - "\n", + "\n", "('incubator', 'NOUN')\n", - "\n", - "('incubator', 'NOUN') (0.0053)\n", + "\n", + "('incubator', 'NOUN') (0.0056)\n", "\n", "\n", - "\n", + "\n", "('add', 'VERB')->('incubator', 'NOUN')\n", - "\n", - "\n", + "\n", + "\n", "\n", "\n", - "\n", + "\n", "('student', 'NOUN')\n", - "\n", - "('student', 'NOUN') (0.0068)\n", + "\n", + "('student', 'NOUN') (0.0072)\n", "\n", "\n", - "\n", + "\n", "('add', 'VERB')->('student', 'NOUN')\n", - "\n", - "\n", + "\n", + "\n", "\n", "\n", - "\n", + "\n", "('Olavur', 'PROPN')\n", - "\n", - "('Olavur', 'PROPN') (0.0087)\n", + "\n", + "('Olavur', 'PROPN') (0.0092)\n", "\n", "\n", - "\n", + "\n", "('add', 'VERB')->('Olavur', 'PROPN')\n", - "\n", - "\n", + "\n", + "\n", "\n", "\n", - "\n", + "\n", "('incubator', 'NOUN')->('student', 'NOUN')\n", - "\n", - "\n", + "\n", + "\n", "\n", "\n", - "\n", + "\n", "('incubator', 'NOUN')->('Olavur', 'PROPN')\n", - "\n", - "\n", + "\n", + "\n", "\n", "\n", - "\n", + "\n", "('Mortensen', 'PROPN')\n", - "\n", - "('Mortensen', 'PROPN') (0.0100)\n", + "\n", + "('Mortensen', 'PROPN') (0.0106)\n", "\n", "\n", - "\n", + "\n", "('incubator', 'NOUN')->('Mortensen', 'PROPN')\n", - "\n", - "\n", + "\n", + "\n", "\n", "\n", - "\n", + "\n", "('student', 'NOUN')->('Olavur', 'PROPN')\n", - "\n", - "\n", + "\n", + "\n", "\n", "\n", - "\n", + "\n", "('student', 'NOUN')->('Mortensen', 'PROPN')\n", - "\n", - "\n", + "\n", + "\n", "\n", "\n", - "\n", + "\n", "('see', 'VERB')\n", - "\n", - "('see', 'VERB') (0.0208)\n", + "\n", + "('see', 'VERB') (0.0215)\n", "\n", "\n", - "\n", + "\n", "('student', 'NOUN')->('see', 'VERB')\n", - "\n", - "\n", + "\n", + "\n", "\n", "\n", - "\n", + "\n", "('Olavur', 'PROPN')->('Mortensen', 'PROPN')\n", - "\n", - "\n", + "\n", + "\n", "\n", "\n", - "\n", + "\n", "('Olavur', 'PROPN')->('see', 'VERB')\n", - "\n", - "\n", + "\n", + "\n", "\n", "\n", - "\n", + "\n", "('previous', 'ADJ')\n", - "\n", - "('previous', 'ADJ') (0.0130)\n", + "\n", + "('previous', 'ADJ') (0.0136)\n", "\n", "\n", - "\n", + "\n", "('Olavur', 'PROPN')->('previous', 'ADJ')\n", - "\n", - "\n", + "\n", + "\n", "\n", "\n", - "\n", + "\n", "('Mortensen', 'PROPN')->('see', 'VERB')\n", - "\n", - "\n", + "\n", + "\n", "\n", "\n", - "\n", + "\n", "('Mortensen', 'PROPN')->('previous', 'ADJ')\n", - "\n", - "\n", + "\n", + "\n", "\n", "\n", - "\n", + "\n", "('post', 'NOUN')\n", - "\n", - "('post', 'NOUN') (0.0160)\n", + "\n", + "('post', 'NOUN') (0.0168)\n", "\n", "\n", - "\n", + "\n", "('Mortensen', 'PROPN')->('post', 'NOUN')\n", - "\n", - "\n", + "\n", + "\n", "\n", "\n", - "\n", + "\n", "('see', 'VERB')->('previous', 'ADJ')\n", - "\n", - "\n", + "\n", + "\n", "\n", "\n", - "\n", + "\n", "('see', 'VERB')->('post', 'NOUN')\n", - "\n", - "\n", + "\n", + "\n", "\n", "\n", - "\n", + "\n", "('blog', 'NOUN')\n", - "\n", - "('blog', 'NOUN') (0.0268)\n", + "\n", + "('blog', 'NOUN') (0.0281)\n", "\n", "\n", - "\n", + "\n", "('see', 'VERB')->('blog', 'NOUN')\n", - "\n", - "\n", + "\n", + "\n", "\n", "\n", - "\n", + "\n", "('see', 'VERB')->('sentence', 'NOUN')\n", - "\n", - "\n", + "\n", + "\n", "\n", "\n", - "\n", + "\n", "('similar', 'ADJ')\n", - "\n", - "('similar', 'ADJ') (0.0221)\n", + "\n", + "('similar', 'ADJ') (0.0229)\n", "\n", "\n", - "\n", + "\n", "('see', 'VERB')->('similar', 'ADJ')\n", - "\n", - "\n", + "\n", + "\n", "\n", "\n", - "\n", + "\n", "('previous', 'ADJ')->('post', 'NOUN')\n", - "\n", - "\n", + "\n", + "\n", "\n", "\n", - "\n", + "\n", "('previous', 'ADJ')->('blog', 'NOUN')\n", - "\n", - "\n", + "\n", + "\n", "\n", "\n", - "\n", + "\n", "('post', 'NOUN')->('blog', 'NOUN')\n", - "\n", - "\n", + "\n", + "\n", "\n", "\n", - "\n", + "\n", "('build', 'VERB')\n", - "\n", - "('build', 'VERB') (0.0041)\n", + "\n", + "('build', 'VERB') (0.0044)\n", "\n", "\n", - "\n", + "\n", "('build', 'VERB')->('popular', 'ADJ')\n", - "\n", - "\n", + "\n", + "\n", "\n", "\n", - "\n", + "\n", "('top', 'NOUN')\n", - "\n", - "('top', 'NOUN') (0.0053)\n", + "\n", + "('top', 'NOUN') (0.0056)\n", "\n", "\n", - "\n", + "\n", "('build', 'VERB')->('top', 'NOUN')\n", - "\n", - "\n", + "\n", + "\n", "\n", "\n", - "\n", + "\n", "('build', 'VERB')->('PageRank', 'PROPN')\n", - "\n", - "\n", + "\n", + "\n", "\n", "\n", - "\n", + "\n", "('top', 'NOUN')->('popular', 'ADJ')\n", - "\n", - "\n", + "\n", + "\n", "\n", - "\n", - "\n", - "('top', 'NOUN')->('algorithm', 'PROPN')\n", - "\n", - "\n", + "\n", + "\n", + "('top', 'NOUN')->('algorithm', 'NOUN')\n", + "\n", + "\n", "\n", "\n", - "\n", + "\n", "('top', 'NOUN')->('PageRank', 'PROPN')\n", - "\n", - "\n", + "\n", + "\n", "\n", "\n", - "\n", + "\n", "('PageRank', 'PROPN')->('algorithm', 'NOUN')\n", - "\n", - "\n", + "\n", + "\n", "\n", - "\n", - "\n", - "('PageRank', 'PROPN')->('algorithm', 'PROPN')\n", - "\n", - "\n", + "\n", + "\n", + "('PageRank', 'PROPN')->('graph', 'NOUN')\n", + "\n", + "\n", "\n", "\n", - "\n", + "\n", "('PageRank', 'PROPN')->('Google', 'PROPN')\n", - "\n", - "\n", + "\n", + "\n", "\n", "\n", - "\n", + "\n", "('PageRank', 'PROPN')->('use', 'VERB')\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "('PageRank', 'PROPN')->('graph', 'NOUN')\n", - "\n", - "\n", + "\n", + "\n", "\n", "\n", - "\n", + "\n", "('score', 'NOUN')\n", - "\n", - "('score', 'NOUN') (0.0106)\n", + "\n", + "('score', 'NOUN') (0.0112)\n", "\n", "\n", "\n", "('PageRank', 'PROPN')->('score', 'NOUN')\n", - "\n", - "\n", + "\n", + "\n", "\n", "\n", "\n", "('Google', 'PROPN')->('use', 'VERB')\n", - "\n", - "\n", + "\n", + "\n", "\n", "\n", "\n", "('Google', 'PROPN')->('rank', 'VERB')\n", - "\n", - "\n", + "\n", + "\n", "\n", "\n", - "\n", + "\n", "('webpage', 'NOUN')\n", - "\n", - "('webpage', 'NOUN') (0.0200)\n", + "\n", + "('webpage', 'NOUN') (0.0201)\n", "\n", "\n", "\n", "('Google', 'PROPN')->('webpage', 'NOUN')\n", - "\n", - "\n", + "\n", + "\n", "\n", "\n", "\n", "('use', 'VERB')->('rank', 'VERB')\n", - "\n", - "\n", + "\n", + "\n", "\n", "\n", "\n", "('use', 'VERB')->('webpage', 'NOUN')\n", - "\n", - "\n", + "\n", + "\n", "\n", "\n", "\n", "('use', 'VERB')->('Okapi', 'PROPN')\n", - "\n", - "\n", + "\n", + "\n", "\n", "\n", "\n", "('use', 'VERB')->('BM25', 'PROPN')\n", - "\n", - "\n", + "\n", + "\n", "\n", "\n", - "\n", + "\n", "('function', 'NOUN')\n", - "\n", - "('function', 'NOUN') (0.0125)\n", + "\n", + "('function', 'NOUN') (0.0127)\n", "\n", "\n", "\n", "('use', 'VERB')->('function', 'NOUN')\n", - "\n", - "\n", + "\n", + "\n", "\n", "\n", "\n", "('rank', 'VERB')->('webpage', 'NOUN')\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "('textrank', 'NOUN')\n", - "\n", - "('textrank', 'NOUN') (0.0041)\n", - "\n", - "\n", - "\n", - "('work', 'VERB')\n", - "\n", - "('work', 'VERB') (0.0059)\n", - "\n", - "\n", - "\n", - "('textrank', 'NOUN')->('work', 'VERB')\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "('follow', 'VERB')\n", - "\n", - "('follow', 'VERB') (0.0109)\n", - "\n", - "\n", - "\n", - "('textrank', 'NOUN')->('follow', 'VERB')\n", - "\n", - "\n", + "\n", + "\n", "\n", "\n", - "\n", + "\n", "('work', 'VERB')->('follow', 'VERB')\n", - "\n", - "\n", + "\n", + "\n", "\n", - "\n", - "\n", - "('pre', 'NOUN')\n", - "\n", - "('pre', 'NOUN') (0.0041)\n", + "\n", + "\n", + "('Pre', 'NOUN')\n", + "\n", + "('Pre', 'NOUN') (0.0044)\n", "\n", - "\n", - "\n", - "('pre', 'NOUN')->('text', 'NOUN')\n", - "\n", - "\n", + "\n", + "\n", + "('Pre', 'NOUN')->('text', 'NOUN')\n", + "\n", + "\n", "\n", "\n", - "\n", + "\n", "('-', 'NOUN')\n", - "\n", - "('-', 'NOUN') (0.0053)\n", + "\n", + "('-', 'NOUN') (0.0056)\n", "\n", - "\n", - "\n", - "('pre', 'NOUN')->('-', 'NOUN')\n", - "\n", - "\n", + "\n", + "\n", + "('Pre', 'NOUN')->('-', 'NOUN')\n", + "\n", + "\n", "\n", "\n", - "\n", + "\n", "('process', 'VERB')\n", - "\n", - "('process', 'VERB') (0.0068)\n", + "\n", + "('process', 'VERB') (0.0072)\n", "\n", - "\n", - "\n", - "('pre', 'NOUN')->('process', 'VERB')\n", - "\n", - "\n", + "\n", + "\n", + "('Pre', 'NOUN')->('process', 'VERB')\n", + "\n", + "\n", "\n", "\n", - "\n", + "\n", "('-', 'NOUN')->('text', 'NOUN')\n", - "\n", - "\n", + "\n", + "\n", "\n", "\n", - "\n", + "\n", "('-', 'NOUN')->('process', 'VERB')\n", - "\n", - "\n", + "\n", + "\n", "\n", "\n", - "\n", + "\n", "('-', 'NOUN')->('remove', 'VERB')\n", - "\n", - "\n", + "\n", + "\n", "\n", "\n", - "\n", + "\n", "('process', 'VERB')->('text', 'NOUN')\n", - "\n", - "\n", + "\n", + "\n", "\n", "\n", - "\n", + "\n", "('process', 'VERB')->('remove', 'VERB')\n", - "\n", - "\n", + "\n", + "\n", "\n", "\n", - "\n", + "\n", "('process', 'VERB')->('stop', 'VERB')\n", - "\n", - "\n", + "\n", + "\n", "\n", "\n", - "\n", + "\n", "('remove', 'VERB')->('stop', 'VERB')\n", - "\n", - "\n", + "\n", + "\n", "\n", "\n", - "\n", + "\n", "('remove', 'VERB')->('stem', 'VERB')\n", - "\n", - "\n", + "\n", + "\n", "\n", "\n", - "\n", + "\n", "('remain', 'VERB')\n", - "\n", - "('remain', 'VERB') (0.0231)\n", + "\n", + "('remain', 'VERB') (0.0243)\n", "\n", "\n", - "\n", + "\n", "('remove', 'VERB')->('remain', 'VERB')\n", - "\n", - "\n", + "\n", + "\n", "\n", "\n", - "\n", + "\n", "('stop', 'VERB')->('stem', 'VERB')\n", - "\n", - "\n", + "\n", + "\n", "\n", "\n", - "\n", + "\n", "('stop', 'VERB')->('remain', 'VERB')\n", - "\n", - "\n", + "\n", + "\n", "\n", "\n", - "\n", + "\n", "('stem', 'VERB')->('remain', 'VERB')\n", - "\n", - "\n", + "\n", + "\n", "\n", "\n", - "\n", + "\n", "('create', 'VERB')\n", - "\n", - "('create', 'VERB') (0.0041)\n", + "\n", + "('create', 'VERB') (0.0044)\n", "\n", "\n", - "\n", + "\n", "('create', 'VERB')->('graph', 'NOUN')\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "('vertex', 'NOUN')\n", - "\n", - "('vertex', 'NOUN') (0.0100)\n", + "\n", + "\n", "\n", "\n", - "\n", + "\n", "('create', 'VERB')->('vertex', 'NOUN')\n", - "\n", - "\n", + "\n", + "\n", "\n", - "\n", - "\n", - "('create', 'VERB')->('sentence', 'NOUN')\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "('graph', 'NOUN')->('vertex', 'NOUN')\n", - "\n", - "\n", + "\n", + "\n", + "('create', 'VERB')->('be', 'VERB')\n", + "\n", + "\n", "\n", - "\n", - "\n", - "('graph', 'NOUN')->('sentence', 'NOUN')\n", - "\n", - "\n", + "\n", + "\n", + "('vertex', 'NOUN')->('be', 'VERB')\n", + "\n", + "\n", "\n", "\n", - "\n", + "\n", "('vertex', 'NOUN')->('sentence', 'NOUN')\n", - "\n", - "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "('be', 'VERB')->('sentence', 'NOUN')\n", + "\n", + "\n", "\n", "\n", - "\n", + "\n", "('sentence', 'NOUN')->('sentence', 'NOUN')\n", - "\n", - "\n", + "\n", + "\n", "\n", "\n", - "\n", + "\n", "('other', 'ADJ')\n", - "\n", - "('other', 'ADJ') (0.0171)\n", + "\n", + "('other', 'ADJ') (0.0182)\n", "\n", "\n", - "\n", + "\n", "('sentence', 'NOUN')->('other', 'ADJ')\n", - "\n", - "\n", + "\n", + "\n", "\n", "\n", - "\n", + "\n", "('sentence', 'NOUN')->('edge', 'NOUN')\n", - "\n", - "\n", + "\n", + "\n", "\n", "\n", - "\n", + "\n", "('percentage', 'NOUN')\n", - "\n", - "('percentage', 'NOUN') (0.0237)\n", + "\n", + "('percentage', 'NOUN') (0.0251)\n", "\n", "\n", - "\n", + "\n", "('sentence', 'NOUN')->('percentage', 'NOUN')\n", - "\n", - "\n", + "\n", + "\n", "\n", "\n", - "\n", + "\n", "('appear', 'VERB')\n", - "\n", - "('appear', 'VERB') (0.0428)\n", + "\n", + "('appear', 'VERB') (0.0454)\n", "\n", "\n", - "\n", + "\n", "('sentence', 'NOUN')->('appear', 'VERB')\n", - "\n", - "\n", + "\n", + "\n", "\n", "\n", - "\n", + "\n", "('connect', 'VERB')\n", - "\n", - "('connect', 'VERB') (0.0041)\n", + "\n", + "('connect', 'VERB') (0.0044)\n", "\n", "\n", - "\n", + "\n", "('connect', 'VERB')->('sentence', 'NOUN')\n", - "\n", - "\n", + "\n", + "\n", "\n", "\n", - "\n", + "\n", "('connect', 'VERB')->('other', 'ADJ')\n", - "\n", - "\n", + "\n", + "\n", "\n", "\n", - "\n", + "\n", "('other', 'ADJ')->('sentence', 'NOUN')\n", - "\n", - "\n", + "\n", + "\n", "\n", "\n", - "\n", + "\n", "('other', 'ADJ')->('edge', 'NOUN')\n", - "\n", - "\n", + "\n", + "\n", "\n", "\n", - "\n", + "\n", "('edge', 'NOUN')->('sentence', 'NOUN')\n", - "\n", - "\n", + "\n", + "\n", "\n", "\n", - "\n", + "\n", "('edge', 'NOUN')->('similar', 'ADJ')\n", - "\n", - "\n", + "\n", + "\n", "\n", "\n", - "\n", + "\n", "('edge', 'NOUN')->('percentage', 'NOUN')\n", - "\n", - "\n", + "\n", + "\n", "\n", "\n", - "\n", + "\n", "('edge', 'NOUN')->('appear', 'VERB')\n", - "\n", - "\n", + "\n", + "\n", "\n", "\n", - "\n", + "\n", "('weight', 'NOUN')->('sentence', 'NOUN')\n", - "\n", - "\n", + "\n", + "\n", "\n", "\n", - "\n", + "\n", "('weight', 'NOUN')->('edge', 'NOUN')\n", - "\n", - "\n", + "\n", + "\n", "\n", "\n", - "\n", + "\n", "('weight', 'NOUN')->('similar', 'ADJ')\n", - "\n", - "\n", + "\n", + "\n", "\n", "\n", - "\n", + "\n", "('weight', 'NOUN')->('percentage', 'NOUN')\n", - "\n", - "\n", + "\n", + "\n", "\n", "\n", - "\n", + "\n", "('similar', 'ADJ')->('sentence', 'NOUN')\n", - "\n", - "\n", + "\n", + "\n", "\n", "\n", - "\n", + "\n", "('run', 'VERB')\n", - "\n", - "('run', 'VERB') (0.0041)\n", + "\n", + "('run', 'VERB') (0.0044)\n", "\n", "\n", - "\n", + "\n", "('run', 'VERB')->('algorithm', 'NOUN')\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "('run', 'VERB')->('PageRank', 'PROPN')\n", - "\n", - "\n", + "\n", + "\n", "\n", "\n", - "\n", + "\n", "('run', 'VERB')->('graph', 'NOUN')\n", - "\n", - "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "('run', 'VERB')->('PageRank', 'PROPN')\n", + "\n", + "\n", "\n", "\n", - "\n", + "\n", "('pick', 'VERB')\n", - "\n", - "('pick', 'VERB') (0.0041)\n", + "\n", + "('pick', 'VERB') (0.0044)\n", "\n", "\n", - "\n", + "\n", "('pick', 'VERB')->('PageRank', 'PROPN')\n", - "\n", - "\n", + "\n", + "\n", "\n", - "\n", - "\n", - "('vertices(sentence', 'NOUN')\n", - "\n", - "('vertices(sentence', 'NOUN') (0.0053)\n", + "\n", + "\n", + "('vertices(sentences', 'PROPN')\n", + "\n", + "('vertices(sentences', 'PROPN') (0.0056)\n", "\n", - "\n", - "\n", - "('pick', 'VERB')->('vertices(sentence', 'NOUN')\n", - "\n", - "\n", + "\n", + "\n", + "('pick', 'VERB')->('vertices(sentences', 'PROPN')\n", + "\n", + "\n", "\n", "\n", - "\n", + "\n", "('high', 'ADJ')\n", - "\n", - "('high', 'ADJ') (0.0068)\n", + "\n", + "('high', 'ADJ') (0.0072)\n", "\n", "\n", - "\n", + "\n", "('pick', 'VERB')->('high', 'ADJ')\n", - "\n", - "\n", + "\n", + "\n", "\n", - "\n", - "\n", - "('vertices(sentence', 'NOUN')->('PageRank', 'PROPN')\n", - "\n", - "\n", + "\n", + "\n", + "('vertices(sentences', 'PROPN')->('PageRank', 'PROPN')\n", + "\n", + "\n", "\n", - "\n", - "\n", - "('vertices(sentence', 'NOUN')->('high', 'ADJ')\n", - "\n", - "\n", + "\n", + "\n", + "('vertices(sentences', 'PROPN')->('high', 'ADJ')\n", + "\n", + "\n", "\n", - "\n", - "\n", - "('vertices(sentence', 'NOUN')->('score', 'NOUN')\n", - "\n", - "\n", + "\n", + "\n", + "('vertices(sentences', 'PROPN')->('score', 'NOUN')\n", + "\n", + "\n", "\n", "\n", - "\n", + "\n", "('high', 'ADJ')->('PageRank', 'PROPN')\n", - "\n", - "\n", + "\n", + "\n", "\n", "\n", - "\n", + "\n", "('high', 'ADJ')->('score', 'NOUN')\n", - "\n", - "\n", + "\n", + "\n", "\n", "\n", - "\n", + "\n", "('original', 'ADJ')\n", - "\n", - "('original', 'ADJ') (0.0041)\n", + "\n", + "('original', 'ADJ') (0.0044)\n", "\n", "\n", - "\n", + "\n", "('original', 'ADJ')->('TextRank', 'PROPN')\n", - "\n", - "\n", + "\n", + "\n", "\n", "\n", - "\n", + "\n", "('original', 'ADJ')->('edge', 'NOUN')\n", - "\n", - "\n", + "\n", + "\n", "\n", "\n", - "\n", + "\n", "('original', 'ADJ')->('weight', 'NOUN')\n", - "\n", - "\n", + "\n", + "\n", "\n", "\n", - "\n", + "\n", "('percentage', 'NOUN')->('appear', 'VERB')\n", - "\n", - "\n", + "\n", + "\n", "\n", "\n", - "\n", + "\n", "('Gensim', 'PROPN')\n", - "\n", - "('Gensim', 'PROPN') (0.0041)\n", + "\n", + "('Gensim', 'PROPN') (0.0044)\n", "\n", "\n", - "\n", + "\n", "('Gensim', 'PROPN')->('TextRank', 'PROPN')\n", - "\n", - "\n", + "\n", + "\n", "\n", "\n", - "\n", + "\n", "('Gensim', 'PROPN')->('use', 'VERB')\n", - "\n", - "\n", + "\n", + "\n", "\n", "\n", - "\n", + "\n", "('Gensim', 'PROPN')->('Okapi', 'PROPN')\n", - "\n", - "\n", + "\n", + "\n", "\n", "\n", - "\n", + "\n", "('Okapi', 'PROPN')->('see', 'VERB')\n", - "\n", - "\n", + "\n", + "\n", "\n", "\n", - "\n", + "\n", "('Okapi', 'PROPN')->('BM25', 'PROPN')\n", - "\n", - "\n", + "\n", + "\n", "\n", "\n", - "\n", + "\n", "('Okapi', 'PROPN')->('function', 'NOUN')\n", - "\n", - "\n", + "\n", + "\n", "\n", "\n", - "\n", + "\n", "('BM25', 'PROPN')->('see', 'VERB')\n", - "\n", - "\n", + "\n", + "\n", "\n", "\n", - "\n", + "\n", "('BM25', 'PROPN')->('similar', 'ADJ')\n", - "\n", - "\n", + "\n", + "\n", "\n", "\n", - "\n", + "\n", "('BM25', 'PROPN')->('function', 'NOUN')\n", - "\n", - "\n", + "\n", + "\n", "\n", "\n", - "\n", + "\n", "('function', 'NOUN')->('see', 'VERB')\n", - "\n", - "\n", + "\n", + "\n", "\n", "\n", - "\n", + "\n", "('function', 'NOUN')->('sentence', 'NOUN')\n", - "\n", - "\n", + "\n", + "\n", "\n", "\n", - "\n", + "\n", "('function', 'NOUN')->('similar', 'ADJ')\n", - "\n", - "\n", + "\n", + "\n", "\n", "\n", - "\n", + "\n", "('improvement', 'NOUN')\n", - "\n", - "('improvement', 'NOUN') (0.0041)\n", + "\n", + "('improvement', 'NOUN') (0.0044)\n", "\n", "\n", - "\n", + "\n", "('improvement', 'NOUN')->('paper', 'NOUN')\n", - "\n", - "\n", + "\n", + "\n", "\n", "\n", - "\n", + "\n", "('improvement', 'NOUN')->('et', 'PROPN')\n", - "\n", - "\n", + "\n", + "\n", "\n", "\n", - "\n", + "\n", "('improvement', 'NOUN')->('Barrios', 'PROPN')\n", - "\n", - "\n", + "\n", + "\n", "\n", "\n", - "\n", + "\n", "('Barrios', 'PROPN')->('et', 'PROPN')\n", - "\n", - "\n", + "\n", + "\n", "\n", "\n", - "\n", + "\n", "('Barrios', 'PROPN')->('al', 'PROPN')\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "('Barrios', 'PROPN')->('.', 'PROPN')\n", - "\n", - "\n", + "\n", + "\n", "\n", "\n", "\n" ], "text/plain": [ - "" + "" ] }, "execution_count": 19, @@ -2472,22 +2440,21 @@ "name": "stdout", "output_type": "stream", "text": [ - "Requirement already satisfied: altair in /Users/paco/src/pytextrank/venv/lib/python3.7/site-packages (4.1.0)\n", - "Requirement already satisfied: entrypoints in /Users/paco/src/pytextrank/venv/lib/python3.7/site-packages (from altair) (0.3)\n", - "Requirement already satisfied: jinja2 in /Users/paco/src/pytextrank/venv/lib/python3.7/site-packages (from altair) (3.0.2)\n", - "Requirement already satisfied: toolz in /Users/paco/src/pytextrank/venv/lib/python3.7/site-packages (from altair) (0.11.1)\n", - "Requirement already satisfied: pandas>=0.18 in /Users/paco/src/pytextrank/venv/lib/python3.7/site-packages (from altair) (1.3.3)\n", - "Requirement already satisfied: jsonschema in /Users/paco/src/pytextrank/venv/lib/python3.7/site-packages (from altair) (4.1.0)\n", - "Requirement already satisfied: numpy in /Users/paco/src/pytextrank/venv/lib/python3.7/site-packages (from altair) (1.21.2)\n", - "Requirement already satisfied: python-dateutil>=2.7.3 in /Users/paco/src/pytextrank/venv/lib/python3.7/site-packages (from pandas>=0.18->altair) (2.8.2)\n", - "Requirement already satisfied: pytz>=2017.3 in /Users/paco/src/pytextrank/venv/lib/python3.7/site-packages (from pandas>=0.18->altair) (2021.3)\n", - "Requirement already satisfied: MarkupSafe>=2.0 in /Users/paco/src/pytextrank/venv/lib/python3.7/site-packages (from jinja2->altair) (2.0.1)\n", - "Requirement already satisfied: importlib-metadata in /Users/paco/src/pytextrank/venv/lib/python3.7/site-packages (from jsonschema->altair) (4.8.1)\n", - "Requirement already satisfied: attrs>=17.4.0 in /Users/paco/src/pytextrank/venv/lib/python3.7/site-packages (from jsonschema->altair) (21.2.0)\n", - "Requirement already satisfied: pyrsistent!=0.17.0,!=0.17.1,!=0.17.2,>=0.14.0 in /Users/paco/src/pytextrank/venv/lib/python3.7/site-packages (from jsonschema->altair) (0.18.0)\n", - "Requirement already satisfied: six>=1.5 in /Users/paco/src/pytextrank/venv/lib/python3.7/site-packages (from python-dateutil>=2.7.3->pandas>=0.18->altair) (1.16.0)\n", - "Requirement already satisfied: typing-extensions>=3.6.4 in /Users/paco/src/pytextrank/venv/lib/python3.7/site-packages (from importlib-metadata->jsonschema->altair) (3.10.0.2)\n", - "Requirement already satisfied: zipp>=0.5 in /Users/paco/src/pytextrank/venv/lib/python3.7/site-packages (from importlib-metadata->jsonschema->altair) (3.6.0)\n" + "Requirement already satisfied: altair in /home/ankushchander/workplace/.virtualenv/pytextrank/lib/python3.8/site-packages (4.2.0)\n", + "Requirement already satisfied: jsonschema>=3.0 in /home/ankushchander/workplace/.virtualenv/pytextrank/lib/python3.8/site-packages (from altair) (4.4.0)\n", + "Requirement already satisfied: jinja2 in /home/ankushchander/workplace/.virtualenv/pytextrank/lib/python3.8/site-packages (from altair) (3.0.3)\n", + "Requirement already satisfied: numpy in /home/ankushchander/workplace/.virtualenv/pytextrank/lib/python3.8/site-packages (from altair) (1.22.3)\n", + "Requirement already satisfied: entrypoints in /home/ankushchander/workplace/.virtualenv/pytextrank/lib/python3.8/site-packages (from altair) (0.4)\n", + "Requirement already satisfied: toolz in /home/ankushchander/workplace/.virtualenv/pytextrank/lib/python3.8/site-packages (from altair) (0.11.2)\n", + "Requirement already satisfied: pandas>=0.18 in /home/ankushchander/workplace/.virtualenv/pytextrank/lib/python3.8/site-packages (from altair) (1.4.1)\n", + "Requirement already satisfied: pyrsistent!=0.17.0,!=0.17.1,!=0.17.2,>=0.14.0 in /home/ankushchander/workplace/.virtualenv/pytextrank/lib/python3.8/site-packages (from jsonschema>=3.0->altair) (0.18.1)\n", + "Requirement already satisfied: attrs>=17.4.0 in /home/ankushchander/workplace/.virtualenv/pytextrank/lib/python3.8/site-packages (from jsonschema>=3.0->altair) (21.4.0)\n", + "Requirement already satisfied: importlib-resources>=1.4.0; python_version < \"3.9\" in /home/ankushchander/workplace/.virtualenv/pytextrank/lib/python3.8/site-packages (from jsonschema>=3.0->altair) (5.4.0)\n", + "Requirement already satisfied: MarkupSafe>=2.0 in /home/ankushchander/workplace/.virtualenv/pytextrank/lib/python3.8/site-packages (from jinja2->altair) (2.1.0)\n", + "Requirement already satisfied: pytz>=2020.1 in /home/ankushchander/workplace/.virtualenv/pytextrank/lib/python3.8/site-packages (from pandas>=0.18->altair) (2021.3)\n", + "Requirement already satisfied: python-dateutil>=2.8.1 in /home/ankushchander/workplace/.virtualenv/pytextrank/lib/python3.8/site-packages (from pandas>=0.18->altair) (2.8.2)\n", + "Requirement already satisfied: zipp>=3.1.0; python_version < \"3.10\" in /home/ankushchander/workplace/.virtualenv/pytextrank/lib/python3.8/site-packages (from importlib-resources>=1.4.0; python_version < \"3.9\"->jsonschema>=3.0->altair) (3.7.0)\n", + "Requirement already satisfied: six>=1.5 in /home/ankushchander/workplace/.virtualenv/pytextrank/lib/python3.8/site-packages (from python-dateutil>=2.8.1->pandas>=0.18->altair) (1.16.0)\n" ] } ], @@ -2505,29 +2472,36 @@ "data": { "text/html": [ "\n", - "
\n", + "
\n", "" ], "text/plain": [ @@ -2572,7 +2544,11 @@ { "cell_type": "markdown", "id": "administrative-sleeve", - "metadata": {}, + "metadata": { + "pycharm": { + "name": "#%% md\n" + } + }, "source": [ "## Extractive Summarization\n", "\n", @@ -2583,7 +2559,11 @@ "cell_type": "code", "execution_count": 22, "id": "afraid-retail", - "metadata": {}, + "metadata": { + "pycharm": { + "name": "#%%\n" + } + }, "outputs": [ { "name": "stderr", @@ -2591,9 +2571,9 @@ "text": [ "ic| sent: First, a quick description of some popular algorithms & implementations for text summarization that exist today: the summarization module in gensim implements TextRank, an unsupervised algorithm based on weighted-graphs from a paper by Mihalcea et al.\n", "ic| sent: Gensim’s TextRank uses Okapi BM25 function to see how similar the sentences are.\n", - "ic| sent: Create a graph where vertices are sentences.\n", "ic| sent: It is built on top of the popular PageRank algorithm that Google used for ranking webpages.\n", - "ic| sent: In original TextRank the weights of an edge between two sentences is the percentage of words appearing in both of them.\n" + "ic| sent: Create a graph where vertices are sentences.\n", + "ic| sent: Run the PageRank algorithm on the graph.\n" ] } ], @@ -2605,7 +2585,11 @@ { "cell_type": "markdown", "id": "02cee695-efab-4264-9738-3f5e9f765bfb", - "metadata": {}, + "metadata": { + "pycharm": { + "name": "#%% md\n" + } + }, "source": [ "## Using TopicRank\n", "\n", @@ -2616,7 +2600,11 @@ "cell_type": "code", "execution_count": 23, "id": "7a77525e-9b74-45ba-af71-21aa8f5849f2", - "metadata": {}, + "metadata": { + "pycharm": { + "name": "#%%\n" + } + }, "outputs": [], "source": [ "nlp = spacy.load(\"en_core_web_sm\")\n", @@ -2626,7 +2614,11 @@ { "cell_type": "markdown", "id": "sophisticated-crossing", - "metadata": {}, + "metadata": { + "pycharm": { + "name": "#%% md\n" + } + }, "source": [ "Let's load an example text:" ] @@ -2635,7 +2627,11 @@ "cell_type": "code", "execution_count": 24, "id": "judicial-andrews", - "metadata": {}, + "metadata": { + "pycharm": { + "name": "#%%\n" + } + }, "outputs": [ { "data": { @@ -2657,7 +2653,11 @@ "cell_type": "code", "execution_count": 25, "id": "96ca4cfe-1dc0-4e81-a2e8-a60ff295bf39", - "metadata": {}, + "metadata": { + "pycharm": { + "name": "#%%\n" + } + }, "outputs": [ { "name": "stderr", @@ -2683,6 +2683,99 @@ " ic(phrase)" ] }, + { + "cell_type": "markdown", + "id": "7865c45a", + "metadata": { + "pycharm": { + "name": "#%% md\n" + } + }, + "source": [ + "## Using Biased TextRank\n", + "\n", + "The *Biased TextRank* enhanced algorithm is simple to use in the `spaCy` pipeline and it supports the other features described above:" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "id": "d4d2581b", + "metadata": { + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [], + "source": [ + "nlp = spacy.load(\"en_core_web_sm\")\n", + "nlp.add_pipe(\"biasedtextrank\");" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "id": "a83d4003", + "metadata": { + "pycharm": { + "name": "#%%\n" + }, + "scrolled": true + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "ic| phrase: Phrase(text='Leicester', chunks=[Leicester, Leicester], count=2, rank=0.26184834028994514)\n", + "ic| phrase: Phrase(text='Saturday', chunks=[Saturday, Saturday], count=2, rank=0.13938186779355857)\n", + "ic| phrase: Phrase(text='the last 13 Premier League matches', chunks=[the last 13 Premier League matches], count=1, rank=0.12502820319236171)\n", + "ic| phrase: Phrase(text='none', chunks=[none], count=1, rank=1.9498221604845646e-07)\n", + "ic| phrase: Phrase(text='Moussa Dembele', chunks=[Moussa Dembele, Moussa Dembele], count=2, rank=8.640024414329197e-08)\n", + "ic| phrase: Phrase(text='Dries Mertens', chunks=[Dries Mertens, Dries Mertens], count=2, rank=5.152284728493906e-08)\n", + "ic| phrase: Phrase(text='Edinson Cavani', chunks=[Edinson Cavani], count=1, rank=3.076049036231119e-08)\n", + "ic| phrase: Phrase(text='a new centre', chunks=[a new centre], count=1, rank=2.7737546970070932e-08)\n", + "ic| phrase: Phrase(text='deadline day', chunks=[deadline day, deadline day], count=2, rank=1.3752326412669907e-08)\n", + "ic| phrase: Phrase(text='their long search', chunks=[their long search], count=1, rank=1.1267201943238505e-08)\n" + ] + } + ], + "source": [ + "doc = nlp(text)\n", + "\n", + "focus = \"Leicester\"\n", + "doc._.textrank.change_focus(focus,bias=10.0, default_bias=0.0)\n", + "\n", + "for phrase in doc._.phrases[:10]:\n", + " ic(phrase)" + ] + }, + { + "cell_type": "markdown", + "source": [ + "The top-ranked phrases from *Biased TextRank* are closely related to the \"focus\" item: `Leicester`" + ], + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%% md\n" + } + } + }, + { + "cell_type": "markdown", + "source": [ + "## Using PositionRank\n", + "\n", + "The *PositionRank* enhanced algorithm is simple to use in the `spaCy` pipeline and it supports the other features described above:" + ], + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%% md\n" + } + } + }, { "cell_type": "markdown", "id": "inner-acceptance", @@ -2695,7 +2788,7 @@ }, { "cell_type": "code", - "execution_count": 26, + "execution_count": 28, "id": "stock-millennium", "metadata": {}, "outputs": [], @@ -2706,7 +2799,7 @@ }, { "cell_type": "code", - "execution_count": 27, + "execution_count": 29, "id": "compact-retention", "metadata": { "scrolled": true, @@ -2757,7 +2850,7 @@ }, { "cell_type": "code", - "execution_count": 28, + "execution_count": 30, "id": "unique-domestic", "metadata": { "scrolled": true, @@ -2773,11 +2866,11 @@ "ic| phrase: Phrase(text='none', chunks=[none], count=1, rank=0.09802416183300769)\n", "ic| phrase: Phrase(text='Moussa Dembele', chunks=[Moussa Dembele, Moussa Dembele], count=2, rank=0.09341044332809736)\n", "ic| phrase: Phrase(text='deadline day', chunks=[deadline day, deadline day], count=2, rank=0.09046182507994752)\n", - "ic| phrase: Phrase(text='Dries Mertens', chunks=[Dries Mertens], count=1, rank=0.08919649435994934)\n", - "ic| phrase: Phrase(text='Edinson Cavani', chunks=[Edinson Cavani, Edinson Cavani], count=2, rank=0.08418633972470349)\n", + "ic| phrase: Phrase(text='Dries Mertens', chunks=[Dries Mertens, Dries Mertens], count=2, rank=0.08919649435994934)\n", + "ic| phrase: Phrase(text='Edinson Cavani', chunks=[Edinson Cavani], count=1, rank=0.08418633972470349)\n", + "ic| phrase: Phrase(text='Shanghai Shenhua', chunks=[Shanghai Shenhua], count=1, rank=0.08254442709505862)\n", "ic| phrase: Phrase(text='Salomon Rondón', chunks=[Salomon Rondón, Salomon Rondón], count=2, rank=0.08228367707127111)\n", - "ic| phrase: Phrase(text='Salomón Rondón', chunks=[Salomón Rondón, Salomón Rondón], count=2, rank=0.08228367707127111)\n", - "ic| phrase: Phrase(text='Premier League', chunks=[Premier League], count=1, rank=0.08198820712767878)\n" + "ic| phrase: Phrase(text='Salomón Rondón', chunks=[Salomón Rondón, Salomón Rondón], count=2, rank=0.08228367707127111)\n" ] } ], @@ -2815,7 +2908,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.7.9" + "version": "3.8.10" } }, "nbformat": 4,