-
Notifications
You must be signed in to change notification settings - Fork 0
/
intersection.py
114 lines (90 loc) · 3.73 KB
/
intersection.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
"""
Provides a summary of multi-paragraph text
using an algorithm described at
http://thetokenizer.com/2013/04/28/build-your-own-summary-tool/
Adapted from Shlomi Babluki's code here:
https://gist.github.com/shlomibabluki/5473521
"""
from summarize import SummarizerBase
import re
class IntersectionSummarizer(SummarizerBase):
def sentence_to_set(self, sent):
sent = self.split_content_to_tokens(sent)
sent = self.filter_stopwords(sent)
sent = self.stem(sent)
return set(sent)
def sentences_intersection(self, s1, s2):
# If there is not intersection, just return 0
if (len(s1) + len(s2)) == 0:
return 0
# We normalize the result by the average number of words
return len(s1.intersection(s2)) / ((len(s1) + len(s2)) / 2)
# Format a sentence - remove all non-alphanumeric chars from the sentence
# We'll use the formatted sentence as a key in our sentences dictionary
def format_sentence(self, sentence):
sentence = re.sub(r'\W+', '', sentence)
return sentence
# Convert the content into a dictionary <K, V>
# k = The formatted sentence
# V = The rank of the sentence
def get_sentences_ranks(self, content):
# Split the content into sentences
sentences = self.split_content_to_sentences(content)
# Calculate the intersection of every two sentences
n = len(sentences)
values = [[0 for x in range(n)] for x in range(n)]
# Create sentence sets before intersections for speed.
sentence_sets = [self.sentence_to_set(x) for x in sentences]
for i in range(0, n):
for j in range(0, n):
values[i][j] = self.sentences_intersection(
sentence_sets[i],
sentence_sets[j]
)
# Build the sentences dictionary
# The score of a sentences is the sum of all its intersection
sentences_dic = {}
for i in range(0, n):
score = 0
for j in range(0, n):
if i == j:
continue
score += values[i][j]
sentences_dic[self.format_sentence(sentences[i])] = score
return sentences_dic
# Return the best sentence in a paragraph
def get_best_sentence(self, paragraph, sentences_dic):
# Split the paragraph into sentences
sentences = self.split_content_to_sentences(paragraph)
# Ignore short paragraphs
if len(sentences) < 2:
return ""
# Get the best sentence according to the sentences dictionary
best_sentence = ""
max_value = 0
for s in sentences:
strip_s = self.format_sentence(s)
if strip_s:
if sentences_dic[strip_s] > max_value:
max_value = sentences_dic[strip_s]
best_sentence = s
return best_sentence
def summarize(self, input, max_length=10):
sentences_dic = self.get_sentences_ranks(input)
# Split the content into paragraphs.
paragraphs = self.split_content_to_paragraphs(input)
summary = []
# Add the best sentence from each paragraph.
for p in paragraphs:
sentence = self.get_best_sentence(p, sentences_dic).strip()
if sentence:
summary.append(sentence)
# If the summary is still too long, summarize the summary.
if len(summary) > max_length:
new_content = ''
for i in range(max_length):
n = int(len(summary)/max_length)
new_content += ' '.join(summary[i*n:(i+1)*n])
new_content += '\n\n'
return self.summarize(new_content)
return ("\n").join(summary)