-
Notifications
You must be signed in to change notification settings - Fork 0
/
main_st.py
117 lines (87 loc) · 3.46 KB
/
main_st.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
"""
This script receives two files from console, and compare all sentences in them,
searching for meaning similarity.
For that, it supposes that all "li" objects, have sentences. It recopiles
all sentences, and cross each file using cosine similarity. Finally, it saves
all sentences with results upper a confidence value, in a file.
"""
import sys
import markdown
import io
from bs4 import BeautifulSoup
from sentence_transformers import SentenceTransformer, util
def get_all_sentences(book):
"""
Get all sentences of a book.
"""
f = io.open(book, mode="r", encoding="utf-8")
html_content = markdown.markdown(f.read())
# Converts markdown in html
soup = BeautifulSoup(html_content, "html.parser")
lis = soup.findAll('li')
# Opens HTML content with bs4 and gets all li objects
sentences = list()
for li in lis:
sentences.append(li.text)
# Saves all sentences in a list and returns it
f.close()
return sentences
def compare_sentences_list(model, confidence, sentences_book_1, sentences_book_2):
sentences = sentences_book_1.copy()
sentences.extend(sentences_book_2)
# Join both lists
lista = util.paraphrase_mining(model, sentences)
# Applies model
list1_start = 0
list1_end = len(sentences_book_1)
list2_start = list1_end + 1
list2_end = list2_start + len(sentences_book_2)
# Next part is to discard sentences of the same list
# See explanation on paraphrase_mining docs
# https://www.sbert.net/docs/package_reference/util.html#sentence_transformers.util.paraphrase_mining
old_lista = lista.copy()
for item in old_lista:
cos_sim = item[0]
index1 = item[1]
index2 = item[2]
if index1 >= list1_start and index1 <= list1_end:
if index2 > list1_start and index2 <= list1_end:
# Estan en la misma lista, en la lista1 -> descarto
# Sentences in the same list, list1, discard
lista.pop(lista.index(item))
continue
if index1 >= list2_start and index1 <= list2_end:
if index2 > list2_start and index2 <= list2_end:
# Sentences in the same list, list2, discard
lista.pop(lista.index(item))
continue
# Up to this, I have all sentences of differents datasets, compared
# with cosine similarity
new_sentences_list = list()
for item in lista:
cos_sim = item[0]
if cos_sim > confidence:
new_list = [item[0], sentences_book_1[item[1]], sentences_book_1[item[2] - list1_end]]
new_sentences_list.append(new_list)
# Up to this I have all sentences with a cosine similarity upper
# a certain confidence
return new_sentences_list
def save_new_list(filepath, sentences_list):
"""
Saves similar sentences and cosine similarity metric in a file.
"""
f = io.open(filepath, mode="w", encoding="utf-8")
for sentences in sentences_list:
f.write(f"{sentences[0]}\n{sentences[1]}\n{sentences[2]}\n")
f.close()
def main():
books = sys.argv[1:]
sentences_book_1 = get_all_sentences(books[0])
sentences_book_2 = get_all_sentences(books[1])
model = SentenceTransformer('paraphrase-MiniLM-L12-v2')
CONFIDENCE = 0.7
new_sentences_list = compare_sentences_list(model, CONFIDENCE, sentences_book_1, sentences_book_2)
FILEPATH = "data/metric"
save_new_list(FILEPATH, new_sentences_list)
if __name__ == "__main__":
main()