-
Notifications
You must be signed in to change notification settings - Fork 4
/
ex18-TF-IDF
75 lines (60 loc) · 2.66 KB
/
ex18-TF-IDF
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
#!/usr/bin/env python
# --------------------------------------------------------------------------------------------------------- Intermediate
# DATA BLOCK
# Sample corpus for the intermediate exercise: three lines, each of which
# main() treats as one document.
text = '''he really really loves coffee
my sister dislikes coffee
my sister loves tea'''
import math

import numpy as np
def main(text):
    """Print one TF-IDF vector per non-blank line of *text*.

    Every non-blank line is one document, split on whitespace into words.
    For each document a list is printed with one entry per vocabulary word
    (vocabulary in sorted order):

        tf-idf(w, d) = tf(w, d) * log10(1 / df(w))

    where tf(w, d) is the number of occurrences of w in d divided by the
    length of d, and df(w) is the fraction of documents that contain w.

    Args:
        text: Newline-separated documents. Blank lines are ignored.
    """
    # Drop blank lines: an empty document has no length to normalise by
    # (division by zero) and would inflate the document count used by df.
    docs = [words for words in (line.split() for line in text.splitlines()) if words]
    n_docs = len(docs)

    # Sort the vocabulary so the column order is identical on every run;
    # iterating a bare set of strings follows hash order, which can differ
    # from one interpreter run to the next.
    vocabulary = sorted(set(text.split()))

    tf = {}
    idf = {}
    for word in vocabulary:
        # tf[word][i] is the term frequency of `word` in document i.
        tf[word] = [doc.count(word) / len(doc) for doc in docs]
        # df: fraction of documents containing `word`.
        df = sum(word in doc for doc in docs) / n_docs
        # The idf depends only on the word, so compute it once here rather
        # than once per document below.
        idf[word] = math.log(1 / df, 10)

    for doc_index in range(n_docs):
        tfidf = [tf[word][doc_index] * idf[word] for word in vocabulary]
        print(tfidf)
# Run the intermediate exercise on the three-line sample corpus. This is a
# top-level call, so it executes whenever the module is loaded.
main(text)
# --------------------------------------------------------------------------------------------------------- Advanced
# Sample corpus for the advanced exercise: four lines, each treated as one
# document. This rebinds the module-level name `text` used earlier.
text = '''Humpty Dumpty sat on a wall
Humpty Dumpty had a great fall
all the king's horses and all the king's men
couldn't put Humpty together again'''
def distance(row1, row2):
    """Return the Manhattan (L1) distance between two equal-length vectors.

    Args:
        row1: Sequence or array of numbers.
        row2: Sequence or array of numbers with the same shape as row1.

    Returns:
        The sum of the absolute element-wise differences, as a NumPy scalar.

    Raises:
        ValueError: If the two inputs do not have the same shape.
    """
    first = np.asarray(row1)
    second = np.asarray(row2)
    # Reject mismatched inputs explicitly; otherwise a length-1 row would be
    # silently broadcast against a longer one and yield a meaningless value.
    if first.shape != second.shape:
        raise ValueError(
            f"rows must have the same shape, got {first.shape} and {second.shape}"
        )
    # Reduce inside NumPy instead of iterating the array with builtin sum().
    return np.abs(first - second).sum()
def main(text):
    """Print the line indices of the two most similar lines of *text*.

    Every non-blank line is one document. Each document is turned into a
    TF-IDF vector over the sorted vocabulary, all pairs of vectors are
    compared with distance(), and the pair of distinct documents with the
    smallest distance is printed as a tuple ``(i, j)`` of 0-based line
    indices into ``text.splitlines()``.

    Args:
        text: Newline-separated documents. Blank lines are ignored but
            still count towards the reported line indices.

    Raises:
        ValueError: If text has fewer than two non-blank lines, because
            there is then no pair to compare.
    """
    # Remember each document's original line index so that dropping blank
    # lines (which would otherwise divide by zero in tf) does not shift the
    # indices that are reported.
    numbered = [
        (line_no, line.split()) for line_no, line in enumerate(text.splitlines())
    ]
    numbered = [(line_no, words) for line_no, words in numbered if words]
    if len(numbered) < 2:
        raise ValueError("need at least two non-blank lines to compare")
    line_numbers = [line_no for line_no, _ in numbered]
    docs = [words for _, words in numbered]
    n_docs = len(docs)

    # Sorted so the vector layout is the same on every run.
    vocabulary = sorted(set(text.split()))

    tf = {}
    idf = {}
    for word in vocabulary:
        # tf[word][i]: occurrences of `word` in document i / document length.
        tf[word] = [doc.count(word) / len(doc) for doc in docs]
        # df: fraction of documents containing `word`.
        df = sum(word in doc for doc in docs) / n_docs
        idf[word] = math.log(1 / df, 10)

    full_tfidf = [
        [tf[word][doc_index] * idf[word] for word in vocabulary]
        for doc_index in range(n_docs)
    ]

    dist = np.asarray(
        [[distance(sent1, sent2) for sent1 in full_tfidf] for sent2 in full_tfidf],
        dtype=float,
    )
    # Exclude only the comparison of a document with itself. Masking every
    # zero instead would also hide identical lines, which are the closest
    # pair there can be.
    np.fill_diagonal(dist, np.inf)
    row, col = np.unravel_index(np.argmin(dist), dist.shape)
    # Indexing the plain list yields Python ints, so the tuple prints as
    # "(0, 1)" regardless of how NumPy formats its own integer scalars.
    print((line_numbers[row], line_numbers[col]))
# Run the advanced exercise on the four-line sample corpus. At this point
# `main` and `text` refer to the second definitions in this file. This is a
# top-level call, so it executes whenever the module is loaded.
main(text)