"Create a wordcloud visualization"
import collections
import nltk
from nltk.stem.porter import PorterStemmer
from nltk.tokenize import TweetTokenizer
import numpy as np
from PIL import Image
import wordcloud
import re
from bs4 import BeautifulSoup
def wordcloud_visualization(words, colormap, mask, carving_color):
    """Render a word -> frequency dict as a word cloud carved into `colormap`."""
    # Load the images and convert them to the right formats
    colormap = np.asarray(colormap.convert("RGB"))
    mask = np.asarray(mask.convert("L"))
    # Create useful masks from the mask image: white pixels are the border,
    # black pixels are the inner area where the words are drawn
    border_mask = mask == 255
    inner_mask = mask == 0
    # Mask for the word cloud: block out everything outside the inner area
    wc_mask_in = np.array(mask)
    wc_mask_in[~inner_mask] = 255
    options = {
        "random_state": 0,
        "mask": wc_mask_in,
        "mode": "RGB",
        "background_color": "white",
        "color_func": lambda *args, **kwargs: "black",
    }
    # Create the word cloud from the frequency dict
    word_cloud = wordcloud.WordCloud(**options)
    word_cloud.fit_words(words)
    # Convert the rendered word cloud to a grayscale array
    wc_array = np.asarray(Image.fromarray(word_cloud.to_array()).convert("L"))
    # Create a normalized alpha mask from the word cloud
    # (1.0 where text was drawn, 0.0 elsewhere)
    wc_normalized = np.expand_dims((255 - wc_array) / 255, -1)
    # Linear interpolation: the words take the colors of the colormap,
    # the rest of the inner area is filled with the carving color
    img = (1 - wc_normalized) * carving_color + wc_normalized * colormap
    # Convert the image back to 8-bit integers
    img = img.astype(np.uint8)
    # Restore the border from the original colormap
    img[border_mask] = colormap[border_mask]
    return Image.fromarray(img)

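# A minimal usage sketch for wordcloud_visualization; the file names and the
# frequency dict are hypothetical. The mask is expected to be black where the
# words should appear and white where the border of the colormap is kept, and
# carving_color may be a scalar or an RGB triple thanks to NumPy broadcasting.
#
#     colormap = Image.open("colormap.png")
#     mask = Image.open("mask.png")
#     frequencies = {"python": 12, "cloud": 8, "words": 5}
#     wordcloud_visualization(frequencies, colormap, mask, 255).save("out.png")
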
def parse_vtt(vtt_file):
    """Extract the plain transcript text from a WebVTT subtitle file."""
    with open(vtt_file, "rb") as f:
        text = f.read().decode("utf8")
    # Strip the WebVTT header, hard-space escapes, and drawing artifacts
    text = text.replace("WEBVTT\n", "")
    text = text.replace(r"\h", "")
    text = text.replace("<b>m 0 0 l 100 0 100 100 0 100</b>", "")
    # Remove cue timing lines such as "00:01.000 --> 00:04.000"
    # (the hour field is optional in WebVTT timestamps)
    timecode = r"(?:\d+:)?\d+:\d+\.\d+"
    pattern = fr"\n{timecode} --> {timecode}\n"
    text = re.sub(pattern, "", text)
    # Drop any remaining markup and join the text fragments
    soup = BeautifulSoup(text, "html.parser")
    return " ".join(soup.stripped_strings)

def tokenize(text):
    """Split raw text into tokens with NLTK's TweetTokenizer."""
    tokenizer = TweetTokenizer()
    return tokenizer.tokenize(text)

def words_processing(words, stop_words, kept):
    """Clean the tokens and return the `kept` most common words with counts."""
    # Keep only fully alphanumeric tokens, lowercased
    words = [w.lower() for w in words if w.isalnum()]
    # Remove stop words
    words = [w for w in words if w not in stop_words]
    # Merge words that share a stem, then keep the top `kept` entries
    words_count = stem_count(words)
    counter = collections.Counter(words_count)
    return dict(counter.most_common(kept))

def stem_count(words):
    """Count words, merging those that share the same Porter stem."""
    # Stem all the words and group them by stem
    stemmer = PorterStemmer(mode=PorterStemmer.NLTK_EXTENSIONS)
    words_by_stem = collections.defaultdict(list)
    for word in words:
        words_by_stem[stemmer.stem(word)].append(word)
    # Count the words that share a stem, using the most common
    # surface form as the dictionary key
    words_count = {}
    for variants in words_by_stem.values():
        counter = collections.Counter(variants)
        most_common_usage, _ = counter.most_common(1)[0]
        words_count[most_common_usage] = sum(counter.values())
    return words_count