-
Notifications
You must be signed in to change notification settings - Fork 1
/
analyz.py
198 lines (165 loc) · 5.75 KB
/
analyz.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
import pandas as pd
import numpy as np
import emoji
from wordcloud import WordCloud
# Module-level flag, overwritten by cleanData() with its ASCII argument.
# getQuestion, getTag, getLink and getEmojiCount check it and only print a
# notice (returning nothing) when it is True.
ONLY_ASCII = False
def cleanData(fileName, ASCII=False):
    '''
    Return: DataFrame with the columns 'date', 'author' and 'messages'
            built from a WhatsApp text export, with unparseable rows
            dropped and slang words replaced by formaledWord()
    Parameter:
        fileName -> String, path of the WhatsApp text file
        ASCII -> Bool, when True every character other than a-z, A-Z
                 and '#' in the messages is replaced by a space
    Side effect: stores ASCII in the module-level ONLY_ASCII flag.
    '''
    global ONLY_ASCII
    ONLY_ASCII = ASCII

    # Read every line up front; the context manager guarantees the file
    # handle is closed even if reading fails.
    with open(fileName, 'r', encoding='utf-8') as chatFile:
        fileLines = chatFile.readlines()

    chatdf = pd.DataFrame(prepData(fileLines), columns=['date', 'author', 'messages'])

    def addColon(datetime):
        # Keep the 8-character date and strip the dot from the 'HH.MM'
        # part, giving 'HHMM', before the value is handed to to_datetime.
        date = datetime[:8]
        hours = ''.join(datetime[9:14].split('.'))
        return f'{date} {hours}'

    chatdf['date'] = chatdf['date'].apply(addColon)

    # Convert the date column to datetime; rows whose prefix is not a
    # timestamp become NaT and are removed by dropna().
    # NOTE(review): day/month order is left to pandas' default parsing —
    # confirm it matches the export locale.
    chatdf['date'] = pd.to_datetime(chatdf['date'], errors='coerce')
    chatdf.dropna(inplace=True)

    if ASCII:
        # regex=True is required: without it newer pandas treats the
        # pattern as a literal string and nothing would be replaced.
        chatdf['messages'] = chatdf['messages'].str.replace('[^a-zA-Z#]', ' ', regex=True)

    chatdf['messages'] = formaledWord(chatdf['messages'])
    return chatdf
def formaledWord(messages, lexicon=None):
    '''
    Return: Pandas Series of messages, lower-cased word by word, with
            every word found in the slang lexicon replaced by its
            formal form
    Parameter:
        messages -> Pandas Series of strings
        lexicon -> dict mapping slang word -> formal word (optional).
                   When omitted, the colloquial Indonesian lexicon CSV
                   is downloaded and its 'slang'/'formal' columns are
                   used, as before.
    '''
    if lexicon is None:
        table = pd.read_csv('https://raw.githubusercontent.com/nasalsabila/kamus-alay/master/colloquial-indonesian-lexicon.csv',
                            usecols=['slang', 'formal'])
        lexicon = dict(zip(table['slang'], table['formal']))

    def normalize(sentence):
        # Words are split on whitespace and re-joined with single spaces;
        # unknown words are kept in lower case.
        lowered = (word.lower() for word in sentence.split())
        return ' '.join(lexicon.get(word, word) for word in lowered)

    return messages.apply(normalize)
def prepData(fileLines):
    '''
    Return: list of [date, author, message] rows, one per input line
    Parameter: fileLines -> list of raw text lines

    The first 14 characters of a line are taken as the date and the
    text from position 17 onwards as the remainder. If the remainder
    contains a colon, the part before the first colon is the author and
    the part after it (skipping one separator character) is the
    message; otherwise the row is attributed to 'Whatsapp'.
    '''
    rows = []
    for line in fileLines:
        stamp, body = line[:14], line[17:]
        sender, colon, text = body.partition(':')
        if not colon:
            # No author separator: the whole remainder is the message.
            rows.append([stamp, 'Whatsapp', body.rstrip('\n')])
        else:
            # text[1:] drops the single character that follows the colon.
            rows.append([stamp, sender, text[1:].rstrip('\n')])
    return rows
def trackChangePhone(data, phone):
    '''
    Return: dict mapping each requested phone number to the set of
            numbers that appear together with it in a number-change
            notice
    Parameter:
        data -> DataFrame of WhatsApp messages ('author', 'messages')
        phone -> list of phone numbers whose changes should be traced
    '''
    # Numbers are compared without the left-to-right mark (U+200E).
    phone = [number.strip('\u200e') for number in phone]
    keyWord = ' telah diganti ke '

    # Keep only rows attributed to 'Whatsapp' that contain the
    # number-change phrase; na=False treats missing messages as no match.
    isNotice = (data['author'] == 'Whatsapp') & (data['messages'].str.contains(keyWord, na=False))
    notices = data[isNotice]['messages'].str.split(keyWord)

    # One independent set per phone. dict.fromkeys(phone, set()) would
    # share a single set between all keys and mix every history together.
    track = {number: set() for number in phone}
    for parts in notices:
        numbers = [part.strip('\u200e') for part in parts]
        for number in phone:
            if number in numbers:
                track[number].update(numbers)
    return track
def mainGet(data, keyword='', regex=True):
    '''
    Return: DataFrame holding only the rows whose 'messages' value
            contains keyword
    Parameter:
        data -> Pandas DataFrame
        keyword -> String, text that must occur in the 'messages' column
        regex -> Bool, whether keyword is interpreted as a regular
                 expression (passed straight to str.contains)
    '''
    matches = data['messages'].str.contains(keyword, regex=regex)
    return data[matches]
def getMedia(data):
    '''
    Return: rows of data whose message contains the text
            'Media tidak disertakan'
    Parameter: data -> Pandas DataFrame
    '''
    mediaMarker = 'Media tidak disertakan'
    return mainGet(data, mediaMarker)
def getQuestion(data):
    '''
    Return: rows of data whose message contains a literal '?', or None
            (after printing a notice) when ONLY_ASCII is set
    Parameter: data -> Pandas DataFrame
    '''
    if not ONLY_ASCII:
        # regex=False so that '?' is matched literally.
        return mainGet(data, '?', regex=False)
    print('Cannot Found! Because the messages is only ASCII.')
    return None
def getTag(data):
    '''
    Return: rows of data whose message contains '@62', or None (after
            printing a notice) when ONLY_ASCII is set
    Parameter: data -> Pandas DataFrame
    '''
    if not ONLY_ASCII:
        return mainGet(data, '@62')
    print('Cannot Found! Because the messages is only ASCII.')
    return None
def getLink(data):
    '''
    Return: rows of data whose message matches the link pattern, or
            None (after printing a notice) when ONLY_ASCII is set
    Parameter: data -> Pandas DataFrame
    '''
    if not ONLY_ASCII:
        # Optional http/https/ftp scheme followed by two dot-separated
        # runs of URL-like characters.
        linkPattern = r'(?:(?:https?|ftp):\/\/)?[\w/\-?=%.]+\.[\w/\-?=%.]+'
        return mainGet(data, linkPattern)
    print('Cannot Found! Because the messages is only ASCII.')
    return None
def getDelMessage(data):
    '''
    Return: rows of data whose message contains the text
            'pesan ini telah dihapus'
    Parameter: data -> Pandas DataFrame
    '''
    deletedMarker = 'pesan ini telah dihapus'
    return mainGet(data, deletedMarker)
def getProfile(data, people):
    '''
    Return: dict, the collected profile of one author in the chat
    Parameter:
        data -> Pandas DataFrame with 'date', 'author', 'messages'
        people -> str, the author to analyse
    Raises: IndexError when people has no rows in data.
    '''
    df = data[data['author'] == people]

    # Dates of the first and last row of this author's messages
    # (iloc raises IndexError when the author has none).
    activeDate = pd.to_datetime(df['date'].iloc[[0, -1]].values)

    # value_counts() orders authors by message count, descending, so the
    # position of the author in its index is the zero-based rank. Using
    # the index directly avoids relying on the column name produced by
    # reset_index(), which differs between pandas versions.
    authorCounts = data['author'].value_counts()
    activeRank = authorCounts.index.get_loc(people) + 1

    profile = {'Name': people, 'ActiveDate': activeDate,
               'MessageSent': df, 'MessageDel': getDelMessage(df),
               'LinkSent': getLink(df), 'MessageTag': getTag(df),
               'MessageAsk': getQuestion(df), 'MediaSent': getMedia(df),
               'EmojiUsed': getEmojiCount(df),
               'ActiveRank': activeRank}
    return profile
def getEmojiCount(data):
    '''
    Return: Pandas Series counting how often each emoji character
            occurs in the 'messages' column of data, or None (after
            printing a notice) when ONLY_ASCII is set
    Parameter: data -> Pandas DataFrame
    '''
    if ONLY_ASCII:
        print('Cannot Found! Because the messages is only ASCII.')
        return None

    # Older releases of the emoji package expose UNICODE_EMOJI_ENGLISH;
    # newer ones replaced it with EMOJI_DATA. Both are keyed by emoji.
    knownEmoji = getattr(emoji, 'UNICODE_EMOJI_ENGLISH', None)
    if knownEmoji is None:
        knownEmoji = emoji.EMOJI_DATA

    # Messages are scanned one character at a time.
    emojis = [char for mess in data['messages']
              for char in mess
              if char in knownEmoji]
    return pd.Series(emojis, name='Emoji').value_counts()
def getTimeSeriesMessage(data, time):
    '''
    Return: Pandas Series, number of messages per time bucket
    Parameter:
        data -> Pandas DataFrame with a datetime 'date' column
        time -> list of datetime attribute names used as grouping keys,
                e.g. ['year', 'month', 'day', 'hour']
    '''
    # Each name is looked up on the .dt accessor of the date column.
    groupKeys = [getattr(data['date'].dt, unit) for unit in time]
    counts = data.groupby(by=groupKeys)['messages'].count()
    return counts
def getWordCloud(messages):
    '''
    Return: image object rendered from the generated word cloud
    Parameter:
        messages -> Pandas Series of message strings
    '''
    # All messages are merged into one space-separated text.
    corpus = ' '.join(messages)
    cloud = WordCloud(
        collocations=False,
        max_words=2000,
        max_font_size=200,
        random_state=42,
        width=758,
        height=512,
        colormap='YlGn',
    )
    cloud.generate(corpus)
    return cloud.to_image()
# No action is taken when the file is executed directly as a script.
if __name__ == '__main__':
    pass