-
Notifications
You must be signed in to change notification settings - Fork 1
/
analyse_tweets.py
51 lines (40 loc) · 1.36 KB
/
analyse_tweets.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
import shelve
import re
'''
Opens the Python shelf created by load_tweets.py
and extracts the dollar amount from each tweet, after
verifying that the tweet is not a retweet (RT), contains
'pay $x', and is not a repeated data point. Amounts over
$50 are ignored, because some tweets contain statements
like 'I would pay $100000'.
RTs are ignored because we want each person's individual
opinion.
'''
# Captures whatever follows 'pay $' up to the next space or the end of the
# text. The capture is not guaranteed to be numeric (it is '$$$' for
# 'pay $$$$'), so it must be validated before being used as an amount.
# A raw string keeps '\$' from being read as a (invalid) string escape, and
# no trailing space is required so an amount that ends the tweet still matches.
money_re = re.compile(r'pay \$([^ ]*)')
def get_average(tweet_dict):
    """Return the arithmetic mean of the values in tweet_dict.

    Args:
        tweet_dict: mapping whose values are numbers; the keys are ignored.

    Returns:
        The mean of the values, as a float.

    Raises:
        ZeroDivisionError: if tweet_dict is empty.
    """
    # values() is available on both Python 2 and Python 3 dicts, whereas
    # itervalues() is Python 2 only. Dividing by a float prevents integer
    # values from being floor-divided under Python 2.
    return sum(tweet_dict.values()) / float(len(tweet_dict))
# Largest dollar amount accepted as a genuine answer; anything above it is
# treated as hyperbole (e.g. 'I would pay $100000') and dropped.
MAX_AMOUNT = 50

# Matches 'RT' only as a standalone token, so tweets that merely contain
# those letters inside another word (e.g. 'SMART') are not mistaken for RTs.
retweet_re = re.compile(r'\bRT\b')

# Maps tweet id -> dollar amount. Keying on the id means a tweet that was
# picked up more than once by the search/loading process only contributes
# a single data point.
analysis_dict = {}

shelf = shelve.open('hbotweets.dat')
try:
    for tweet in shelf['tweets']:
        text = tweet['text']
        # keep only tweets that are not a RT and that contain 'pay $'
        if retweet_re.search(text) or 'pay $' not in text:
            continue
        money_text = money_re.findall(text)
        if not money_text:
            # 'pay $' was present but the pattern found nothing to read;
            # skip the tweet instead of failing on the [0] lookup below.
            continue
        try:
            # we read the first money amount in the tweet; there should
            # only be one.
            money = float(money_text[0])
        except ValueError:
            # not a number, e.g. 'I would pay $$$$'
            continue
        # Amounts over MAX_AMOUNT are disregarded (exactly MAX_AMOUNT is
        # kept). The lower bound rejects negative values and '-inf'; 'nan'
        # and 'inf' fail the comparison and are dropped as well.
        if 0 <= money <= MAX_AMOUNT:
            analysis_dict[tweet['id']] = money
finally:
    # always release the shelf, even if a tweet is malformed
    shelf.close()

# Single-argument print(...) produces the same output on Python 2 and 3.
print('Data points:')
print(len(analysis_dict))
print('Average:')
if analysis_dict:
    print(get_average(analysis_dict))
else:
    # no usable tweets: there is no average to report
    print('n/a')