-
Notifications
You must be signed in to change notification settings - Fork 0
/
twit_crawl.py
150 lines (131 loc) · 3.59 KB
/
twit_crawl.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
# -*- coding: utf-8 -*-
# TODO: Read the extended (full) text of tweets that contain a link and are neither retweets nor quoted tweets
import sys, os
import codecs
import time as t
# sys.stdout = codecs.getwriter('utf8')(sys.stdout)
# sys.stderr = codecs.getwriter('utf8')(sys.stderr)
import configparser
import io
from tweepy.streaming import StreamListener
from tweepy import OAuthHandler
from tweepy import Stream
import signal
import sys
import pathlib
import json
import pprint
import codecs
# --- Twitter API credentials ------------------------------------------------
# Read from the [twitter] section of account.ini in the current working
# directory. A missing file or section raises KeyError here, at import time.
config = configparser.ConfigParser()
config.read('account.ini')
access_token = config['twitter']['access_token']
access_token_secret = config['twitter']['access_token_secret']
consumer_key = config['twitter']['consumer_key']
consumer_secret = config['twitter']['consumer_secret']


def ftime():
    """Return the current local date as 'YYYY_MM_DD' (used in file names)."""
    return t.strftime('%Y_%m_%d')


def now():
    """Return the current local time as 'YYYY/MM/DD-HH:MM:SS'."""
    return t.strftime('%Y/%m/%d-%H:%M:%S')


# --- Output files -----------------------------------------------------------
pathlib.Path('txt').mkdir(parents=True, exist_ok=True)
fileName = 'txt/twit_{}.txt'      # '{}' is filled with ftime()
errorFileName = 'error.txt'

# Take the date once so the open file and baseTime can never disagree if the
# clock crosses midnight between two separate ftime() calls.
baseTime = ftime()

# Built-in open() is used instead of codecs.open(): with 'utf-8-sig' in append
# mode it only emits the BOM when the file is empty, whereas codecs.open()
# writes a BOM on the first write after every (re)start, leaving stray U+FEFF
# characters in the middle of the file. newline='' keeps '\n' untranslated,
# matching the byte output codecs.open() produced.
file = open(fileName.format(baseTime), 'a', encoding='utf-8-sig', newline='')
errorFile = open(errorFileName, 'a', encoding='utf-8-sig', newline='')
def writeToFile(txt):
    """Append *txt*, followed by a '[timestamp],' line, to the daily file.

    If the date returned by ``ftime()`` no longer matches ``baseTime``, the
    currently open file is closed and a file for the new date is opened
    (in append mode) before writing.
    """
    global baseTime, file

    today = ftime()
    if today != baseTime:
        # The date rolled over since the last write: switch output files.
        file.close()
        file = codecs.open(fileName.format(today), 'a', 'utf-8-sig')
        baseTime = today

    file.write(txt)
    file.write('\n[{}],\n'.format(now()))
def writeError(error, sleepTime = None):
    """Append a timestamped error line to the error file, then sleep.

    Args:
        error: Anything formattable with ``str.format`` (message or exception).
        sleepTime: Seconds to sleep after writing. ``None`` means 5 seconds;
            pass 0 to return immediately.
    """
    delay = 5 if sleepTime is None else sleepTime
    stamp = t.strftime('%Y/%m/%d-%H:%M:%S')
    errorFile.write('[ERR] Time:{} Text:{}'.format(stamp, error))
    errorFile.write('\n')
    t.sleep(delay)
def closeFile():
    """Close the tweet file and the error file.

    Nothing is closed unless both module-level handles are truthy.
    """
    print('Closing all opened files')
    if not (file and errorFile):
        return
    file.close()
    errorFile.close()
def signal_handler(signal, frame):
    """SIGINT handler: close the output files and exit with status 0.

    Sets the module-level ``KB`` flag to True before exiting so that the
    ``finally`` clause in the ``__main__`` section (which checks ``KB``)
    does not restart the program.
    """
    global KB

    print('Exit...')
    if file:
        for handle in (file, errorFile):
            handle.close()
        print('File Closed')
    KB = True
    sys.exit(0)


# Install the handler for Ctrl-C.
signal.signal(signal.SIGINT, signal_handler)
pp = pprint.PrettyPrinter()
def _print(txt):
pp.pprint(txt)
def print_format(user, text):
    """Print one tweet to stdout as '@<user>: <text>'."""
    line = '@%s: %s' % (user, text)
    print(line)
class StdOutListener(StreamListener):
    """Stream listener that prints accepted tweets and appends them to the daily file."""

    def on_data(self, data):
        """Handle one raw JSON message from the stream.

        Tweets whose text ends with an ellipsis (truncated) or starts with
        'RT' (retweets) are skipped. For any other tweet whose text contains
        an ellipsis, the full text is taken from
        ``extended_tweet.full_text`` when present. Accepted tweets are
        printed and passed to ``writeToFile``.

        Any error while processing a message is printed and swallowed so a
        single bad message does not stop the stream.

        Returns:
            True, always.
        """
        # NOTE(review): the source this was reconstructed from had lost its
        # indentation; printing/writing is assumed to happen only for tweets
        # that are neither truncated nor retweets -- confirm.
        txt = json.loads(data)
        try:
            tmp = txt  # keep the whole payload; txt is narrowed to the text
            # Look both fields up front so a message without them (i.e. not
            # a tweet) fails here, before anything is printed or written.
            user = tmp['user']['screen_name']
            txt = tmp['text']
            # endswith/startswith are safe on an empty string, where
            # indexing txt[-1] would raise IndexError.
            if not (txt.endswith(u'…') or txt.startswith('RT')):
                if u'…' in txt:
                    try:
                        txt = tmp['extended_tweet']['full_text']
                    except (KeyError, TypeError) as e:
                        # Post from external service, can't retrieve full
                        # text; fall back to the short text.
                        print('Error:', e)
                print_format(user, txt)
                writeToFile(txt)
                print('-'*30)
        except Exception as e:
            # Deliberately broad: keep the stream alive on any bad message.
            # txt is the decoded payload if the failure happened before the
            # text was extracted, otherwise the tweet text.
            print('Error:', e)
            print_format('USER_TWIT_ERROR', txt)
            print('-'*30)
        return True

    def on_error(self, err):
        """Log a stream error and return False.

        The error is printed and written to the error file without sleeping.
        NOTE(review): in tweepy 3.x a False return from on_error disconnects
        the stream -- confirm against the installed tweepy version.
        """
        print('[ERR] in StdOutListener with error', err)
        writeError(err, 0)
        return False
if __name__ == '__main__':
    # Load the keywords to track, one per line. The encoding is explicit so
    # the Thai word list does not depend on the process locale. Only line
    # terminators are stripped: slicing off the last character would
    # truncate a final line that has no trailing newline. Blank lines are
    # dropped so no empty keyword is sent.
    with open('words-th.txt', encoding='utf-8') as f:
        words = [line.rstrip('\r\n') for line in f]
    words = [word for word in words if word]

    auth = OAuthHandler(consumer_key, consumer_secret)
    auth.set_access_token(access_token, access_token_secret)
    listener = StdOutListener()

    # Set to True by signal_handler on Ctrl-C; suppresses the restart below.
    KB = False
    try:
        print('Start streaming connection')
        stream = Stream(auth, listener)
        stream.filter(languages=['th'], track=words, stall_warnings=True)
    except Exception as e:
        print('[ERR] Error in while loop')
        writeError(e)
    finally:
        # Reached when the stream ends or fails. Unless the user interrupted
        # the program, close the files, wait, and re-exec the script.
        if not KB:
            print('Restart Program')
            writeError('Restart Program due to connection error', 0)
            closeFile()
            t.sleep(10)
            # argv[0] matches the interpreter actually executed instead of a
            # hard-coded '/usr/bin/python3.6'.
            os.execv(sys.executable, [sys.executable] + sys.argv)