-
Notifications
You must be signed in to change notification settings - Fork 1
/
main.py
194 lines (164 loc) · 6.59 KB
/
main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
import time
import base64
import rsa
import math
import random
import binascii
import requests
import re
import warnings
from urllib.parse import quote_plus
from code_verification import code_verificate
from downloader import Downloader
from pageParser import pageParser
from recoder import Recoder
from Properities import Properities
# Build the request headers that the functions in this file pass to every HTTP call.
agent = 'Mozilla/5.0 (Windows NT 6.3; WOW64; rv:41.0) Gecko/20100101 Firefox/41.0'
headers = {
'User-Agent': agent
}
# Shared objects: `recoder` is used by the main script to store parsed data and
# failed URLs; `session` is the single requests session reused for the login
# requests and handed to every Downloader.
recoder = Recoder()
session = requests.session()
# Suppress all warnings for the rest of the process.
warnings.filterwarnings('ignore')
# Project settings object; this file reads urls, timescopes, username and
# password from it.
p = Properities()
urls = p.urls
# Alternative URL source kept by the original author — presumably meant to be
# enabled together with deal_error = True to retry failed URLs; confirm before use.
# urls = recoder.get_error_urls()
# When True the main script visits each entry of `urls` as-is instead of
# paginating it.
deal_error = False
timescopes = p.timescopes
# Visit the initial page so the request carries cookies (original author's note;
# index_url is not referenced anywhere else in the visible code).
index_url = "http://weibo.com/login.php"
# Yundama captcha-solving account; login() raises if a captcha is required while
# the username is empty.
yundama_username = ''
yundama_password = ''
# File the captcha image is written to by get_img().
verify_code_path = './pincode.png'
def get_pincode_url(pcid):
    """Return the captcha image URL for ``pcid``.

    The query string carries a fresh random integer ``r`` in the range
    0..99999999, a fixed ``s`` of 0, and ``pcid`` as ``p``.
    """
    endpoint = "http://login.sina.com.cn/cgi/pin.php"
    # One draw from the module-level random generator per call.
    random_part = math.floor(random.random() * 100000000)
    template = '{}?r={}&s={}&p={}'
    return template.format(endpoint, random_part, 0, pcid)
def get_img(url):
    """Download ``url`` and store the response body at ``verify_code_path``.

    The request is streamed with the module-level ``headers`` and the body is
    written to disk in chunks of up to 1000 bytes.
    """
    response = requests.get(url, headers=headers, stream=True)
    with open(verify_code_path, 'wb') as image_file:
        # writelines() consumes the chunk iterator and writes each chunk in order.
        image_file.writelines(response.iter_content(1000))
def get_su(username):
    """Encode a login name into the ``su`` value used by the login requests.

    The e-mail address or phone number is first URL-quoted with
    ``urllib.parse.quote_plus`` (the original author's Python 3 stand-in for
    the ``encodeURIComponent`` call in the login JavaScript), then
    Base64-encoded; the result is returned as a ``str``.
    """
    quoted_bytes = quote_plus(username).encode("utf-8")
    return base64.b64encode(quoted_bytes).decode("utf-8")
# Pre-login request: obtains servertime, nonce, pubkey and rsakv.
def get_server_data(su):
    """Call the SSO pre-login endpoint and return its payload as a dict.

    Args:
        su: Encoded user name, as produced by ``get_su``.

    Returns:
        The decoded response object. ``login`` reads the keys ``servertime``,
        ``nonce``, ``rsakv``, ``pubkey``, ``showpin`` and, when a captcha is
        required, ``pcid`` from it.

    Raises:
        json.JSONDecodeError: If the response body is not the expected
            ``callback({...})`` wrapper around a JSON object.
    """
    # Local import so this function does not depend on the file's import block.
    import json

    callback_name = "sinaSSOController.preloginCallBack"
    pre_url = "http://login.sina.com.cn/sso/prelogin.php?entry=weibo&callback=sinaSSOController.preloginCallBack&su="
    pre_url = pre_url + su + "&rsakt=mod&checkpin=1&client=ssologin.js(v1.4.18)&_="
    # The trailing parameter is the current time in milliseconds.
    prelogin_url = pre_url + str(int(time.time() * 1000))
    pre_data_res = session.get(prelogin_url, headers=headers)

    # SECURITY: this used to eval() the response body. The body is external
    # data fetched over plain HTTP, so evaluating it as Python would let the
    # server (or anyone able to tamper with the traffic) execute arbitrary
    # code here. The payload is JSON wrapped in a callback call, so unwrap it
    # and parse it as JSON instead.
    payload = pre_data_res.content.decode("utf-8").replace(callback_name, '')
    payload = payload.strip().rstrip(';').strip()
    if payload.startswith('(') and payload.endswith(')'):
        payload = payload[1:-1]
    return json.loads(payload)
# Encrypts the user's password; the scheme follows the site's JavaScript
# encryption file (original author's note).
def get_password(password, servertime, nonce, pubkey):
    """Return the RSA-encrypted password as hexadecimal ``bytes``.

    Args:
        password: Plain-text password.
        servertime: ``servertime`` value from the pre-login response.
        nonce: ``nonce`` value from the pre-login response.
        pubkey: RSA modulus as a hexadecimal string.
    """
    modulus = int(pubkey, 16)
    # Build the public key from the modulus and the exponent 65537.
    public_key = rsa.PublicKey(modulus, 65537)
    # Plain text layout taken from the JavaScript encryption file:
    # servertime, TAB, nonce, newline, password.
    plaintext = "".join((str(servertime), '\t', str(nonce), '\n', str(password)))
    ciphertext = rsa.encrypt(plaintext.encode("utf-8"), public_key)
    # Hex-encode the encrypted bytes.
    return binascii.hexlify(ciphertext)
def login(username, password):
    """Log the module-level ``session`` in to Weibo and print the profile title.

    Steps: encode the user name, fetch the pre-login data, encrypt the
    password, post the login form (solving a captcha through
    ``code_verificate`` when the pre-login data has ``showpin == 1``), follow
    the ``location.replace`` redirect found in the response, read the
    ``uniqueid`` from the redirected page and finally load that user's
    profile page.

    Args:
        username: Account name (e-mail address or phone number).
        password: Plain-text password.

    Raises:
        Exception: If a captcha is required but ``yundama_username`` is empty.
        IndexError: If the redirect URL, the ``uniqueid`` or the page title
            cannot be found in the corresponding response.
    """
    # su is the encoded user name.
    su = get_su(username)
    prelogin = get_server_data(su)
    servertime = prelogin["servertime"]
    nonce = prelogin['nonce']
    rsakv = prelogin["rsakv"]
    pubkey = prelogin["pubkey"]
    encrypted_password = get_password(password, servertime, nonce, pubkey)

    form = {
        'entry': 'weibo',
        'gateway': '1',
        'from': '',
        'savestate': '7',
        'useticket': '1',
        'pagerefer': "http://login.sina.com.cn/sso/logout.php?entry=miniblog&r=http%3A%2F%2Fweibo.com%2Flogout.php%3Fbackurl",
        'vsnf': '1',
        'su': su,
        'service': 'miniblog',
        'servertime': servertime,
        'nonce': nonce,
        'pwencode': 'rsa2',
        'rsakv': rsakv,
        'sp': encrypted_password,
        'sr': '1366*768',
        'encoding': 'UTF-8',
        'prelt': '115',
        'url': 'http://weibo.com/ajaxlogin.php?framelogin=1&callback=parent.sinaSSOController.feedBackUrlCallBack',
        'returntype': 'META'
    }

    if prelogin['showpin'] == 1:
        # You could also switch this to entering the captcha by hand.
        if not yundama_username:
            raise Exception('由于本次登录需要验证码,请配置顶部位置云打码的用户名{}和及相关密码'.format(yundama_username))
        pcid = prelogin['pcid']
        form['pcid'] = pcid
        # Save the captcha image locally, then have it recognised.
        get_img(get_pincode_url(pcid))
        form['door'] = code_verificate(yundama_username, yundama_password, verify_code_path)

    response = session.post(
        'http://login.sina.com.cn/sso/login.php?client=ssologin.js(v1.4.18)',
        data=form,
        headers=headers,
    )
    redirect_html = response.content.decode("GBK")
    # The login response redirects through a JavaScript location.replace(...) call.
    redirect_url = re.findall(r'location\.replace\([\'"](.*?)[\'"]\)', redirect_html)[0]

    landing = session.get(redirect_url, headers=headers)
    unique_id = re.findall(r'"uniqueid":"(.*?)"', landing.text, re.S)[0]

    profile_url = "http://weibo.com/%s/profile?topnav=1&wvr=6&is_all=1" % unique_id
    profile_page = session.get(profile_url, headers=headers)
    profile_html = profile_page.content.decode("utf-8", 'ignore')
    # The text of the page's <title> element is what gets reported as the user name.
    page_title = re.findall(r'<title>(.*?)</title>', profile_html, re.S)[0]
    print('登录成功,你的用户名为:'+page_title)
def _crawl_page(aim, page_no):
    """Download, parse and store one page; return True when crawling should stop.

    Args:
        aim: URL of the page to fetch.
        page_no: Number shown in the progress message.

    Returns:
        True when the parser reports ``is_last == 1`` (presumably the end of
        the result pages — confirm against pageParser); nothing is saved for
        that page. False otherwise, including when the page failed.
    """
    try:
        downloader = Downloader(aim, session)
        page_parser = pageParser(downloader)
        data = page_parser.info
        data_tr = page_parser.trans_info
        if page_parser.is_last == 1:
            return True
        print(f"\t----已解析第{page_no}个界面")
        recoder.save_weibo(data)
        recoder.save_weibo(data_tr)
    except Exception as e:
        # Best-effort crawl: record the failing URL so it can be retried later
        # and always report why it failed.
        print('\t\t----本链接爬取失败,存入数据库')
        recoder.save_url(aim)
        print('\t\t----错误信息' + str(e))
    return False


if __name__ == "__main__":
    login(p.username, p.password)
    if deal_error:
        # Retry mode: every entry of `urls` is fetched exactly as given.
        for n, url in enumerate(urls, start=1):
            if _crawl_page(url, n):
                break
    else:
        # Normal mode: each URL is paired with a time scope and paginated.
        for url, timescope in zip(urls, timescopes):
            print('----当前时段:{0}'.format(timescope))
            # Drop the URL's last character and append the page number 1..50.
            base_url = url.strip().replace('\n', '')[:-1]
            for i in range(1, 51):
                if _crawl_page(base_url + str(i), i):
                    break