-
Notifications
You must be signed in to change notification settings - Fork 3
/
index.py
81 lines (68 loc) · 2.71 KB
/
index.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
import gradio as gr
from xhs import XhsClient
from time import sleep
import pandas as pd
import time
import os
# Notes are handled in rounds of 45; the pause between rounds defaults to
# 3 minutes (the round_delay default of 180 seconds in xhs_to_excel).
ROUND_COUNT = 45
def xhs_to_excel(cookie, keyword, max_count, handle_delay=2, round_delay=180):
    """Search notes by keyword and export the collected rows to an Excel file.

    Pages through ``XhsClient.get_note_by_keyword``, fetches every note's
    detail with ``get_note_by_id`` and records one row per note: url,
    display title, description, collected/liked counts and a date derived
    from the note's ``time`` field (treated as a millisecond timestamp).
    The rows are written to ``notes/notes.<YYYYmmdd.HHMMSS>.xlsx`` in the
    directory that contains this script.

    Args:
        cookie: Cookie string handed to ``XhsClient``.
        keyword: Search keyword.
        max_count: Maximum number of notes to collect.
        handle_delay: Seconds to sleep after each collected note.
        round_delay: Seconds to sleep instead of ``handle_delay`` whenever
            the number of collected notes is a multiple of ``ROUND_COUNT``.
    """
    xhs_client = XhsClient(cookie)
    notes = []
    page = 1
    while len(notes) < max_count:
        data = xhs_client.get_note_by_keyword(keyword, page=page)
        has_more = data.get('has_more')
        print("has_more: %s" % has_more)

        # Consume this page's items *before* acting on has_more: the last
        # page reports has_more as falsy but can still carry results.
        for item in data.get('items') or []:
            note_card = item.get('note_card')
            if not note_card:
                # NOTE(review): presumably a non-note search entry; skipping
                # it avoids an AttributeError on None - confirm against the
                # API's response schema.
                continue

            note_id = item.get('id')
            note_info = xhs_client.get_note_by_id(note_id)
            interact_info = note_info.get('interact_info') or {}
            timestamp = note_info.get('time')
            if timestamp is None:
                note_date = None
            else:
                note_date = time.strftime(
                    '%Y-%m-%d', time.localtime(timestamp / 1000))

            notes.append({
                'url': 'https://www.xiaohongshu.com/explore/' + note_id,
                'display_title': note_card.get('display_title'),
                'desc': note_info.get('desc'),
                'collected_count': interact_info.get('collected_count'),
                "liked_count": interact_info.get("liked_count"),
                'time': note_date,
            })

            cur_time = time.strftime('%H:%M:%S', time.localtime(time.time()))
            print("notes: %s/%s [%s]" % (len(notes), int(max_count), cur_time))

            # Stop right away once the quota is reached; sleeping after the
            # final note would only delay the export.
            if len(notes) >= max_count:
                break

            if len(notes) % ROUND_COUNT == 0:
                print("round_delay: %s" % round_delay)
                sleep(round_delay)
            else:
                sleep(handle_delay)

        if not has_more:
            break
        page += 1

    df = pd.DataFrame(notes)
    out_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'notes')
    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(out_dir, exist_ok=True)
    stamp = time.strftime('%Y%m%d.%H%M%S', time.localtime(time.time()))
    df.to_excel(os.path.join(out_dir, 'notes.%s.xlsx' % stamp), index=False)
    print('notes to excel done!\n')
def greet(cookie, keyword, max_count, handle_delay, round_delay):
    """Gradio callback: forward the submitted form values to xhs_to_excel.

    Nothing is returned; the scrape result is produced by xhs_to_excel.
    """
    xhs_to_excel(
        cookie,
        keyword,
        max_count,
        handle_delay=handle_delay,
        round_delay=round_delay,
    )
if __name__ == '__main__':
    # Form fields, listed in the positional order that greet() expects.
    form_inputs = [
        gr.Textbox(label="Cookie"),
        gr.Textbox(label="搜索关键词"),
        gr.Number(label="爬虫数量"),
        gr.Number(label="操作时间间隔,单位秒", value=2),
        gr.Number(label="每轮时间间隔,单位秒", value=180),
    ]
    # No output components: greet() returns nothing to display.
    gr.Interface(
        fn=greet,
        inputs=form_inputs,
        outputs=[],
        title='Xhs爬虫',
    ).launch()