-
Notifications
You must be signed in to change notification settings - Fork 21
/
zhihu.py
154 lines (132 loc) · 5.05 KB
/
zhihu.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
# -*- coding:utf-8 -*-
import requests
import re
import sys
import json
import functools
import os
import sys
from bs4 import BeautifulSoup
from ConfigParser import ConfigParser
# Python 2 only: switch the interpreter-wide default string encoding to
# UTF-8 so that implicit str/unicode conversions elsewhere in this module
# (printing and writing scraped text) do not fail on non-ASCII characters.
# ``reload(sys)`` is called first because that is the usual Python 2 idiom
# for making ``sys.setdefaultencoding`` reachable again.
# NOTE(review): neither call exists on Python 3 -- confirm the target runtime.
reload(sys)
sys.setdefaultencoding('utf8')
def mysession():
    """Create a ``requests`` session and post the zhihu login form.

    Settings are read from ``conf.ini`` in the current working directory.
    The file has to provide three sections:

    * ``cookies`` -- cookies preloaded into the session,
    * ``header``  -- HTTP headers sent with the login request,
    * ``account`` -- form fields posted to the login endpoint.

    Returns:
        The ``requests.Session`` that issued the login request.  The login
        response is not inspected, so a returned session is not proof that
        the login succeeded.

    Raises:
        ConfigParser.NoSectionError: if ``conf.ini`` cannot be read or does
            not contain one of the sections listed above.
    """
    conf = ConfigParser()
    conf.read('conf.ini')

    def section(name):
        # Go through the public API instead of ``conf._sections``: on
        # Python 2 the private mapping carries a synthetic ``__name__``
        # entry that would otherwise be sent as a header / form field.
        # ``raw=True`` keeps the values verbatim (no ``%`` interpolation),
        # which matches what the private mapping used to give.
        return dict(conf.items(name, raw=True))

    cookies = section('cookies')
    header = section('header')
    post_data = section('account')

    session = requests.Session()
    # The configured cookies used to be loaded and then silently dropped;
    # attach them so they accompany the login and every later request.
    session.cookies.update(cookies)
    session.post('http://www.zhihu.com/login', data=post_data, headers=header)
    return session
class MyZhihu(object):
    """Print listings scraped from a zhihu user's profile pages.

    Args:
        session: a ``requests``-style session (anything with ``get(url)``
            returning an object with a ``text`` attribute).  Required.
        user: profile id used in the ``/people/<user>/...`` URLs.  Defaults
            to ``'Grapher'``, the id that was previously hard-coded.

    Raises:
        TypeError: if ``session`` is ``None``.
    """

    def __init__(self, session=None, user='Grapher'):
        if session is None:
            # Raising a bare string is itself a TypeError ("exceptions must
            # derive from BaseException"); raise a real exception that
            # carries the intended message instead.
            raise TypeError("Exception: session is None")
        self.zhi = 'http://www.zhihu.com'
        self.user = user
        self.session = session

    def _profile_soup(self, page):
        """Fetch ``/people/<user>/<page>`` and return the parsed document."""
        url = '%s/people/%s/%s' % (self.zhi, self.user, page)
        return BeautifulSoup(self.session.get(url).text)

    def _print_section_titles(self, page):
        """Print the ``<strong>`` text of every profile section on ``page``."""
        soup = self._profile_soup(page)
        for section in soup.find_all('div', 'zm-profile-section-main'):
            print(section.strong.get_text())

    # Questions asked by the user.
    def ask(self):
        """Print one ``<absolute url> <title>`` line per asked question."""
        soup = self._profile_soup('asks')
        for link in soup.find_all('a', 'question_link'):
            href = str(link.get('href'))
            text = str(link.get_text().encode('utf-8'))
            print(self.zhi + href + ' ' + text)

    # TODO: an ``inbox`` listing (http://www.zhihu.com/inbox, items in
    # ``div.zm-pm-item-main``) was sketched here but never finished.

    # Topics.
    def topic(self):
        """Print the name of every topic listed on the user's topics page."""
        self._print_section_titles('topics')

    # Columns.
    def column(self):
        """Print the name of every column the user follows."""
        self._print_section_titles('columns/followed')
class Question(object):
    """A zhihu question page, fetched once and parsed with BeautifulSoup.

    Args:
        qurl: URL of the question page.
        session: a ``requests``-style session used to fetch ``qurl``.

    Raises:
        TypeError: if ``qurl`` or ``session`` is ``None``.
    """

    def __init__(self, qurl=None, session=None):
        if qurl is None or session is None:
            # A bare string cannot be raised; use a real exception so the
            # caller actually sees this message.
            raise TypeError('Your question-url or session is None')
        html = session.get(qurl).text
        self.soup = BeautifulSoup(html)

    def title(self):
        """Return the stripped text of the question title element."""
        heading = self.soup.find('h2', 'zm-item-title zm-editable-content')
        return heading.get_text().strip()

    def question_detail(self):
        """Return the text of the first ``div.zm-editable-content``.

        Literal ``<>`` sequences are removed and the result is stripped.
        """
        detail = self.soup.find('div', 'zm-editable-content')
        return detail.get_text().replace('<>', '').strip()

    def an(self):
        """Return the ``data-num`` attribute of the first ``<h3>``, stripped.

        The value is returned as a string, not as a number.
        """
        return self.soup.h3['data-num'].strip()

    def answer_number(self):
        """Return the answer count formatted as ``<count>-人回答``."""
        return self.an() + '-人回答'

    def follower(self):
        """Return the text of the first ``div.zg-gray-normal``.

        Whitespace-separated words are joined with ``-``.
        """
        block = self.soup.find('div', 'zg-gray-normal')
        return '-'.join(block.get_text().split())

    def logging(log):
        """Decorator for one-argument methods, usable bare or with a message.

        ``@logging`` wraps the method without adding anything;
        ``@logging('text')`` prints ``text`` each time the method is called.
        It is a plain function (not a static method) because it is applied
        inside the class body while the class is still being defined.
        """
        if callable(log):
            func = log

            @functools.wraps(func)
            def internal(self):
                return func(self)
            return internal

        def wrapper(func):
            @functools.wraps(func)
            def internal(self):
                print(log)
                return func(self)
            return internal
        return wrapper

    @logging('生成中..............')
    def all_answer(self):
        """Yield the text of every answer on the page, line by line.

        For each ``div.zm-item-answer`` the generator first yields the text
        of its ``<h3>`` (the answer's author), then the answer body split at
        ``<br>`` tags with all remaining markup removed.  When the page
        reports zero answers a notice is printed and nothing is yielded.
        """
        # ``an()`` returns a string, so compare against '0' -- the previous
        # comparison with the integer 0 could never be true.
        if self.an() == '0':
            print("This question is no answers")
            return

        re_br = re.compile(r'<br/?>')
        re_allmark = re.compile(r'<[^>]+>', re.S)
        # All answers on the page.
        for each in self.soup.find_all('div', 'zm-item-answer '):
            yield each.h3.get_text()  # answer author
            content = str(each.find('div', ' zm-editable-content clearfix'))
            for line in re_br.sub('\n', content).split('\n'):
                yield re_allmark.sub('', line)

    def save(self, answers, path, pattern):
        """Write the question summary followed by ``answers`` to a text file.

        Args:
            answers: iterable of strings, e.g. the result of
                ``all_answer()``; each item is written on its own line.
            path: destination.  If it ends with a path separator the file is
                created inside that directory and named ``<title>.txt``;
                otherwise ``path`` itself is the file to write.
            pattern: mode string passed to ``open`` (for example ``'w'``).

        Raises:
            NotImplementedError: when not running on Linux.
            IOError: when the directory part of ``path`` does not exist.
        """
        title = self.title()
        question_detail = self.question_detail()
        answer_number = self.answer_number()
        followers = self.follower()

        # ``sys.platform`` is 'linux2' on Python 2 but 'linux' on Python 3;
        # a prefix test accepts both.
        if not sys.platform.startswith('linux'):
            raise NotImplementedError(
                'Other systems such as Windows are not currently supported')

        directory, basename = os.path.split(path)
        # An empty directory part means "current directory"; a bare file
        # name such as 'out.txt' used to be rejected as a missing path.
        if not os.path.exists(directory or os.curdir):
            raise IOError('You path is not exists')

        if basename:
            destpath = path
        else:
            # A '/' inside the title would be taken for a directory level.
            safe_title = title.strip().replace('/', '_')
            destpath = os.path.join(directory, safe_title + '.txt')

        with open(destpath, pattern) as f:
            f.write('标题:%s \n' % title
                    + '问题详情:%s \n' % question_detail
                    + '回答数:%s \t' % answer_number
                    + ' 关注数:%s \n' % followers)
            for each in answers:
                f.write(each + '\n')