forked from cilame/any-whim
-
Notifications
You must be signed in to change notification settings - Fork 0
/
获取网站文本.py
36 lines (34 loc) · 1.08 KB
/
获取网站文本.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
# -*- coding:utf-8 -*-
import requests
import re
from lxml import etree
# Matches a text run sitting between two tags that contains at least one
# Chinese ideograph (U+4E00-U+9FA5). Such runs get a trailing space appended
# so that adjacent runs do not stick together once the markup is stripped.
_CJK_TEXT_RE = re.compile(r'>([^>]*[\u4e00-\u9fa5]+[^<]*)<')


def _remove_keep_tail(node):
    """Detach *node* from its parent while keeping the text that follows it."""
    parent = node.getparent()
    if parent is None:
        return
    tail = node.tail
    if tail:
        # In lxml the tail text belongs to the element, so remove() would
        # throw it away; re-attach it to whatever precedes the node first.
        previous = node.getprevious()
        if previous is not None:
            previous.tail = (previous.tail or '') + tail
        else:
            parent.text = (parent.text or '') + tail
    parent.remove(node)


def normal_content(content,
                   tags=('script', 'style', 'select', 'noscript'),
                   rootxpath='//html'):
    """Extract the whitespace-normalized visible text of an HTML document.

    General-purpose text extraction helper.

    Args:
        content: The HTML document as ``bytes`` or ``str``. Bytes are decoded
            as UTF-8 first, falling back to GBK if that fails.
        tags: Tag names whose elements (including their subtree) are removed
            before the text is collected.
        rootxpath: XPath expression selecting the node whose text is
            returned; it is inserted into ``normalize-space(...)``.

    Returns:
        The extracted text with runs of whitespace collapsed and the ends
        stripped. An empty string is returned for empty/whitespace-only input
        or when the parser produces no document.

    Raises:
        TypeError: If ``content`` is neither ``bytes`` nor ``str``.
        UnicodeDecodeError: If bytes input is valid in neither UTF-8 nor GBK.
    """
    if isinstance(content, bytes):
        try:
            text = content.decode('utf-8')
        except UnicodeDecodeError:
            text = content.decode('gbk')
    elif isinstance(content, str):
        text = content
    else:
        raise TypeError('content type must in [bytes, str].')

    # Nothing to parse; avoid handing an empty document to the HTML parser.
    if not text.strip():
        return ''

    # Work around Chinese text from neighbouring elements sticking together
    # on some pages: pad each such text run with a space before its next tag.
    text = _CJK_TEXT_RE.sub(r'>\g<1> <', text)

    root = etree.HTML(text)
    if root is None:
        return ''

    # Collect first, remove afterwards, so the tree is not modified while it
    # is being iterated. Comments and processing instructions have a
    # non-string ``tag`` and are dropped as well.
    doomed = [node for node in root.iter()
              if not isinstance(node.tag, str) or node.tag in tags]
    for node in doomed:
        _remove_keep_tail(node)

    extracted = root.xpath('normalize-space({})'.format(rootxpath))
    return extracted.strip()
# Demo run: download one page and print the text extracted from it.
url = 'http://www.pingan.com/official/insurance?secondclass=4617947d7ded73bd&flag=47b8b71a21e9494e'
# Without a timeout requests waits indefinitely if the server never answers.
s = requests.get(url, timeout=30)
# Pass the raw bytes so normal_content does its own UTF-8/GBK decoding.
t = normal_content(s.content)
print(t)