-
Notifications
You must be signed in to change notification settings - Fork 1
/
test.py
82 lines (79 loc) · 2.44 KB
/
test.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
import os
import re
import shutil
import urllib
import urllib.request
from collections import deque

from bs4 import BeautifulSoup
def saveFile(data, fname):
    """Write the text *data* to the file *fname*, best effort.

    The file is created or truncated and written as UTF-8.  Failures that
    stem from the target path or the payload (missing directory, invalid or
    over-long file name, permission problems, unencodable text, non-text
    data) are swallowed on purpose so that one bad page never aborts the
    caller; in that case the file may be missing or empty.

    Args:
        data: Text to write.
        fname: Path of the file to create or overwrite.

    Returns:
        None, whether or not the write succeeded.
    """
    try:
        # 'with' guarantees the handle is closed even when write() fails.
        with open(fname, 'w', encoding='utf-8') as fobj:
            fobj.write(data)
    except (OSError, TypeError, ValueError):
        # Deliberately narrower than a bare 'except': KeyboardInterrupt and
        # SystemExit must still be able to stop the program.
        return
#main
# Breadth-first crawl starting at START_URL.  Every page that can be fetched,
# decoded as UTF-8 and has a <title> is saved twice: the raw HTML under
# htmlpath and its visible text under txtpath.  Links are taken from quoted
# "http://..." strings that are followed by a title="..." attribute.
START_URL = 'http://www.yahoo.com'
MAX_PAGES = 100  # stop once this many pages have been saved
FETCH_TIMEOUT = 2  # seconds allowed per request
# A quoted absolute http:// URL followed, on the same line, by the word "title".
LINK_RE = re.compile(r'"(http://.+?)".+?title')
# Characters that would break a title when used as a single file name.
UNSAFE_FILENAME_CHARS = re.compile(r'[/\\\0]')

htmlpath = './pages/html/'
txtpath = './pages/txt/'

# Start from empty output directories.  This is done in-process (instead of
# spawning an un-awaited "rm -rf" shell command) so the cleanup is guaranteed
# to be finished before the first page is written, and the directories are
# created if they do not exist yet.
for outdir in (htmlpath, txtpath):
    os.makedirs(outdir, exist_ok=True)
    with os.scandir(outdir) as entries:
        stale = [entry for entry in entries if not entry.name.startswith('.')]
    for entry in stale:
        if entry.is_dir(follow_symlinks=False):
            shutil.rmtree(entry.path)
        else:
            os.remove(entry.path)

mp = {}#map for title to url
queueurl = deque([START_URL])#queue for BFS
cnt = 0
visited = {START_URL}#set for visited url, to avoid duplicated visited
while queueurl and cnt < MAX_PAGES:
    url = queueurl.popleft()
    print(cnt, 'opening --->', url)
    try:
        with urllib.request.urlopen(url, timeout=FETCH_TIMEOUT) as response:
            data = response.read().decode('UTF-8')
        soup = BeautifulSoup(data, 'lxml')
    except Exception as exc:
        # Best-effort crawl: any fetch/decode/parse failure just skips the
        # page.  Unlike a bare 'except', Ctrl-C still stops the crawl.
        print('skip --->', url, exc)
        continue
    title_tag = soup.title
    if title_tag is None or title_tag.string is None:
        # Without a plain-text title there is nothing to name the files after.
        print('skip --->', url, 'no usable <title>')
        continue
    # Plain str copy, so the map below does not keep the parse tree alive.
    title = str(title_tag.string)
    urlList = LINK_RE.findall(data)
    #save html
    cnt = cnt + 1
    print(cnt, ' save---> ', title, url)
    # A '/' (or backslash / NUL) in the title would otherwise make the path
    # invalid and the save would silently fail.
    safe_title = UNSAFE_FILENAME_CHARS.sub('_', title)
    #path could be changed
    htmlfnm = str(cnt) + '-' + safe_title + '.html'
    saveFile(data, htmlpath + htmlfnm)
    mp[title] = url
    #handle html: pad elements with whitespace so that words of adjacent
    #elements do not run together in the extracted text
    title_tag.append('\n')
    title_tag.insert_before(' ')
    for tag_name in ('script', 'style'):
        for element in soup.find_all(tag_name):
            element.extract()
    for tag_name in ('span', 'a'):
        for element in soup.find_all(tag_name):
            element.append(' ')
    for element in soup.find_all(class_='Icon'):
        element.extract()
    text = soup.get_text()
    txtfnm = str(cnt) + '-' + safe_title + '.txt'
    saveFile(text, txtpath + txtfnm)
    #handle end
    for raw_link in urlList:
        # Drop escaping backslashes from links such as "http://host\/path".
        link = raw_link.replace('\\', '')
        if link in visited:
            continue
        # Look the link up literally (as it appears in the page): without
        # re.escape, characters like '?' or '+' in the URL act as regex
        # operators and the title is never found.
        title_pattern = '"' + re.escape(raw_link) + '".+?title="(.*?)"'
        link_titles = re.findall(title_pattern, data)
        # Only links that carry at least one non-empty title are followed.
        link_title = next((found for found in link_titles if found), None)
        if link_title is None:
            continue
        visited.add(link)
        queueurl.append(link)
        print('Enqueue--->', link_title, link)
#end main