#!/usr/bin/env python
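"""
Patu: a small breadth-first web spider.

Seed URLs are fetched with httplib2, same-site links are pulled out of the
HTML with lxml, and the crawl is spread across worker processes.

Usage: patu.py [options] [url ...]

Run with -h for the full list of options (see main() below).
"""
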
import httplib2
import sys

from lxml.html import fromstring
from optparse import OptionParser
from multiprocessing import Process, Queue
from urlparse import urlsplit, urljoin, urlunsplit


class Spinner(object):
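    """Minimal progress indicator: cycles through the characters | / - \\ on stderr."""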
    def __init__(self):
        self.status = 0
        self.locations = ['|', '/', '-', '\\']

    def spin(self):
        sys.stderr.write("%s\r" % self.locations[self.status])
        sys.stderr.flush()
        self.status = (self.status + 1) % 4

class Response(object):
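    """Plain record of one fetch: the url, HTTP status, body, and extracted links."""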
    def __init__(self, url, status_code=-1, content=None, links=None):
        self.url = url
        self.status_code = status_code
        self.content = content
        # Avoid a shared mutable default argument for links
        self.links = links if links is not None else []

class Patu(object):
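    """
    Breadth-first spider. Seed URLs come from the command line, stdin, or an
    input file; worker processes fetch pages from a task queue and report
    Response objects back on a done queue.
    """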
    def __init__(self, urls=None, spiders=1, spinner=True, verbose=False, depth=-1, input_file=None, generate=False):
        # Avoid a shared mutable default argument for urls
        urls = urls if urls is not None else []

        # Set up the multiprocessing bits
        self.processes = []
        self.task_queue = Queue()
        self.done_queue = Queue()
        self.next_urls = {}
        self.queued_urls = {}
        self.seen_urls = set()
        self.spinner = Spinner()

        # Generate the initial URLs, either from the command line, stdin, or a file
        if input_file:
            if input_file == '-':
                f = sys.stdin
            else:
                f = open(input_file)
            for line in f:
                bits = line.strip().split("\t")
                if bits == ['']:
                    continue
                elif len(bits) == 1:
                    self.next_urls[bits[0]] = None
                else:
                    self.next_urls[bits[0]] = bits[1]
            f.close()
        else:
            self.urls = []
            h = httplib2.Http(timeout=60)
            for url in urls:
                if not url.startswith("http://"):
                    url = "http://" + url
                # Follow initial redirects here to set self.constraints
                try:
                    resp, content = h.request(url)
                    url = resp['content-location']
                except Exception:
                    # This URL is no good. Keep it in the queue so the
                    # error shows up later.
                    pass
                self.urls.append(url)
                self.next_urls[url] = None
            self.constraints = [''] + [urlsplit(url).netloc for url in self.urls]

        self.spiders = spiders
        self.show_spinner = spinner
        self.verbose = verbose
        self.depth = depth
        self.input_file = input_file
        self.generate = generate

    def worker(self):
        """
        Function run by worker processes: pull URLs off the task queue
        until a 'STOP' sentinel arrives.
        """
        url = None  # so the except clause has something to report if interrupted early
        try:
            h = httplib2.Http(timeout=60)
            for url in iter(self.task_queue.get, 'STOP'):
                result = self.get_urls(h, url)
                self.done_queue.put(result)
        except KeyboardInterrupt:
            self.done_queue.put(Response(url, -1))

    def get_urls(self, h, url):
        """
        Fetch a single URL and return a Response holding its status, content,
        and the same-site links found in its HTML.
        """
        links = []
        try:
            resp, content = h.request(url)
            if self.input_file:
                # Short-circuit if we got our list of links from a file
                return Response(url, resp.status)
            elif resp.status != 200:
                return Response(url, resp.status)
            elif urlsplit(resp['content-location']).netloc not in self.constraints:
                # httplib2 follows redirects automatically
                # Check to make sure we've not been redirected off-site
                return Response(url, resp.status)
            else:
                html = fromstring(content)
        except Exception as e:
            print "%s %s" % (type(e), str(e))
            return Response(url)

        # Add relevant links
        for link in html.cssselect('a'):
            if 'href' not in link.attrib:
                # Skip links w/o an href attribute
                continue
            href = link.attrib['href']
            absolute_url = urljoin(resp['content-location'], href.strip())
            parts = urlsplit(absolute_url)
            if parts.netloc in self.constraints and parts.scheme == 'http':
                # Ignore the #fragment at the end of the url
                no_fragment = parts[:4] + ('',)
                links.append(urlunsplit(no_fragment))
        return Response(url, resp.status, content, links)

    def process_next_url(self):
        response = self.done_queue.get()
        referer = self.queued_urls[response.url]
        result = '[%s] %s (from %s)' % (response.status_code, response.url, referer)
        if response.status_code == 200:
            if self.verbose:
                print result
                sys.stdout.flush()
            elif self.generate:
                print "%s\t%s" % (response.url, referer)
            elif self.show_spinner:
                self.spinner.spin()
        else:
            print result
            sys.stdout.flush()
        self.seen_urls.add(response.url)
        del self.queued_urls[response.url]
        for link in response.links:
            if link not in self.seen_urls and link not in self.queued_urls:
                # remember what url referenced this link
                self.next_urls[link] = response.url

    def crawl(self):
        # Track how many levels deep the breadth-first crawl has gone
        current_depth = 0

        try:
            # Start worker processes
            for i in range(self.spiders):
                p = Process(target=self.worker)
                p.start()
                self.processes.append(p)

            while len(self.next_urls) > 0 and (current_depth <= self.depth or self.depth == -1):
                if self.verbose:
                    print "Starting link depth %s" % current_depth
                    sys.stdout.flush()

                # Place the next urls into the task queue, possibly
                # short-circuiting if we're generating them
                for url, referer in self.next_urls.iteritems():
                    self.queued_urls[url] = referer
                    if self.generate and current_depth == self.depth:
                        self.done_queue.put(Response(url, 200))
                    else:
                        self.task_queue.put(url)
                self.next_urls = {}

                while len(self.queued_urls) > 0:
                    self.process_next_url()

                current_depth += 1
        except KeyboardInterrupt:
            pass
        finally:
            # Give the spiders a chance to exit cleanly
            for i in range(self.spiders):
                self.task_queue.put('STOP')
            for p in self.processes:
                # Forcefully close the spiders
                p.terminate()
                p.join()


def main():
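    """Parse command-line options and start the crawl."""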
    parser = OptionParser()
    options_a = [
        ["-s", "--spiders", dict(dest="spiders", type="int", default=1, help="sends more than one spider")],
        ["-S", "--nospinner", dict(dest="spinner", action="store_false", default=True, help="turns off the spinner")],
        ["-v", "--verbose", dict(dest="verbose", action="store_true", default=False, help="outputs every request (implies --nospinner)")],
        ["-d", "--depth", dict(dest="depth", type="int", default=-1, help="does a breadth-first crawl, stopping after DEPTH levels")],
        ['-g', '--generate', dict(dest='generate', action='store_true', default=False, help='generate a list of crawled URLs on stdout')],
        ['-i', '--input', dict(dest='input_file', type='str', default='', help='file of URLs to crawl')],
    ]
    for s, l, k in options_a:
        parser.add_option(s, l, **k)
    (options, args) = parser.parse_args()

    # Submit the initial urls
    urls = [unicode(url) for url in args]
    kwargs = {
        'urls': urls,
        'spiders': options.spiders,
        'spinner': options.spinner,
        'verbose': options.verbose,
        'depth': options.depth,
        'generate': options.generate,
        'input_file': options.input_file,
    }
    spider = Patu(**kwargs)
    spider.crawl()
    print


if __name__ == '__main__':
    sys.exit(main())
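
# Example invocations (host and file names below are only placeholders):
#
#   ./patu.py example.com                # crawl example.com with a single spider
#   ./patu.py -s 4 -d 2 -v example.com   # four spiders, two levels deep, log every request
#   ./patu.py -g -d 1 example.com        # print discovered URLs, tab-separated with their referers
#   ./patu.py -i urls.txt                # re-check the status of URLs listed in urls.txt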