# -*- coding: utf-8 -*-
'''
Created on Mar 01, 2011
@author: Mourad Mourafiq
@copyright: Copyright © 2011
other contributors:
'''
from bs4 import BeautifulSoup
import cPickle as pickle
import re
import random
import time
import urllib
import urllib2
import urlparse
url_base = "http://www.pj.ma/pagesjaunes?"
class PJ(object):
"""
PJ object that executes queries and returns set of results
URL templates to make PJ searches.
    http://www.pj.ma/pagesjaunes?page=2&pro_quiquoi=ophtalmologue&pro_ou=Casablanca
    http://www.pj.ma/pagesjaunes?
        page= page number
        &pro_quiquoi= object of the search
        &pro_ou= location
"""
def __init__(self, pause=5.0, page=1, query="", location=""):
"""
        @type pause: float
        @param pause: seconds to wait between requests, so as not to overburden the server
@type page: int
@param page: pagination
@type query: str
@param query: the object of the search
@type location: str
@param location: where to look
@rtype: object
@return: the instance of PJ
"""
self.pause = pause
self.page = page
self.query = query
self.location = location
def set_pause(self, pause):
self.pause = pause
    def set_page(self, page=0):
        # Jump to the given page if one is supplied, otherwise advance to the next page.
        self.page = page if page > 0 else self.page + 1
def get_page(self):
return self.page
def set_query(self, query):
self.query = query
def set_location(self, location):
self.location = location
    def __url_construction(self):
"""
Construct the search url
"""
url_search = url_base
#page
page = "page=%(page)s&" % {"page":self.page}
url_search += page
# pro_quiquoi
query = "pro_quiquoi=%(query)s&" % {"query":self.query}
url_search += query
# pro_ou
location = "pro_ou=%(location)s&" % {"location":self.location}
url_search += location
return url_search
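    # A hedged alternative sketch, not used elsewhere in the class: the same URL
    # could be built with urllib.urlencode, which also percent-escapes accented
    # query terms. The method name below is illustrative, not part of the
    # original code; the parameter names come from the class docstring.
    def __url_construction_encoded(self):
        params = urllib.urlencode([("page", self.page),
                                   ("pro_quiquoi", self.query),
                                   ("pro_ou", self.location)])
        return url_base + params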
    # Run the search for the current query and pickle each parsed record to the given file.
def search(self, file=None):
"""
Returns search results for the current query as a iterator.
"""
# pause, so as to not overburden PJ
#time.sleep(self.pause+(random.random()-0.5)*5)
# Prepare the URL of the first request.
        url_search = self.__url_construction()
print url_search
        # Request the PJ search results page, retrying on captchas and errors.
        stat = True
        while stat:
            try:
                html = self.__get_result(url_search)
                # Parse the response and extract the summaries.
                soup = BeautifulSoup(html, "html.parser")
                if soup.findAll(text=re.compile("captcha")) != []:
                    print "Failed page " + str(self.get_page()) + ", captcha, retrying"
                else:
                    stat = False
            except Exception:
                print "Failed page " + str(self.get_page()) + ", retrying"
            if stat:
                # Wait a little before retrying, so as not to hammer the server.
                time.sleep(4)
if soup.findAll(text=re.compile("cette recherche")) != []:
print soup.findAll(text=re.compile("cette recherche"))
return False
        # Each <li class="gauchezonebcenter"> holds one listing; build a
        # "field | field | ..." record from it.
        for table in soup.findAll("li", {"class": "gauchezonebcenter"}):
            result = ""
            try:
prof = ' '.join(re.findall('\w+', table.findNext("h2", {"class": "annoncesd-ttre"}).a.findNext(text=True)))
result += prof + ' | '
activity = ' '.join(re.findall('\w+', table.findNext("div", {"class": "annoncesd-Activite"}).span.findNextSiblings(text=True)[0]))
result += activity + ' | '
address_phone = table.findNext("li", {"class": "annoncesd-adressec"})
glo_address = address_phone.div.div
address = ' '.join(re.findall('\w+', glo_address.next.string))
result += address + ' | '
city = ' '.join(re.findall('\w+', glo_address.strong.string))
result += city + ' | '
phones = address_phone.span.findNextSibling('strong')
phone1 = ' '.join(re.findall('\w+', phones.string))
result += phone1 + ' | '
phone2 = ' '.join(re.findall('\w+', phones.findNextSibling('strong').string))
result += phone2 + ' | '
            except Exception:
                # Skip fields whose markup does not match the expected layout.
                pass
pickle.dump(result, file)
return True
    # Fetch the given URL and return the raw HTML of the response.
    def __get_result(self, url):
        """
        Request the given URL and return the response page.
        @type url: str
        @param url: URL to retrieve.
        @rtype: str
        @return: Web page retrieved for the given URL.
        @raise IOError: An exception is raised on error.
        @raise urllib2.URLError: An exception is raised on error.
        @raise urllib2.HTTPError: An exception is raised on error.
        """
request = urllib2.Request(url)
request.add_header('User-Agent',
'Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 6.0)')
response = urllib2.urlopen(request)
html = response.read()
response.close()
return html
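    # A hedged sketch, not called anywhere in this file: if the site ever
    # required session cookies, an opener backed by cookielib could replace the
    # plain urllib2.urlopen call above. The method name is illustrative only.
    def __get_result_with_cookies(self, url):
        import cookielib
        cookie_jar = cookielib.CookieJar()
        opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cookie_jar))
        request = urllib2.Request(url)
        request.add_header('User-Agent',
                           'Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 6.0)')
        response = opener.open(request)
        html = response.read()
        response.close()
        return html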
# When run as a script, crawl every result page for the hard-coded query and
# pickle the records to medecins.txt.
if __name__ == "__main__":
prof = open("medecins.txt", "w")
query = 'medecin'
pj = PJ()
pj.set_query(query)
stat = True
while stat:
stat = pj.search(prof)
pj.set_page()
prof.close()
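# A hedged sketch of how the pickled records could be read back and split into
# fields. The file name mirrors the example above, the "|" separator is the one
# used in search(), and the function name is illustrative only.
def read_results(path="medecins.txt"):
    records = []
    dump = open(path, "r")
    try:
        while True:
            try:
                # Each pickle.load call returns one record written by search().
                records.append(pickle.load(dump))
            except EOFError:
                break
    finally:
        dump.close()
    # Split each "field | field | ..." record into a clean list of fields.
    return [[field.strip() for field in record.split('|') if field.strip()]
            for record in records]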