-
Notifications
You must be signed in to change notification settings - Fork 0
/
scraper.py
35 lines (31 loc) · 1.63 KB
/
scraper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
#!/usr/bin/python
# Names of the table rows to extract; each one is substituted into the row id
# "places_<name>__row" by the scraper functions in this module.
FIELDS = (
    'area',
    'population',
    'iso',
    'country',
    'capital',
    'continent',
    'tld',
    'currency_code',
    'currency_name',
    'phone',
    'postal_code_format',
    'postal_code_regex',
    'languages',
    'neighbours',
)
import re
def re_scraper(html, fields=None):
    """Extract field values from page markup using regular expressions.

    For every field name, search ``html`` for a ``<tr`` tag whose id is
    ``places_<field>__row`` and capture the contents of the first
    ``<td class="w2p_fw">`` cell that follows it.

    Args:
        html: Page markup as a string.
        fields: Iterable of field names to extract. When omitted or ``None``
            the module-level ``FIELDS`` tuple is used.

    Returns:
        A dict mapping each field name to the raw inner text/markup of its
        ``w2p_fw`` cell (no tag stripping is performed).

    Raises:
        AttributeError: If a field has no match, because ``re.search``
            returns ``None`` in that case.

    Note:
        No regex flags are passed, so ``.`` does not match newlines: the row
        tag and its value cell must sit on the same line of ``html``.
    """
    if fields is None:
        fields = FIELDS
    results = {}
    for field in fields:
        # Escape the name so characters such as "." or "+" in a caller-supplied
        # field are matched literally rather than interpreted as regex syntax.
        pattern = r'<tr id="places_{}__row".*?<td class="w2p_fw">(.*?)</td>'.format(
            re.escape(field)
        )
        results[field] = re.search(pattern, html).groups()[0]
    return results
from bs4 import BeautifulSoup
def bs_scraper(html, fields=None):
    """Extract field values from page markup using BeautifulSoup.

    Parses ``html`` with the ``html.parser`` backend, takes the first
    ``<table>`` in the document, and for every field name reads the text of
    the first element with class ``w2p_fw`` inside the row whose id is
    ``places_<field>__row``.

    Args:
        html: Page markup as a string.
        fields: Iterable of field names to extract. When omitted or ``None``
            the module-level ``FIELDS`` tuple is used.

    Returns:
        A dict mapping each field name to the text content of its value cell.

    Raises:
        AttributeError: If the table, a row, or a value cell is not found,
            because ``find`` returns ``None`` in that case.
    """
    if fields is None:
        fields = FIELDS
    soup = BeautifulSoup(html, 'html.parser')
    # The table lookup does not depend on the field, so do it once instead of
    # once per loop iteration.
    table = soup.find('table')
    results = {}
    for field in fields:
        row = table.find('tr', id='places_{}__row'.format(field))
        # NOTE(review): the original author flagged the cell lookup with
        # "#fault!!!" and settled on matching by class attribute only (any tag
        # name). The reason is not shown here, so that lookup is kept as-is.
        cell = row.find(attrs={'class': 'w2p_fw'})
        results[field] = cell.text
    return results
import lxml.html
def lxml_scraper(html, fields=None):
    """Extract field values from page markup using lxml and CSS selectors.

    Parses ``html`` with ``lxml.html.fromstring`` and, for every field name,
    takes the first element matching
    ``table > tr#places_<field>__row > td.w2p_fw`` and reads its text content.

    Args:
        html: Page markup as a string.
        fields: Iterable of field names to extract. When omitted or ``None``
            the module-level ``FIELDS`` tuple is used.

    Returns:
        A dict mapping each field name to the text content of its value cell.

    Raises:
        IndexError: If the selector for a field matches no element, because
            the first item of an empty result list is requested.
    """
    if fields is None:
        fields = FIELDS
    tree = lxml.html.fromstring(html)
    results = {}
    for field in fields:
        selector = 'table > tr#places_{}__row > td.w2p_fw'.format(field)
        results[field] = tree.cssselect(selector)[0].text_content()
    return results