details_extract.py
import scrapy
from collections import OrderedDict


class DetailsExtractSpider(scrapy.Spider):
    name = 'details_extract'
    allowed_domains = ['behindthename.com']
    start_urls = ['http://www.behindthename.com/submit/names/usage/eastern-african/']
    def parse(self, response):
        """
        Collect every country URL from the usage drop-down, send each country
        to 'each_page' to gather its pagination links, and scrape the entries
        in 'page_details'.
        """
        # The wrapper class alternates between 'nb-quickfilter' and 'nb2-quickfilter'.
        all_countries_page = response.css('div.nb-quickfilter > select[name=usage] > option')
        # If the first selector matched nothing, fall back to the other class name.
        if not all_countries_page:
            all_countries_page = response.css('div.nb2-quickfilter > select[name=usage] > option')
        # The first two <option> entries are placeholders, so skip them.
        countries_name = all_countries_page.css(' ::text').extract()[2:]
        all_country = all_countries_page.css(' ::attr(value)').extract()[2:]
        for country_name, country in zip(countries_name, all_country):
            # Carry the country name with the request so every item can be
            # tagged correctly, regardless of the order responses arrive in.
            yield scrapy.Request(url=response.urljoin(country),
                                 callback=self.each_page,
                                 cb_kwargs={'country_name': country_name})
    def each_page(self, response, country_name):
        """
        Collect the pagination links for this country (deduplicated, in the
        order they appear) and send each page to 'page_details' to be scraped.
        """
        links = list(OrderedDict.fromkeys(response.css('nav.pagination > a::attr(href)').extract()))
        for link in links:
            yield response.follow(url=response.urljoin(link),
                                  callback=self.page_details,
                                  cb_kwargs={'country_name': country_name})
    def page_details(self, response, country_name):
        """
        Extract the name, gender, country, location and description of every
        entry on the page.
        """
        general = response.css("div.browsename")
        for gen in general:
            using = gen.xpath('.//span//text()').extract()
            name = using[0]
            gender = using[1]
            all_text = ''.join(gen.xpath('.//text()').extract())
            yield {
                "Name": name,
                "Gender": gender,
                "Countries": country_name,
                "Location": ''.join(gen.css("span.listusage ::text").extract()),
                # Everything after the last span's text is the free-form description.
                "Description": all_text[all_text.index(using[-1]) + len(using[-1]):],
            }
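
# How to run (a minimal sketch; assumes Scrapy is installed and the output
# filename 'names.json' is arbitrary, not part of this repository):
#
#   scrapy runspider details_extract.py -o names.json
#
# or, from inside a Scrapy project that contains this spider:
#
#   scrapy crawl details_extract -o names.json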