From 6061752d648f57cacc20dbe944a83736650003d8 Mon Sep 17 00:00:00 2001
From: Nathan Workman
Date: Mon, 2 Apr 2018 00:56:49 -0400
Subject: [PATCH 1/3] Add Indeed spider MVP

---
 seeker/scraper/spiders/indeed.py | 60 ++++++++++++++++++++++++++++++++
 1 file changed, 60 insertions(+)
 create mode 100644 seeker/scraper/spiders/indeed.py

diff --git a/seeker/scraper/spiders/indeed.py b/seeker/scraper/spiders/indeed.py
new file mode 100644
index 0000000..73d9672
--- /dev/null
+++ b/seeker/scraper/spiders/indeed.py
@@ -0,0 +1,60 @@
+import scrapy
+
+from scrapy.spiders import Spider
+from scrapy.selector import Selector
+
+from scraper.items import JobItem
+
+from django.utils import timezone
+
+
+class IndeedSpider(Spider):
+    name = "indeed"
+    allowed_domains = ["indeed.com"]
+    # start_urls = ["https://www.indeed.com/q-Django-l-remote-jobs.html"]
+    start_urls = [
+        "https://www.indeed.com/jobs?q=django&l=Remote&limit=50",
+    ]
+    handle_httpstatus_list = [301, 302]
+
+    # def start_requests(self):
+    #     # location = ""
+    #     # distance = ""
+    #     # search_terms = "django"
+    #     # search_query = "q="
+    #     base_url = "https://www.indeed.com/jobs?q=Django&l=remote"
+    #     start_urls = []
+    #     start_urls.append(base_url)
+
+    #     return [scrapy.http.Request(url=start_url) for start_url in start_urls]
+
+    def parse(self, response):
+        # self.log('\n Crawling %s\n' % response.url)
+        hxs = Selector(response)
+        jobs = hxs.xpath('//td[@id="resultsCol"]/div')
+        items = []
+        for job in jobs:
+            item = JobItem()
+
+            # @title is an attribute node, so select it directly (no text()).
+            item['title'] = job.xpath('h2/a/@title').extract()
+            item['url'] = job.xpath('h2/a/@href').extract()
+            item['location'] = 'n/a'
+
+            # item['location'] = job.xpath('span[@class="location"]/span/text()').extract()
+
+            # Not all entries have a company; fall back to an empty value.
+            company = job.xpath("span[@class='company']/text()").extract()
+            item['company'] = company if company else [u'']
+            item['email'] = 'n/a'
+            item['body'] = job.xpath(
+                "table/tr/td/span[@class='summary']").extract()
+            item['salary'] = job.xpath(
+                "table/tr/td/span[@class='source']/text()").extract()
+            item['pub_date'] = job.xpath(
+                "table/tr/td/span[@class='date']/text()").extract()
+            item['scrape_date'] = timezone.now()
+            item['job_board'] = "Indeed"
+            item['board_url'] = "www.indeed.com"
+            items.append(item)
+        return items

From a44e48b844439445bfc342aca3c9066281cd5f5a Mon Sep 17 00:00:00 2001
From: Nathan Workman
Date: Tue, 3 Apr 2018 00:28:31 -0400
Subject: [PATCH 2/3] Use Indeed RSS feed to scrape job listings

---
 seeker/scraper/spiders/indeed.py | 77 +++++++++----------------------
 1 file changed, 22 insertions(+), 55 deletions(-)

diff --git a/seeker/scraper/spiders/indeed.py b/seeker/scraper/spiders/indeed.py
index 73d9672..cfa4b8b 100644
--- a/seeker/scraper/spiders/indeed.py
+++ b/seeker/scraper/spiders/indeed.py
@@ -1,60 +1,27 @@
-import scrapy
-
-from scrapy.spiders import Spider
-from scrapy.selector import Selector
-
+from scrapy.spiders import XMLFeedSpider
 from scraper.items import JobItem
 
 from django.utils import timezone
 
 
-class IndeedSpider(Spider):
-    name = "indeed"
-    allowed_domains = ["indeed.com"]
-    # start_urls = ["https://www.indeed.com/q-Django-l-remote-jobs.html"]
-    start_urls = [
-        "https://www.indeed.com/jobs?q=django&l=Remote&limit=50",
-    ]
-    handle_httpstatus_list = [301, 302]
-
-    # def start_requests(self):
-    #     # location = ""
-    #     # distance = ""
-    #     # search_terms = "django"
-    #     # search_query = "q="
-    #     base_url = "https://www.indeed.com/jobs?q=Django&l=remote"
-    #     start_urls = []
-    #     start_urls.append(base_url)
-
-    #     return [scrapy.http.Request(url=start_url) for start_url in start_urls]
-
-    def parse(self, response):
-        # self.log('\n Crawling %s\n' % response.url)
-        hxs = Selector(response)
-        jobs = hxs.xpath('//td[@id="resultsCol"]/div')
-        items = []
-        for job in jobs:
-            item = JobItem()
-
-            # @title is an attribute node, so select it directly (no text()).
-            item['title'] = job.xpath('h2/a/@title').extract()
-            item['url'] = job.xpath('h2/a/@href').extract()
-            item['location'] = 'n/a'
-
-            # item['location'] = job.xpath('span[@class="location"]/span/text()').extract()
-
-            # Not all entries have a company; fall back to an empty value.
-            company = job.xpath("span[@class='company']/text()").extract()
-            item['company'] = company if company else [u'']
-            item['email'] = 'n/a'
-            item['body'] = job.xpath(
-                "table/tr/td/span[@class='summary']").extract()
-            item['salary'] = job.xpath(
-                "table/tr/td/span[@class='source']/text()").extract()
-            item['pub_date'] = job.xpath(
-                "table/tr/td/span[@class='date']/text()").extract()
-            item['scrape_date'] = timezone.now()
-            item['job_board'] = "Indeed"
-            item['board_url'] = "www.indeed.com"
-            items.append(item)
-        return items
+class IndeedSpider(XMLFeedSpider):
+    name = 'indeed'
+    allowed_domains = ['indeed.com']
+    start_urls = ['http://rss.indeed.com/rss?q=django&l=remote']
+    iterator = 'iternodes'  # explicit, though 'iternodes' is the default
+    itertag = 'item'
+
+    def parse_node(self, response, node):
+        item = JobItem()
+        item['title'] = node.xpath('title/text()').extract_first()
+        item['company'] = node.xpath('source/text()').extract_first()
+        item['body'] = node.xpath('description/text()').extract_first()
+        item['pub_date'] = node.xpath('pubDate/text()').extract_first()
+        item['url'] = node.xpath('link/text()').extract_first()
+        item['scrape_date'] = timezone.now()
+        item['job_board'] = "Indeed"
+        item['board_url'] = "www.indeed.com"
+        item['email'] = 'n/a'
+        item['salary'] = 'n/a'
+        item['location'] = 'n/a'
+        return item

From a227db110e35ca3d27b84a8ddbac4c0b8f9e68a5 Mon Sep 17 00:00:00 2001
From: Nathan Workman
Date: Tue, 3 Apr 2018 00:32:57 -0400
Subject: [PATCH 3/3] Use CrawlerProcess to run multiple Scrapy crawlers
 simultaneously

---
 README.md       | 16 ++++++++++------
 seeker/crawl.py | 12 ++++++++++++
 2 files changed, 22 insertions(+), 6 deletions(-)
 create mode 100644 seeker/crawl.py

diff --git a/README.md b/README.md
index 40cc3d4..4d6ea9e 100644
--- a/README.md
+++ b/README.md
@@ -67,20 +67,24 @@ Navigate to the django admin to view your results.
 
 - [ ] Celery Beat - run spiders on a schedule.
 
 #### Spiders
-Want a spider not listed here? Feel free to open a pull request and add it to the list or implement the spider yourself.
+Want a spider not listed here? Feel free to open a pull request to add it to the list, or implement the spider yourself.
+
 - [x] [Stack Overflow](https://www.stackoverflow.com/jobs)
-- [ ] [Indeed](https://www.indeed.com)
-- [ ] [Dice](http://dice.com)
+- [x] [Indeed](https://www.indeed.com)
 - [ ] [Angel.co](https://angel.co/)
 - [ ] [RemotePython](https://www.remotepython.com)
 - [ ] [DjangoJobs](https://djangojobs.net/jobs/)
 - [ ] [DjangoGigs](https://djangogigs.com)
 - [ ] [Jobspresso](http://jobspresso.co)
-- [ ] [Authentic Jobs](http://authenticjobs.com/)
 - [ ] [We Work Remotely](https://weworkremotely.com/)
-- [ ] [Remotive](https://remotive.io)
 - [ ] [Python.org](https://www.python.org/jobs/)
-
+- [ ] [Working Nomads](https://www.workingnomads.co/jobs)
+- [ ] [Remote Work Hub](https://remoteworkhub.com)
+- [ ] [Telecommunity](http://remotejobs.telecommunity.net/#s=1)
+- [ ] [Remote Base](https://remotebase.io/)
+- [ ] [WFH](https://www.wfh.io)
+- [ ] [Remote OK](https://remoteok.io)
+- [ ] [Remotely Awesome Jobs](https://www.remotelyawesomejobs.com/remote-django-jobs)
 
 
diff --git a/seeker/crawl.py b/seeker/crawl.py
new file mode 100644
index 0000000..2cc75ad
--- /dev/null
+++ b/seeker/crawl.py
@@ -0,0 +1,12 @@
+from scrapy.utils.project import get_project_settings
+from scrapy.crawler import CrawlerProcess
+
+settings = get_project_settings()
+process = CrawlerProcess(settings)
+# https://doc.scrapy.org/en/latest/topics/api.html#scrapy.crawler.CrawlerProcess
+
+for spider_name in process.spider_loader.list():
+    print("Running spider %s" % spider_name)
+    process.crawl(spider_name)
+
+process.start()
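
A note on running crawl.py on its own: the spiders import scraper.items, which
(given the Django admin workflow the README describes) presumably touches Django
models, so Django needs to be configured before CrawlerProcess starts when the
script is run outside manage.py. Below is a minimal sketch of such a bootstrap,
assuming the settings module is named "seeker.settings" — that name is a guess,
not something these patches confirm.

    import os

    import django

    # Configure Django before scraper.items (and its models) is imported.
    # "seeker.settings" is an assumed module path, not confirmed by the patches.
    os.environ.setdefault("DJANGO_SETTINGS_MODULE", "seeker.settings")
    django.setup()

    from scrapy.crawler import CrawlerProcess
    from scrapy.utils.project import get_project_settings

    process = CrawlerProcess(get_project_settings())
    for spider_name in process.spider_loader.list():
        # CrawlerProcess.crawl accepts a spider name string and resolves it
        # through the same spider_loader used to enumerate the names.
        process.crawl(spider_name)
    process.start()  # starts the reactor; blocks until every spider finishes

With that in place the script can be invoked directly from the Scrapy project
directory (so get_project_settings() can find scrapy.cfg), and every discovered
spider is scheduled before the reactor starts once.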