Merge pull request #7 from NathanWorkman/feature/indeed

Indeed mvp
NathanWorkman · Apr 3, 2018 · 489b688 · 489b688
2 parents d048965 + a227db1
commit 489b688
Show file tree

Hide file tree

Showing 3 changed files with 49 additions and 6 deletions.
diff --git a/README.md b/README.md
@@ -67,20 +67,24 @@ Navigate to the django admin to view your results.
 - [ ] Celery Beat - run spiders on a schedule.
 
 #### Spiders
-Want a spider not listed here? Feel free to open a pull request and add it to the list or implement the spider yourself. 
+Want a spider not listed here? Feel free to open a pull request and add it to the list or implement the spider yourself.
+
 - [x] [Stack Overflow](https://www.stackoverflow.com/jobs)
-- [ ] [Indeed](https://www.indeed.com)
-- [ ] [Dice](http://dice.com)
+- [x] [Indeed](https://www.indeed.com)
 - [ ] [Angel.co](https://angel.co/)
 - [ ] [RemotePython](https://www.remotepython.com)
 - [ ] [DjangoJobs](https://djangojobs.net/jobs/)
 - [ ] [DjangoGigs](https://djangogigs.com)
 - [ ] [Jobspresso](http://jobspresso.co)
-- [ ] [Authentic Jobs](http://authenticjobs.com/)
 - [ ] [We Work Remotely](https://weworkremotely.com/)
-- [ ] [Remotive](https://remotive.io)
 - [ ] [Python.org](https://www.python.org/jobs/)
-
+- [ ] [Working Nomads](https://www.workingnomads.co/jobs)
+- [ ] [Remote Work Hub](https://remoteworkhub.com)
+- [ ] [Telecommunity](http://remotejobs.telecommunity.net/#s=1)
+- [ ] [Remote Base](https://remotebase.io/)
+- [ ] [WFH](https://www.wfh.io)
+- [ ] [Remote OK](https://remoteok.io)
+- [ ] [Remotely Awesome Jobs](https://www.remotelyawesomejobs.com/remote-django-jobs)
 
 
 

diff --git a/seeker/crawl.py b/seeker/crawl.py
@@ -0,0 +1,12 @@
+from scrapy.utils.project import get_project_settings
+from scrapy.crawler import CrawlerProcess
+
+setting = get_project_settings()
+process = CrawlerProcess(setting)
+# https://doc.scrapy.org/en/latest/topics/api.html#scrapy.crawler.CrawlerProcess
+
+for spider in process.spiders.list():
+    print("Running spider %s" % (spider))
+    process.crawl(spider)
+
+process.start()
diff --git a/seeker/scraper/spiders/indeed.py b/seeker/scraper/spiders/indeed.py
@@ -0,0 +1,27 @@
+from scrapy.spiders import XMLFeedSpider
+from scraper.items import JobItem
+
+from django.utils import timezone
+
+
+class IndeedSpider(XMLFeedSpider):
+    name = 'indeed'
+    allowed_domains = ['indeed.com']
+    start_urls = ['http://rss.indeed.com/rss?q=django&l=remote']
+    iterator = 'iternodes'  # This is actually unnecessary, since it's the default value
+    itertag = 'item'
+
+    def parse_node(self, response, node):
+        item = JobItem()
+        item['title'] = node.xpath('title/text()').extract_first()
+        item['company'] = node.xpath('source/text()').extract_first()
+        item['body'] = node.xpath('description/text()').extract()
+        item['pub_date'] = node.xpath('pubDate/text()').extract_first()
+        item['url'] = node.xpath('link/text()').extract_first()
+        item["scrape_date"] = timezone.now()
+        item["job_board"] = "Indeed"
+        item["board_url"] = "www.indeed.com"
+        item["email"] = str('n/a')
+        item["salary"] = str('n/a')
+        item['location'] = str('n/a')
+        return item