-
Notifications
You must be signed in to change notification settings - Fork 6
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #21 from NathanWorkman/release/v0.5.0
Release/v0.5.0
- Loading branch information
Showing
18 changed files
with
642 additions
and
412 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,6 +1,7 @@ | ||
from django.contrib import admin

from .models import Job, Board, SearchTerms

# Register the job-board models with the default ModelAdmin so they are
# editable in the Django admin. No custom ModelAdmin options are needed yet.
admin.site.register(Board)
admin.site.register(Job)
admin.site.register(SearchTerms)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,24 @@ | ||
# Generated by Django 2.0.5 on 2018-05-14 19:54

from django.db import migrations, models


class Migration(migrations.Migration):
    """Create the SearchTerms model and give Job a default ordering.

    Auto-generated; the operations must stay exactly as generated so the
    migration history remains consistent with databases already migrated.
    """

    dependencies = [
        ('job', '0001_initial'),
    ]

    operations = [
        # New lookup table of search terms used by the scrapers.
        migrations.CreateModel(
            name='SearchTerms',
            fields=[
                ('id', models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')),
                ('term', models.CharField(max_length=55)),
            ],
        ),
        # Jobs now list oldest-scraped first by default (Meta.ordering).
        migrations.AlterModelOptions(
            name='job',
            options={'ordering': ['scrape_date']},
        ),
    ]
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,49 @@ | ||
import scrapy
from scrapy.spiders import Spider
from scrapy.selector import Selector
from scraper.items import JobItem
from scrapy.http import Request

from django.utils import timezone


class GreenHouseSpider(Spider):
    """Find Django jobs hosted on greenhouse.io via a Google ``site:`` search.

    The spider crawls Google result pages (``allowed_domains``) and follows
    each result's cited URL to the Greenhouse posting, yielding one
    :class:`JobItem` per posting.
    """

    name = "greenhouse"
    allowed_domains = ["google.com"]

    def start_requests(self):
        """Build the seed request: Greenhouse + django, restricted to the past month."""
        search_query = "q=site:greenhouse.io+django&tbs=qdr:m"
        base_url = "https://www.google.com/search?"
        start_urls = [base_url + search_query]
        # Use the imported Request directly (was scrapy.http.Request).
        return [Request(url=start_url) for start_url in start_urls]

    def parse(self, response):
        """Extract job detail urls from response."""
        urls = Selector(response).xpath('//cite/text()').extract()
        for url in urls:
            # dont_filter=True: detail hosts live outside allowed_domains.
            yield Request(url, callback=self.parse_detail_pages, dont_filter=True)
            # Replaces a leftover debug print(url) with proper spider logging.
            self.logger.debug("Queued Greenhouse detail page: %s", url)

    def parse_detail_pages(self, response):
        """Build JobItems from a Greenhouse posting page."""
        jobs = Selector(response).xpath('//div[contains(@id, "app_body")]')
        items = []
        for job in jobs:
            item = JobItem()
            # NOTE(review): inner paths start with '//' so they search the whole
            # document rather than being relative to `job`; harmless while a
            # posting page contains a single app_body — confirm before reusing.
            item["title"] = job.xpath('//h1[contains(@class, "app-title")]/text()').extract_first()
            item["company"] = 'n/a'  # not exposed on the posting page
            item["body"] = job.xpath('//div[contains(@id, "content")]').extract()
            item["location"] = job.xpath('//div[contains(@class, "location")]').extract_first()
            item["url"] = response.request.url
            item["pub_date"] = 'n/a'
            item["email"] = 'n/a'
            item["salary"] = 'n/a'
            item["scrape_date"] = timezone.now()
            item["job_board"] = "Greenhouse"
            item["board_url"] = "www.greenhouse.io"
            items.append(item)
        return items
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,49 @@ | ||
import scrapy
from scrapy.spiders import Spider
from scrapy.selector import Selector
from scraper.items import JobItem
from scrapy.http import Request

from django.utils import timezone


class LeverSpider(Spider):
    """Find Django jobs hosted on lever.co via a Google ``site:`` search.

    Crawls Google result pages and follows each cited URL to the Lever
    posting, yielding one :class:`JobItem` per posting.
    """

    name = "lever"
    allowed_domains = ["google.com"]

    def start_requests(self):
        """Build the seed request: Lever + django, restricted to the past month."""
        search_query = "q=site:lever.co+django&tbs=qdr:m"
        base_url = "https://www.google.com/search?"
        start_urls = [base_url + search_query]
        # Use the imported Request directly (was scrapy.http.Request).
        return [Request(url=start_url) for start_url in start_urls]

    def parse(self, response):
        """Extract job detail urls from response."""
        urls = Selector(response).xpath('//cite/text()').extract()
        for url in urls:
            # dont_filter=True: detail hosts live outside allowed_domains.
            yield Request(url, callback=self.parse_detail_pages, dont_filter=True)
            # Replaces a leftover debug print(url) with proper spider logging.
            self.logger.debug("Queued Lever detail page: %s", url)

    def parse_detail_pages(self, response):
        """Build JobItems from a Lever posting page."""
        jobs = Selector(response).xpath('//div[contains(@class, "content")]')
        items = []
        for job in jobs:
            item = JobItem()
            # NOTE(review): inner paths start with '//' so they search the whole
            # document rather than being relative to `job`; harmless while a
            # posting page contains a single content div — confirm before reusing.
            item["title"] = job.xpath('//div[contains(@class, "posting-headline")]/h2/text()').extract_first()
            item["company"] = 'n/a'  # not exposed on the posting page
            item["body"] = job.xpath('//div[contains(@class, "section page-centered")]').extract()
            item["location"] = job.xpath('//div[contains(@class, "sort-by-time posting-category medium-category-label")]').extract_first()
            item["url"] = response.request.url
            item["pub_date"] = 'n/a'
            item["email"] = 'n/a'
            item["salary"] = 'n/a'
            item["scrape_date"] = timezone.now()
            item["job_board"] = "Lever"
            item["board_url"] = "lever.co"
            items.append(item)
        return items
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,49 @@ | ||
import scrapy
from scrapy.spiders import Spider
from scrapy.selector import Selector
from scraper.items import JobItem
from scrapy.http import Request

from django.utils import timezone


class RecruiterBoxSpider(Spider):
    """Find Django jobs hosted on recruiterbox.com via a Google ``site:`` search.

    Crawls Google result pages and follows each cited URL to the Recruiterbox
    posting, yielding one :class:`JobItem` per posting.
    """

    name = "recruiterbox"
    allowed_domains = ["google.com"]

    def start_requests(self):
        """Build the seed request: Recruiterbox + django, restricted to the past month."""
        search_query = "q=site:recruiterbox.com+django&tbs=qdr:m"
        base_url = "https://www.google.com/search?"
        start_urls = [base_url + search_query]
        # Use the imported Request directly (was scrapy.http.Request).
        return [Request(url=start_url) for start_url in start_urls]

    def parse(self, response):
        """Extract job detail urls from response."""
        urls = Selector(response).xpath('//cite/text()').extract()
        for url in urls:
            # dont_filter=True: detail hosts live outside allowed_domains.
            yield Request(url, callback=self.parse_detail_pages, dont_filter=True)
            # Replaces a leftover debug print(url) with proper spider logging.
            self.logger.debug("Queued Recruiterbox detail page: %s", url)

    def parse_detail_pages(self, response):
        """Build JobItems from a Recruiterbox posting page."""
        jobs = Selector(response).xpath('//div[contains(@id, "content")]')
        items = []
        for job in jobs:
            item = JobItem()
            # NOTE(review): inner paths start with '//' so they search the whole
            # document rather than being relative to `job` — confirm before reusing.
            item["title"] = job.xpath('//h1[contains(@class, "jobtitle")]/text()').extract_first()
            item["company"] = 'n/a'  # not exposed on the posting page
            # "jobdesciption" looks misspelled but may match the site's actual
            # CSS class — verify against a live page before "fixing" it.
            item["body"] = job.xpath('//div[contains(@class, "jobdesciption")]').extract()
            item["location"] = job.xpath('//span[contains(@class, "meta-job-location-city")]').extract()
            item["url"] = response.request.url
            item["pub_date"] = 'n/a'
            item["email"] = 'n/a'
            item["salary"] = 'n/a'
            item["scrape_date"] = timezone.now()
            item["job_board"] = "Recruiter Box"
            item["board_url"] = "www.recruiterbox.com"
            items.append(item)
        return items
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,50 @@ | ||
import scrapy
from scrapy.spiders import Spider
from scrapy.selector import Selector
from scraper.items import JobItem
from scrapy.http import Request

from django.utils import timezone


class WorkableSpider(Spider):
    """Find Django jobs hosted on workable.com via a Google ``site:`` search.

    Crawls Google result pages and follows each cited URL to the Workable
    posting, yielding one :class:`JobItem` per posting.
    """

    name = "workable"
    allowed_domains = ["google.com"]

    def start_requests(self):
        """Build the seed request: Workable + django, restricted to the past month."""
        search_query = "q=site:workable.com+django&tbs=qdr:m"
        base_url = "https://www.google.com/search?"
        start_urls = [base_url + search_query]
        # Use the imported Request directly (was scrapy.http.Request).
        return [Request(url=start_url) for start_url in start_urls]

    def parse(self, response):
        """Extract job detail urls from response."""
        urls = Selector(response).xpath('//cite/text()').extract()
        for url in urls:
            # dont_filter=True: detail hosts live outside allowed_domains.
            yield Request(url, callback=self.parse_detail_pages, dont_filter=True)
            # Replaces a leftover debug print(url) with proper spider logging.
            self.logger.debug("Queued Workable detail page: %s", url)

    def parse_detail_pages(self, response):
        """Build JobItems from a Workable posting page."""
        jobs = Selector(response).xpath('//main[contains(@class, "stacked")]')
        items = []
        for job in jobs:
            item = JobItem()
            # NOTE(review): inner paths start with '//' so they search the whole
            # document rather than being relative to `job` — confirm before reusing.
            item["title"] = job.xpath('//h1/text()').extract_first()
            item["company"] = 'n/a'  # not exposed on the posting page
            item["body"] = job.xpath('//main[contains(@class, "stacked")]').extract()
            item["location"] = job.xpath('//p[contains(@class, "meta")]').extract_first()
            item["url"] = response.request.url
            item["pub_date"] = 'n/a'
            item["email"] = 'n/a'
            item["salary"] = 'n/a'
            item["scrape_date"] = timezone.now()
            item["job_board"] = "Workable"
            item["board_url"] = "www.workable.com"
            items.append(item)
        return items
Oops, something went wrong.