Skip to content

Commit

Permalink
FEATURE: parse sites with anti-bot defence
Browse files Browse the repository at this point in the history
  • Loading branch information
EgorBodnar committed Feb 7, 2023
1 parent 0ba2f93 commit 07875c5
Show file tree
Hide file tree
Showing 6 changed files with 486 additions and 113 deletions.
103 changes: 2 additions & 101 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,104 +1,5 @@
# Logs
logs
*.log
npm-debug.log*
yarn-debug.log*
yarn-error.log*
lerna-debug.log*

# Diagnostic reports (https://nodejs.org/api/report.html)
report.[0-9]*.[0-9]*.[0-9]*.[0-9]*.json

# Runtime data
pids
*.pid
*.seed
*.pid.lock

# Directory for instrumented libs generated by jscoverage/JSCover
lib-cov

# Coverage directory used by tools like istanbul
coverage
*.lcov

# nyc test coverage
.nyc_output

# Grunt intermediate storage (https://gruntjs.com/creating-plugins#storing-task-files)
.grunt

# Bower dependency directory (https://bower.io/)
bower_components

# node-waf configuration
.lock-wscript

# Compiled binary addons (https://nodejs.org/api/addons.html)
build/Release

# Dependency directories
node_modules/
jspm_packages/

# TypeScript v1 declaration files
typings/

# TypeScript cache
*.tsbuildinfo

# Optional npm cache directory
.npm

# Optional eslint cache
.eslintcache

# Microbundle cache
.rpt2_cache/
.rts2_cache_cjs/
.rts2_cache_es/
.rts2_cache_umd/

# Optional REPL history
.node_repl_history

# Output of 'npm pack'
*.tgz

# Yarn Integrity file
.yarn-integrity

# dotenv environment variables file
.env
.env.test

# parcel-bundler cache (https://parceljs.org/)
.cache

# Next.js build output
.next

# Nuxt.js build / generate output
.nuxt
dist

# Gatsby files
.cache/
# Comment in the public line in if your project uses Gatsby and *not* Next.js
# https://nextjs.org/blog/next-9-1#public-directory-support
# public

# vuepress build output
.vuepress/dist

# Serverless directories
.serverless/

# FuseBox cache
.fusebox/

# DynamoDB Local files
.dynamodb/

# TernJS port file
.tern-port
# Fake Browser UserData directories
fakeBrowserUserData/
3 changes: 2 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,8 @@ This means it operates independently of your technology stack and doesn't add an
* Define the list of required keywords to match when parsing the job title in `JOB_KEYWORDS` `config/default.json`
* Set Telegram bot token. `TELEGRAM.TOKEN` `config/default.json`| [**How to generate the TOKEN**](https://medium.com/geekculture/generate-telegram-token-for-bot-api-d26faf9bf064)
* Set id of the Telegram chat to send job alerts to. `TELEGRAM.CHAT_ID` `config/default.json`| Don't forget to add the bot to this chat.
* Extend the Job Sites list by the required ones in `config/jobSites.json`. Note: `jobTitleSelector` - css selector of the job post title.
* Extend the job sites list in `config/jobSites.json` with the sites you need. Note: `jobTitleSelector` is the CSS selector of the job post title.
If a site is protected by an anti-bot system (other than a CAPTCHA), enable its `antiBotCheck` flag and the Job Scanner will parse it anyway using FakeBrowser.
* It scans the sites every 30 minutes. The interval can be configured via `SCAN_INTERVAL_MINUTES` in `config/default.json`.

## ▶️ How to start the scanner:
Expand Down
21 changes: 11 additions & 10 deletions index.js
Original file line number Diff line number Diff line change
@@ -1,8 +1,7 @@
const fs = require('fs');
const axios = require('axios');
const cheerio = require('cheerio');
const { Telegraf } = require('telegraf');
const { MongoClient } = require('mongodb');
import fs from 'fs';
import { Telegraf } from 'telegraf';
import { MongoClient } from 'mongodb';
import { getJobTitlesByFakeBrowser, getJobTitlesByAxios } from './jobTitleParser.js';

const MONGODB_URI = 'mongodb://mongo:27017';

Expand All @@ -20,15 +19,17 @@ const parseJobSites = async () => {

for (const site of jobSites) {
try {
const response = await axios.get(site.url);
const html = response.data;
const $ = await cheerio.load(html);
const jobTitles = await $(site.jobTitleSelector);
let jobTitles = [];
if ( site.antiBotCheck ) {
jobTitles = await getJobTitlesByFakeBrowser(site)
} else {
jobTitles = await getJobTitlesByAxios(site)
}

console.info(`Parsing ${site.name}'s job list`);

for (let i = 0; i < jobTitles.length; i++) {
const jobTitle = $(jobTitles[i]).text().toLowerCase().trim();
const jobTitle = jobTitles[i].toLowerCase().trim()

const isMatchingJob = config.JOB_KEYWORDS.some((keyword) =>
jobTitle.includes(keyword)
Expand Down
44 changes: 44 additions & 0 deletions jobTitleParser.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
import axios from 'axios';
import cheerio from 'cheerio';
import { faker } from '@faker-js/faker';
import { FakeBrowser } from 'fakebrowser';

/**
 * Reads job titles from a site by loading it in a FakeBrowser-driven browser.
 *
 * The browser is launched headless with its user data stored in
 * `./fakeBrowserUserData`, and is always shut down before this function
 * settles, whether the scrape succeeded or threw.
 *
 * @param {{url: string, jobTitleSelector: string}} site - Job site entry; only
 *   `url` and `jobTitleSelector` are read here.
 * @returns {Promise<string[]>} The `textContent` of every element matching
 *   `site.jobTitleSelector`.
 * @throws Propagates any error raised while launching, navigating, waiting for
 *   the selector, evaluating it, or shutting the browser down.
 */
const getJobTitlesByFakeBrowser = async (site) => {
  const fakeBrowser = await new FakeBrowser.Builder()
    .displayUserActionLayer(true)
    .vanillaLaunchOptions({ headless: true })
    .userDataDir('./fakeBrowserUserData')
    .launch();

  try {
    const tab = await fakeBrowser.vanillaBrowser.newPage();
    await tab.goto(site.url, { waitUntil: 'domcontentloaded' });

    // Only DOMContentLoaded was awaited above, so wait for the titles themselves.
    await tab.waitForSelector(site.jobTitleSelector);

    // Kept free of outer-scope references: it is handed to the page for evaluation.
    const readTitles = (nodes) => nodes.map((node) => node.textContent);
    return await tab.$$eval(site.jobTitleSelector, readTitles);
  } finally {
    await fakeBrowser.shutdown();
  }
};

/**
 * Fetches a job site with a plain HTTP GET and extracts the text of every
 * element matching the site's job-title selector.
 *
 * The request carries a `User-Agent` header produced by
 * `faker.internet.userAgent()`.
 *
 * @param {{url: string, jobTitleSelector: string}} site - Job site entry; only
 *   `url` and `jobTitleSelector` are read here.
 * @returns {Promise<string[]>} The text of each matched element, in the order
 *   cheerio returns them; an empty array when nothing matches.
 * @throws Propagates any error raised by `axios.get` or by cheerio.
 */
const getJobTitlesByAxios = async (site) => {
  const requestConfig = { headers: { 'User-Agent': faker.internet.userAgent() } };
  const { data: html } = await axios.get(site.url, requestConfig);

  // cheerio.load() and the selector call are synchronous, so neither is awaited.
  const $ = cheerio.load(html);
  return Array.from($(site.jobTitleSelector), (element) => $(element).text());
};

export { getJobTitlesByFakeBrowser, getJobTitlesByAxios };
Loading

0 comments on commit 07875c5

Please sign in to comment.