FEATURE: parse sites with anti-bot defence

EgorBodnar · Feb 7, 2023 · 07875c5 · 07875c5
1 parent 0ba2f93
commit 07875c5
Show file tree

Hide file tree

Showing 6 changed files with 486 additions and 113 deletions.
diff --git a/.gitignore b/.gitignore
@@ -1,104 +1,5 @@
-# Logs
-logs
-*.log
-npm-debug.log*
-yarn-debug.log*
-yarn-error.log*
-lerna-debug.log*
-
-# Diagnostic reports (https://nodejs.org/api/report.html)
-report.[0-9]*.[0-9]*.[0-9]*.[0-9]*.json
-
-# Runtime data
-pids
-*.pid
-*.seed
-*.pid.lock
-
-# Directory for instrumented libs generated by jscoverage/JSCover
-lib-cov
-
-# Coverage directory used by tools like istanbul
-coverage
-*.lcov
-
-# nyc test coverage
-.nyc_output
-
-# Grunt intermediate storage (https://gruntjs.com/creating-plugins#storing-task-files)
-.grunt
-
-# Bower dependency directory (https://bower.io/)
-bower_components
-
-# node-waf configuration
-.lock-wscript
-
-# Compiled binary addons (https://nodejs.org/api/addons.html)
-build/Release
-
 # Dependency directories
 node_modules/
-jspm_packages/
-
-# TypeScript v1 declaration files
-typings/
-
-# TypeScript cache
-*.tsbuildinfo
-
-# Optional npm cache directory
-.npm
-
-# Optional eslint cache
-.eslintcache
-
-# Microbundle cache
-.rpt2_cache/
-.rts2_cache_cjs/
-.rts2_cache_es/
-.rts2_cache_umd/
-
-# Optional REPL history
-.node_repl_history
-
-# Output of 'npm pack'
-*.tgz
-
-# Yarn Integrity file
-.yarn-integrity
-
-# dotenv environment variables file
-.env
-.env.test
-
-# parcel-bundler cache (https://parceljs.org/)
-.cache
-
-# Next.js build output
-.next
-
-# Nuxt.js build / generate output
-.nuxt
-dist
-
-# Gatsby files
-.cache/
-# Comment in the public line in if your project uses Gatsby and *not* Next.js
-# https://nextjs.org/blog/next-9-1#public-directory-support
-# public
-
-# vuepress build output
-.vuepress/dist
-
-# Serverless directories
-.serverless/
-
-# FuseBox cache
-.fusebox/
-
-# DynamoDB Local files
-.dynamodb/
 
-# TernJS port file
-.tern-port
+# Fake Browser UserData directories
+fakeBrowserUserData/
diff --git a/README.md b/README.md
@@ -14,7 +14,8 @@ This means it operates independently of your technology stack and doesn't add an
 * Define the list of required keywords to match when parsing the job title in `JOB_KEYWORDS` `config/default.json`
 * Set Telegram bot token. `TELEGRAM.TOKEN` `config/default.json`| [**How to generate the TOKEN**](https://medium.com/geekculture/generate-telegram-token-for-bot-api-d26faf9bf064)
 * Set id of the Telegram chat to send job alerts to. `TELEGRAM.CHAT_ID` `config/default.json`| Don't forget to add the bot to this chat.
-* Extend the Job Sites list by the required ones in `config/jobSites.json`. Note: `jobTitleSelector` - css selector of the job post title. 
+* Extend the Job Sites list by the required ones in `config/jobSites.json`. Note: `jobTitleSelector` - css selector of the job post title.
+If a site is protected by an anti-bot system (other than a captcha), enable the `antiBotCheck` flag for that site and the Job Scanner will parse it with FakeBrowser instead.
 * It scans sites each 30 minutes. The interval parameter could be configured in `SCAN_INTERVAL_MINUTES` `config/default.json`.
 
 ## ▶️ How to start the scanner:

diff --git a/index.js b/index.js
@@ -1,8 +1,7 @@
-const fs = require('fs');
-const axios = require('axios');
-const cheerio = require('cheerio');
-const { Telegraf } = require('telegraf');
-const { MongoClient } = require('mongodb');
+import fs from 'fs';
+import { Telegraf } from 'telegraf';
+import { MongoClient } from 'mongodb';
+import { getJobTitlesByFakeBrowser, getJobTitlesByAxios }  from './jobTitleParser.js';
 
 const MONGODB_URI = 'mongodb://mongo:27017';
 
@@ -20,15 +19,17 @@ const parseJobSites = async () => {
 
   for (const site of jobSites) {
     try {
-      const response = await axios.get(site.url);
-      const html = response.data;
-      const $ = await cheerio.load(html);
-      const jobTitles = await $(site.jobTitleSelector);
+      let jobTitles = [];
+      if ( site.antiBotCheck ) {
+        jobTitles = await getJobTitlesByFakeBrowser(site)
+      } else {
+        jobTitles = await getJobTitlesByAxios(site)
+      }
 
       console.info(`Parsing ${site.name}'s job list`);
 
       for (let i = 0; i < jobTitles.length; i++) {
-        const jobTitle = $(jobTitles[i]).text().toLowerCase().trim();
+        const jobTitle = jobTitles[i].toLowerCase().trim()
 
         const isMatchingJob = config.JOB_KEYWORDS.some((keyword) =>
           jobTitle.includes(keyword)

diff --git a/jobTitleParser.js b/jobTitleParser.js
@@ -0,0 +1,44 @@
+import axios from 'axios';
+import cheerio from 'cheerio';
+import { faker } from '@faker-js/faker';
+import { FakeBrowser } from 'fakebrowser';
+
+const getJobTitlesByFakeBrowser = async (site) => {
+
+  const builder = new FakeBrowser.Builder()
+    .displayUserActionLayer(true)
+    .vanillaLaunchOptions({
+      headless: true,
+    })
+    .userDataDir('./fakeBrowserUserData');
+
+  const fakeBrowser = await builder.launch();
+  let jobTitles = [];
+
+  try {
+    const page = await fakeBrowser.vanillaBrowser.newPage();
+    await page.goto(site.url, {waitUntil: 'domcontentloaded'});
+    await page.waitForSelector(site.jobTitleSelector);
+    jobTitles = await page.$$eval(site.jobTitleSelector,
+      elements=> elements.map(item=>item.textContent))
+
+  } finally {
+    await fakeBrowser.shutdown();
+  }
+  return jobTitles
+}
+
+const getJobTitlesByAxios = async (site) => {
+  const response = await axios.get(site.url, { headers: { 'User-Agent': faker.internet.userAgent() }  });
+  const html = response.data;
+  const $ = await cheerio.load(html);
+
+  let jobTitles = [];
+  const jobTitlesElements = await $(site.jobTitleSelector);
+  for (let i = 0; i < jobTitlesElements.length; i++) {
+    jobTitles[i] = $(jobTitlesElements[i]).text();
+  }
+  return jobTitles;
+}
+
+export { getJobTitlesByFakeBrowser, getJobTitlesByAxios };