Skip to content

Commit

Permalink
added flyscrape.config.js
Browse files Browse the repository at this point in the history
  • Loading branch information
HariSekhon committed Sep 25, 2024
1 parent 60b1dee commit 8f458c8
Showing 1 changed file with 115 additions and 0 deletions.
115 changes: 115 additions & 0 deletions flyscrape.config.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,115 @@
// [% VIM_TAGS %]
//
// Author: Hari Sekhon
// Date: [% DATE # 2008-10-20 16:20:20 +0100 (Mon, 20 Oct 2008) %]
//
// [% URL %]
//
// [% LICENSE %]
//
// [% MESSAGE %]
//
// [% LINKEDIN %]
//

export const config = {

// follows redirects eg. to https://news.ycombinator.com
url: "https://hackernews.com",

// Enable rendering with headless browser. (default = false)
//browser: true,

// Specify if browser should be headless or not. (default = true)
//headless: false,

// Specify the multiple URLs to start scraping from. (default = [])
// urls: [
// "https://anothersite.com/",
// "https://yetanother.com/",
// ],

// Specify how deep links should be followed. (default = 0, no follow)
// depth: 5,

// Speficy the css selectors to follow. (default = ["a[href]"])
// follow: [".next > a", ".related a"],

// Specify the allowed domains. ['*'] for all. (default = domain from url)
// allowedDomains: ["example.com", "anothersite.com"],

// Specify the blocked domains. (default = none)
// blockedDomains: ["somesite.com"],

// Specify the allowed URLs as regex. (default = all allowed)
// allowedURLs: ["/posts", "/articles/\d+"],

// Specify the blocked URLs as regex. (default = none)
// blockedURLs: ["/admin"],

// Specify the rate in requests per minute. (default = no rate limit)
// rate: 60,

// Specify the number of concurrent requests. (default = no limit)
// concurrency: 1,

// Specify a single HTTP(S) proxy URL. (default = no proxy)
// Note: Not compatible with browser mode.
// proxy: "http://someproxy.com:8043",

// Specify multiple HTTP(S) proxy URLs. (default = no proxy)
// Note: Not compatible with browser mode.
// proxies: [
// "http://someproxy.com:8043",
// "http://someotherproxy.com:8043",
// ],

// XXX: use this in development mode to not have to repeatedly request the page
// Enable file-based request caching. (default = no cache)
cache: "file",

// Specify the HTTP request header. (default = none)
// headers: {
// "Authorization": "Bearer ...",
// "User-Agent": "Mozilla ...",
// },

// Use the cookie store of your local browser. (default = off)
// XXX: use this for sites that require being logged in
// Options: "chrome" | "edge" | "firefox"
// cookies: "chrome",

// Specify the output options.
// output: {
// // Specify the output file. (default = stdout)
// file: "results.json",
//
// // Specify the output format. (default = json)
// // Options: "json" | "ndjson"
// format: "json",
// },
};

export default function({ doc, absoluteURL }) {
const title = doc.find("h1");
//const link = doc.find("a");
const posts = doc.find(".athing");

return {
title: title.text(),
posts: posts.map((post) => {
const link = post.find(".titleline > a");
const meta = post.next();

return {
title: link.text(),
url: absoluteURL(link.attr("href")),
user: meta.find(".hnuser").text(),
};
}),
//link: {
// text: link.text(),
// url: absoluteURL(link.attr("href")),
//},
};
}

0 comments on commit 8f458c8

Please sign in to comment.