Merge branch 'rootedbox-refactor_webcrawling' into preview
sestinj committed Mar 17, 2024
2 parents 4f7d8a8 + b81946d commit 864c34e
Showing 7 changed files with 165 additions and 212 deletions.
21 changes: 7 additions & 14 deletions core/indexing/docs/article.ts
@@ -2,13 +2,14 @@ import { Readability } from "@mozilla/readability";
 import { Chunk } from "../..";
 import { MAX_CHUNK_SIZE } from "../../llm/constants";
 import { cleanFragment, cleanHeader } from "../chunk/markdown";
+import { PageData } from "./crawl";

-type ArticleComponent = {
+export type ArticleComponent = {
   title: string;
   body: string;
 };

-type Article = {
+export type Article = {
   url: string;
   subpath: string;
   title: string;
@@ -132,6 +133,7 @@ export async function stringToArticle(
   }

   let article_components = await extractTitlesAndBodies(article.content);
+
   return {
     url,
     subpath,
@@ -144,20 +146,11 @@
   }
 }

-export async function urlToArticle(
-  subpath: string,
-  baseUrl: URL,
+export async function pageToArticle(
+  page: PageData
 ): Promise<Article | undefined> {
-  const url = new URL(subpath, baseUrl);
   try {
-    const response = await fetch(url.toString());
-
-    if (!response.ok) {
-      return undefined;
-    }
-
-    const htmlContent = await response.text();
-    return stringToArticle(baseUrl.toString(), htmlContent, subpath);
+    return stringToArticle(page.url, page.html, page.path);
   } catch (err) {
     console.error("Error converting URL to article components", err);
     return undefined;
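
With this change, pageToArticle receives a PageData produced by the crawler instead of fetching the URL itself, so article conversion no longer makes its own network requests. A minimal sketch of the new call shape (the crawlSiteToArticles driver is illustrative, not part of this commit):

import { pageToArticle } from "./article";
import { crawlPage } from "./crawl";

// Hypothetical consumer: stream pages from the crawler and convert each
// one into an Article, skipping pages that fail to parse.
async function crawlSiteToArticles(root: string): Promise<void> {
  for await (const page of crawlPage(new URL(root))) {
    const article = await pageToArticle(page);
    if (article) {
      console.log(`parsed "${article.title}" from ${article.subpath}`);
    }
  }
}
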
274 changes: 112 additions & 162 deletions core/indexing/docs/crawl.ts
@@ -1,13 +1,12 @@
 import { Octokit } from "@octokit/rest";
 import cheerio from "cheerio";
 import fetch from "node-fetch";
-// const HCCrawler = require("headless-chrome-crawler");
 import { URL } from "url";

 const IGNORE_PATHS_ENDING_IN = [
   "favicon.ico",
   "robots.txt",
   ".rst.txt",
   // ReadTheDocs
   "genindex",
   "py-modindex",
   "search.html",
@@ -19,70 +18,6 @@ const IGNORE_PATHS_ENDING_IN = [

 const GITHUB_PATHS_TO_TRAVERSE = ["/blob/", "/tree/"];

-function shouldFilterPath(pathname: string, baseUrl: URL): boolean {
-  if (pathname.includes("#")) {
-    pathname = pathname.slice(0, pathname.indexOf("#"));
-  }
-
-  if (pathname.endsWith("/")) {
-    pathname = pathname.slice(0, -1);
-  }
-
-  if (baseUrl.hostname === "github.com") {
-    if (
-      pathname.split("/").length > 3 &&
-      !GITHUB_PATHS_TO_TRAVERSE.some((path) => pathname.includes(path))
-    )
-      return true;
-    return false;
-  }
-
-  if (IGNORE_PATHS_ENDING_IN.some((path) => pathname.endsWith(path)))
-    return true;
-  return false;
-}
-
-async function* crawlLinks(
-  pathname: string,
-  baseUrl: URL,
-  visited: Set<string>,
-): AsyncGenerator<number> {
-  if (visited.has(pathname) || shouldFilterPath(pathname, baseUrl)) {
-    return;
-  }
-  visited.add(pathname);
-  yield visited.size;
-
-  const response = await fetch(new URL(pathname, baseUrl));
-  const text = await response.text();
-  const $ = cheerio.load(text);
-
-  const children: string[] = [];
-  $("a").each((_, element) => {
-    const href = $(element).attr("href");
-    if (!href) {
-      return;
-    }
-
-    const parsedUrl = new URL(href, baseUrl);
-    if (
-      parsedUrl.hostname === baseUrl.hostname &&
-      !visited.has(parsedUrl.pathname)
-      // parsedUrl.pathname.startsWith(baseUrl.pathname)
-    ) {
-      children.push(parsedUrl.pathname);
-    }
-  });
-
-  await Promise.all(
-    children.map(async (child) => {
-      for await (const _ of crawlLinks(child, baseUrl, visited)) {
-      }
-    }),
-  );
-  yield visited.size;
-}
-
 async function crawlGithubRepo(baseUrl: URL) {
   const octokit = new Octokit({
     auth: undefined,
@@ -109,114 +44,129 @@ async function crawlGithubRepo(baseUrl: URL) {
   );

   const paths = tree.data.tree
-    .filter(
-      (file) => file.type === "blob" && file.path?.endsWith(".md"),
-      // ||
-      // file.path?.endsWith(".rst") ||
-      // file.path?.split("/").includes("documentation") ||
-      // file.path?.split("/").includes("docs") ||
-      // file.path?.split("/").includes("doc") ||
-      // file.path?.split("/").includes("examples") ||
-      // file.path?.split("/").includes("example")
-    )
+    .filter((file) => file.type === "blob" && file.path?.endsWith(".md"))
     .map((file) => baseUrl.pathname + "/tree/main/" + file.path);

   return paths;
 }

-export async function* crawlSubpages(
-  baseUrl: URL,
-): AsyncGenerator<number, string[]> {
-  // Special case for GitHub repos
-  if (baseUrl.hostname === "github.com") {
-    return crawlGithubRepo(baseUrl);
+async function getLinksFromUrl(url: string, path: string) {
+  const baseUrl = new URL(url);
+  const location = new URL(path, url);
+  let response;
+  try {
+    response = await fetch(location.toString());
+  } catch (error: unknown) {
+    if (error instanceof Error && error.message.includes("maximum redirect")) {
+      console.error("Maximum redirect reached for: ", location.toString());
+      return {
+        html: "",
+        links: [],
+      };
+    } else {
+      console.error(error);
+      return {
+        html: "",
+        links: [],
+      };
+    }
   }
-
-  // First, check if the parent of the path redirects to the same page
-  if (baseUrl.pathname.endsWith("/")) {
-    baseUrl.pathname = baseUrl.pathname.slice(0, -1);
+  const html = await response.text();
+  let links: string[] = [];
+
+  if (url.includes("github.com")) {
+    return {
+      html,
+      links,
+    };
   }
-  let realBaseUrl = new URL(baseUrl);
-  while (true) {
-    let parentUrl = new URL(realBaseUrl);
-    parentUrl.pathname = parentUrl.pathname.split("/").slice(0, -1).join("/");
+
+  const $ = cheerio.load(html);
+
+  $("a").each((_, element) => {
+    const href = $(element).attr("href");
+    if (!href) {
+      return;
+    }
+
+    const parsedUrl = new URL(href, url);
     if (
-      parentUrl.pathname === realBaseUrl.pathname ||
-      parentUrl.pathname === ""
+      parsedUrl.hostname === baseUrl.hostname
+      // parsedUrl.pathname.startsWith(baseUrl.pathname)
     ) {
-      break;
+      links.push(parsedUrl.pathname);
     }
-    const response = await fetch(parentUrl);
-    const redirected = response.url.toString() === baseUrl.toString() + "/";
-    if (!redirected) {
-      break;
-    }
-    realBaseUrl = parentUrl;
+  });
+
+  links = [...new Set(links)].filter((link) => {
+    return (
+      !link.includes("#") &&
+      !IGNORE_PATHS_ENDING_IN.some((ending) => link.endsWith(ending))
+    );
+  });
+
+  return {
+    html,
+    links,
+  };
+}
+
+function splitUrl(url: URL) {
+  const baseUrl = `${url.protocol}//${url.hostname}`;
+  const basePath = url.pathname;
+  return {
+    baseUrl,
+    basePath,
+  };
+}
+
+export type PageData = {
+  url: string;
+  path: string;
+  html: string;
+};
+
+export async function* crawlPage(url: URL): AsyncGenerator<PageData> {
+  const { baseUrl, basePath } = splitUrl(url);
+  let paths: string[] = [basePath];
+
+  if (url.hostname === "github.com") {
+    const githubLinks = await crawlGithubRepo(url);
+    paths = [...paths, ...githubLinks];
   }

-  const visited = new Set<string>();
-  for await (const count of crawlLinks(
-    realBaseUrl.pathname || "/",
-    realBaseUrl,
-    visited,
-  )) {
-    yield count;
+  let index = 0;
+
+  while (index < paths.length) {
+    const promises = paths
+      .slice(index, index + 50)
+      .map((path) => getLinksFromUrl(baseUrl, path));
+
+    const results = await Promise.all(promises);
+
+    for (const { html, links } of results) {
+      if (html !== "") {
+        yield {
+          url: url.toString(),
+          path: paths[index],
+          html: html,
+        };
+      }
+
+      for (let link of links) {
+        if (!paths.includes(link)) {
+          paths.push(link);
+        }
+      }
+
+      index++;
+    }
+
+    paths = paths.filter((path) =>
+      results.some(
+        (result) => result.html !== "" && result.links.includes(path),
+      ),
+    );
   }
-  return [...visited];
 }
-
-// class NoEscapeTurndownService extends TurndownService {
-//   escape(str: string): string {
-//     return str;
-//   }
-// }
-
-// async function convertURLToMarkdown(url: string): Promise<string> {
-//   try {
-//     const response = await fetch(url);
-//     const htmlContent = await response.text();
-//     const turndown = new NoEscapeTurndownService({
-//       codeBlockStyle: "fenced",
-//       headingStyle: "atx",
-//     }).use([turndownPluginGfm.tables, turndownPluginGfm.strikethrough]);
-//     const markdown = turndown.turndown(htmlContent);
-//     return markdown;
-//   } catch (err) {
-//     console.error(err);
-//     throw new Error("Error converting URL to markdown");
-//   }
-// }
-
-// convertURLToMarkdown("https://python-socketio.readthedocs.io/en/stable").then(
-//   (md) => {
-//     console.log(md);
-//   }
-// );
-
-let visited = new Set<string>();
-const url = new URL("https://python-socketio.readthedocs.io/en/stable");
-const url2 = new URL("https://platform.openai.com/docs/api-reference");
-// crawlLinks(url.pathname, url, visited).then(() => {
-//   console.log(visited);
-// });
-
-// async function hcCrawlLinks() {
-//   const results: any[] = [];
-//   const crawler = await HCCrawler.launch({
-//     // Function to be evaluated in browsers
-//     evaluatePage: () => ({
-//       title: $("title").text(),
-//     }),
-//     // Function to be called with evaluated results from browsers
-//     onSuccess: (result: any) => {
-//       console.log(result);
-//       results.push(result.url);
-//     },
-//   });
-//   // Queue a request
-//   await crawler.queue(url.toString());
-//   await crawler.onIdle(); // Resolved when no queue is left
-//   await crawler.close(); // Close the crawler
-
-//   return results;
-// }
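
The new crawlPage generator replaces the recursive crawlLinks: it keeps a queue of same-host paths, fetches them in batches of up to 50 via getLinksFromUrl, and yields a PageData for each page that returns HTML, rather than yielding a running count. A rough sketch of driving it (reportCrawl is an illustrative name, not from this diff):

import { crawlPage } from "./crawl";

// Illustrative driver: consume the generator and report progress as pages
// stream in, instead of waiting for the whole crawl to finish.
async function reportCrawl(root: string): Promise<number> {
  let count = 0;
  for await (const page of crawlPage(new URL(root))) {
    count++;
    console.log(`[${count}] crawled ${page.path} (${page.html.length} chars)`);
  }
  return count;
}
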
6 changes: 6 additions & 0 deletions core/indexing/docs/db.ts
@@ -140,3 +140,9 @@ export async function listDocs(): Promise<
   const docs = db.all(`SELECT title, baseUrl FROM docs`);
   return docs;
 }
+
+export async function hasDoc(baseUrl: string) {
+  const db = await getDBDocs();
+  const doc = await db.get(`SELECT title FROM docs WHERE baseUrl = ?`, baseUrl);
+  return !!doc;
+}
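
hasDoc gives callers a cheap existence check before kicking off a crawl. A sketch of a plausible call site (the indexIfMissing guard is hypothetical, not from this commit):

import { hasDoc } from "./db";

// Hypothetical guard: skip sites whose docs are already stored.
async function indexIfMissing(baseUrl: string): Promise<void> {
  if (await hasDoc(baseUrl)) {
    console.log(`${baseUrl} is already indexed, skipping`);
    return;
  }
  // ... crawl the site and store its pages here ...
}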