Merge branch 'rootedbox-refactor_webcrawling' into preview
sestinj committed Mar 17, 2024
2 parents 4f7d8a8 + b81946d commit 864c34e
Showing 7 changed files with 165 additions and 212 deletions.
21 changes: 7 additions & 14 deletions core/indexing/docs/article.ts
@@ -2,13 +2,14 @@ import { Readability } from "@mozilla/readability";
 import { Chunk } from "../..";
 import { MAX_CHUNK_SIZE } from "../../llm/constants";
 import { cleanFragment, cleanHeader } from "../chunk/markdown";
+import { PageData } from "./crawl";

-type ArticleComponent = {
+export type ArticleComponent = {
   title: string;
   body: string;
 };

-type Article = {
+export type Article = {
   url: string;
   subpath: string;
   title: string;
@@ -132,6 +133,7 @@ export async function stringToArticle(
   }

   let article_components = await extractTitlesAndBodies(article.content);
+
   return {
     url,
     subpath,
@@ -144,20 +146,11 @@
   }
 }

-export async function urlToArticle(
-  subpath: string,
-  baseUrl: URL,
+export async function pageToArticle(
+  page: PageData
 ): Promise<Article | undefined> {
-  const url = new URL(subpath, baseUrl);
   try {
-    const response = await fetch(url.toString());
-
-    if (!response.ok) {
-      return undefined;
-    }
-
-    const htmlContent = await response.text();
-    return stringToArticle(baseUrl.toString(), htmlContent, subpath);
+    return stringToArticle(page.url, page.html, page.path);
   } catch (err) {
     console.error("Error converting URL to article components", err);
     return undefined;
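
With this change, pageToArticle receives a PageData produced by the crawler instead of fetching the URL itself, so article conversion no longer makes its own network requests. A minimal sketch of the new call shape (the crawlSiteToArticles driver is illustrative, not part of this commit):

import { pageToArticle } from "./article";
import { crawlPage } from "./crawl";

// Hypothetical consumer: stream pages from the crawler and convert each
// one into an Article, skipping pages that fail to parse.
async function crawlSiteToArticles(root: string): Promise<void> {
  for await (const page of crawlPage(new URL(root))) {
    const article = await pageToArticle(page);
    if (article) {
      console.log(`parsed "${article.title}" from ${article.subpath}`);
    }
  }
}
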
274 changes: 112 additions & 162 deletions core/indexing/docs/crawl.ts
@@ -1,13 +1,12 @@
 import { Octokit } from "@octokit/rest";
 import cheerio from "cheerio";
 import fetch from "node-fetch";
-// const HCCrawler = require("headless-chrome-crawler");
 import { URL } from "url";

 const IGNORE_PATHS_ENDING_IN = [
   "favicon.ico",
   "robots.txt",
   ".rst.txt",
   // ReadTheDocs
   "genindex",
   "py-modindex",
   "search.html",
@@ -19,70 +18,6 @@ const IGNORE_PATHS_ENDING_IN = [

 const GITHUB_PATHS_TO_TRAVERSE = ["/blob/", "/tree/"];

-function shouldFilterPath(pathname: string, baseUrl: URL): boolean {
-  if (pathname.includes("#")) {
-    pathname = pathname.slice(0, pathname.indexOf("#"));
-  }
-
-  if (pathname.endsWith("/")) {
-    pathname = pathname.slice(0, -1);
-  }
-
-  if (baseUrl.hostname === "github.com") {
-    if (
-      pathname.split("/").length > 3 &&
-      !GITHUB_PATHS_TO_TRAVERSE.some((path) => pathname.includes(path))
-    )
-      return true;
-    return false;
-  }
-
-  if (IGNORE_PATHS_ENDING_IN.some((path) => pathname.endsWith(path)))
-    return true;
-  return false;
-}
-
-async function* crawlLinks(
-  pathname: string,
-  baseUrl: URL,
-  visited: Set<string>,
-): AsyncGenerator<number> {
-  if (visited.has(pathname) || shouldFilterPath(pathname, baseUrl)) {
-    return;
-  }
-  visited.add(pathname);
-  yield visited.size;
-
-  const response = await fetch(new URL(pathname, baseUrl));
-  const text = await response.text();
-  const $ = cheerio.load(text);
-
-  const children: string[] = [];
-  $("a").each((_, element) => {
-    const href = $(element).attr("href");
-    if (!href) {
-      return;
-    }
-
-    const parsedUrl = new URL(href, baseUrl);
-    if (
-      parsedUrl.hostname === baseUrl.hostname &&
-      !visited.has(parsedUrl.pathname)
-      // parsedUrl.pathname.startsWith(baseUrl.pathname)
-    ) {
-      children.push(parsedUrl.pathname);
-    }
-  });
-
-  await Promise.all(
-    children.map(async (child) => {
-      for await (const _ of crawlLinks(child, baseUrl, visited)) {
-      }
-    }),
-  );
-  yield visited.size;
-}
-
 async function crawlGithubRepo(baseUrl: URL) {
   const octokit = new Octokit({
     auth: undefined,
@@ -109,114 +44,129 @@ async function crawlGithubRepo(baseUrl: URL) {
   );

   const paths = tree.data.tree
-    .filter(
-      (file) => file.type === "blob" && file.path?.endsWith(".md"),
-      // ||
-      // file.path?.endsWith(".rst") ||
-      // file.path?.split("/").includes("documentation") ||
-      // file.path?.split("/").includes("docs") ||
-      // file.path?.split("/").includes("doc") ||
-      // file.path?.split("/").includes("examples") ||
-      // file.path?.split("/").includes("example")
-    )
+    .filter((file) => file.type === "blob" && file.path?.endsWith(".md"))
     .map((file) => baseUrl.pathname + "/tree/main/" + file.path);

   return paths;
 }

-export async function* crawlSubpages(
-  baseUrl: URL,
-): AsyncGenerator<number, string[]> {
-  // Special case for GitHub repos
-  if (baseUrl.hostname === "github.com") {
-    return crawlGithubRepo(baseUrl);
+async function getLinksFromUrl(url: string, path: string) {
+  const baseUrl = new URL(url);
+  const location = new URL(path, url);
+  let response;
+  try {
+    response = await fetch(location.toString());
+  } catch (error: unknown) {
+    if (error instanceof Error && error.message.includes("maximum redirect")) {
+      console.error("Maximum redirect reached for: ", location.toString());
+      return {
+        html: "",
+        links: [],
+      };
+    } else {
+      console.error(error);
+      return {
+        html: "",
+        links: [],
+      };
+    }
   }
-
-  // First, check if the parent of the path redirects to the same page
-  if (baseUrl.pathname.endsWith("/")) {
-    baseUrl.pathname = baseUrl.pathname.slice(0, -1);
+  const html = await response.text();
+  let links: string[] = [];
+
+  if (url.includes("github.com")) {
+    return {
+      html,
+      links,
+    };
   }
-  let realBaseUrl = new URL(baseUrl);
-  while (true) {
-    let parentUrl = new URL(realBaseUrl);
-    parentUrl.pathname = parentUrl.pathname.split("/").slice(0, -1).join("/");
+
+  const $ = cheerio.load(html);
+
+  $("a").each((_, element) => {
+    const href = $(element).attr("href");
+    if (!href) {
+      return;
+    }
+
+    const parsedUrl = new URL(href, url);
     if (
-      parentUrl.pathname === realBaseUrl.pathname ||
-      parentUrl.pathname === ""
+      parsedUrl.hostname === baseUrl.hostname
+      // parsedUrl.pathname.startsWith(baseUrl.pathname)
     ) {
-      break;
+      links.push(parsedUrl.pathname);
     }
-    const response = await fetch(parentUrl);
-    const redirected = response.url.toString() === baseUrl.toString() + "/";
-    if (!redirected) {
-      break;
-    }
-    realBaseUrl = parentUrl;
+  });
+
+  links = [...new Set(links)].filter((link) => {
+    return (
+      !link.includes("#") &&
+      !IGNORE_PATHS_ENDING_IN.some((ending) => link.endsWith(ending))
+    );
+  });
+
+  return {
+    html,
+    links,
+  };
+}
+
+function splitUrl(url: URL) {
+  const baseUrl = `${url.protocol}//${url.hostname}`;
+  const basePath = url.pathname;
+  return {
+    baseUrl,
+    basePath,
+  };
+}
+
+export type PageData = {
+  url: string;
+  path: string;
+  html: string;
+};
+
+export async function* crawlPage(url: URL): AsyncGenerator<PageData> {
+  const { baseUrl, basePath } = splitUrl(url);
+  let paths: string[] = [basePath];
+
+  if (url.hostname === "github.com") {
+    const githubLinks = await crawlGithubRepo(url);
+    paths = [...paths, ...githubLinks];
   }

-  const visited = new Set<string>();
-  for await (const count of crawlLinks(
-    realBaseUrl.pathname || "/",
-    realBaseUrl,
-    visited,
-  )) {
-    yield count;
+  let index = 0;
+
+  while (index < paths.length) {
+    const promises = paths
+      .slice(index, index + 50)
+      .map((path) => getLinksFromUrl(baseUrl, path));
+
+    const results = await Promise.all(promises);
+
+    for (const { html, links } of results) {
+      if (html !== "") {
+        yield {
+          url: url.toString(),
+          path: paths[index],
+          html: html,
+        };
+      }
+
+      for (let link of links) {
+        if (!paths.includes(link)) {
+          paths.push(link);
+        }
+      }
+
+      index++;
+    }
+
+    paths = paths.filter((path) =>
+      results.some(
+        (result) => result.html !== "" && result.links.includes(path),
+      ),
+    );
   }
-  return [...visited];
 }
-
-// class NoEscapeTurndownService extends TurndownService {
-//   escape(str: string): string {
-//     return str;
-//   }
-// }
-
-// async function convertURLToMarkdown(url: string): Promise<string> {
-//   try {
-//     const response = await fetch(url);
-//     const htmlContent = await response.text();
-//     const turndown = new NoEscapeTurndownService({
-//       codeBlockStyle: "fenced",
-//       headingStyle: "atx",
-//     }).use([turndownPluginGfm.tables, turndownPluginGfm.strikethrough]);
-//     const markdown = turndown.turndown(htmlContent);
-//     return markdown;
-//   } catch (err) {
-//     console.error(err);
-//     throw new Error("Error converting URL to markdown");
-//   }
-// }
-
-// convertURLToMarkdown("https://python-socketio.readthedocs.io/en/stable").then(
-//   (md) => {
-//     console.log(md);
-//   }
-// );
-
-let visited = new Set<string>();
-const url = new URL("https://python-socketio.readthedocs.io/en/stable");
-const url2 = new URL("https://platform.openai.com/docs/api-reference");
-// crawlLinks(url.pathname, url, visited).then(() => {
-//   console.log(visited);
-// });
-
-// async function hcCrawlLinks() {
-//   const results: any[] = [];
-//   const crawler = await HCCrawler.launch({
-//     // Function to be evaluated in browsers
-//     evaluatePage: () => ({
-//       title: $("title").text(),
-//     }),
-//     // Function to be called with evaluated results from browsers
-//     onSuccess: (result: any) => {
-//       console.log(result);
-//       results.push(result.url);
-//     },
-//   });
-//   // Queue a request
-//   await crawler.queue(url.toString());
-//   await crawler.onIdle(); // Resolved when no queue is left
-//   await crawler.close(); // Close the crawler
-
-//   return results;
-// }
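
The new crawlPage generator replaces the recursive crawlLinks: it keeps a queue of same-host paths, fetches them in batches of up to 50 via getLinksFromUrl, and yields a PageData for each page that returns HTML, rather than yielding a running count. A rough sketch of driving it (reportCrawl is an illustrative name, not from this diff):

import { crawlPage } from "./crawl";

// Illustrative driver: consume the generator and report progress as pages
// stream in, instead of waiting for the whole crawl to finish.
async function reportCrawl(root: string): Promise<number> {
  let count = 0;
  for await (const page of crawlPage(new URL(root))) {
    count++;
    console.log(`[${count}] crawled ${page.path} (${page.html.length} chars)`);
  }
  return count;
}
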
6 changes: 6 additions & 0 deletions core/indexing/docs/db.ts
@@ -140,3 +140,9 @@ export async function listDocs(): Promise<
   const docs = db.all(`SELECT title, baseUrl FROM docs`);
   return docs;
 }
+
+export async function hasDoc(baseUrl: string) {
+  const db = await getDBDocs();
+  const doc = await db.get(`SELECT title FROM docs WHERE baseUrl = ?`, baseUrl);
+  return !!doc;
+}
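
hasDoc gives callers a cheap existence check before kicking off a crawl. A sketch of a plausible call site (the indexIfMissing guard is hypothetical, not from this commit):

import { hasDoc } from "./db";

// Hypothetical guard: skip sites whose docs are already stored.
async function indexIfMissing(baseUrl: string): Promise<void> {
  if (await hasDoc(baseUrl)) {
    console.log(`${baseUrl} is already indexed, skipping`);
    return;
  }
  // ... crawl the site and store its pages here ...
}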