diff --git a/core/indexing/docs/article.ts b/core/indexing/docs/article.ts
index d38dc75fbc..ec9a0c98b5 100644
--- a/core/indexing/docs/article.ts
+++ b/core/indexing/docs/article.ts
@@ -2,13 +2,14 @@ import { Readability } from "@mozilla/readability";
 import { Chunk } from "../..";
 import { MAX_CHUNK_SIZE } from "../../llm/constants";
 import { cleanFragment, cleanHeader } from "../chunk/markdown";
+import { PageData } from "./crawl";

-type ArticleComponent = {
+export type ArticleComponent = {
   title: string;
   body: string;
 };

-type Article = {
+export type Article = {
   url: string;
   subpath: string;
   title: string;
@@ -132,6 +133,7 @@ export async function stringToArticle(
   }

   let article_components = await extractTitlesAndBodies(article.content);
+
   return {
     url,
     subpath,
@@ -144,20 +146,11 @@ export async function stringToArticle(
   }
 }

-export async function urlToArticle(
-  subpath: string,
-  baseUrl: URL,
+export async function pageToArticle(
+  page: PageData
 ): Promise<Article | undefined> {
-  const url = new URL(subpath, baseUrl);
   try {
-    const response = await fetch(url.toString());
-
-    if (!response.ok) {
-      return undefined;
-    }
-
-    const htmlContent = await response.text();
-    return stringToArticle(baseUrl.toString(), htmlContent, subpath);
+    return stringToArticle(page.url, page.html, page.path);
   } catch (err) {
     console.error("Error converting URL to article components", err);
     return undefined;
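With this change, fetching moves out of the article layer entirely: `pageToArticle` only parses HTML that the crawler has already downloaded. A minimal sketch of the new call shape — the `PageData` literal and import path here are illustrative, not part of the patch:

    import { pageToArticle } from "./article";

    async function example() {
      // Hypothetical page; real PageData values are produced by crawlPage.
      const page = {
        url: "https://docs.example.com",
        path: "/guide/intro.html",
        html: "<html>...</html>", // HTML already fetched by the crawler
      };
      const article = await pageToArticle(page); // Article | undefined
      console.log(article?.title);
    }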
diff --git a/core/indexing/docs/crawl.ts b/core/indexing/docs/crawl.ts
index 247c7d36ea..fdc1e33dc6 100644
--- a/core/indexing/docs/crawl.ts
+++ b/core/indexing/docs/crawl.ts
@@ -1,13 +1,12 @@
 import { Octokit } from "@octokit/rest";
 import cheerio from "cheerio";
 import fetch from "node-fetch";
-// const HCCrawler = require("headless-chrome-crawler");
+import { URL } from "url";

 const IGNORE_PATHS_ENDING_IN = [
   "favicon.ico",
   "robots.txt",
   ".rst.txt",
-  // ReadTheDocs
   "genindex",
   "py-modindex",
   "search.html",
@@ -19,70 +18,6 @@ const GITHUB_PATHS_TO_TRAVERSE = ["/blob/", "/tree/"];

-function shouldFilterPath(pathname: string, baseUrl: URL): boolean {
-  if (pathname.includes("#")) {
-    pathname = pathname.slice(0, pathname.indexOf("#"));
-  }
-
-  if (pathname.endsWith("/")) {
-    pathname = pathname.slice(0, -1);
-  }
-
-  if (baseUrl.hostname === "github.com") {
-    if (
-      pathname.split("/").length > 3 &&
-      !GITHUB_PATHS_TO_TRAVERSE.some((path) => pathname.includes(path))
-    )
-      return true;
-    return false;
-  }
-
-  if (IGNORE_PATHS_ENDING_IN.some((path) => pathname.endsWith(path)))
-    return true;
-  return false;
-}
-
-async function* crawlLinks(
-  pathname: string,
-  baseUrl: URL,
-  visited: Set<string>,
-): AsyncGenerator<number> {
-  if (visited.has(pathname) || shouldFilterPath(pathname, baseUrl)) {
-    return;
-  }
-  visited.add(pathname);
-  yield visited.size;
-
-  const response = await fetch(new URL(pathname, baseUrl));
-  const text = await response.text();
-  const $ = cheerio.load(text);
-
-  const children: string[] = [];
-  $("a").each((_, element) => {
-    const href = $(element).attr("href");
-    if (!href) {
-      return;
-    }
-
-    const parsedUrl = new URL(href, baseUrl);
-    if (
-      parsedUrl.hostname === baseUrl.hostname &&
-      !visited.has(parsedUrl.pathname)
-      // parsedUrl.pathname.startsWith(baseUrl.pathname)
-    ) {
-      children.push(parsedUrl.pathname);
-    }
-  });
-
-  await Promise.all(
-    children.map(async (child) => {
-      for await (const _ of crawlLinks(child, baseUrl, visited)) {
-      }
-    }),
-  );
-  yield visited.size;
-}
-
 async function crawlGithubRepo(baseUrl: URL) {
   const octokit = new Octokit({
     auth: undefined,
   });
@@ -109,114 +44,129 @@ async function crawlGithubRepo(baseUrl: URL) {
   );

   const paths = tree.data.tree
-    .filter(
-      (file) => file.type === "blob" && file.path?.endsWith(".md"),
-      // ||
-      //   file.path?.endsWith(".rst") ||
-      //   file.path?.split("/").includes("documentation") ||
-      //   file.path?.split("/").includes("docs") ||
-      //   file.path?.split("/").includes("doc") ||
-      //   file.path?.split("/").includes("examples") ||
-      //   file.path?.split("/").includes("example")
-    )
+    .filter((file) => file.type === "blob" && file.path?.endsWith(".md"))
     .map((file) => baseUrl.pathname + "/tree/main/" + file.path);

   return paths;
 }

-export async function* crawlSubpages(
-  baseUrl: URL,
-): AsyncGenerator<number | string[]> {
-  // Special case for GitHub repos
-  if (baseUrl.hostname === "github.com") {
-    return crawlGithubRepo(baseUrl);
+async function getLinksFromUrl(url: string, path: string) {
+  const baseUrl = new URL(url);
+  const location = new URL(path, url);
+  let response;
+  try {
+    response = await fetch(location.toString());
+  } catch (error: unknown) {
+    if (error instanceof Error && error.message.includes("maximum redirect")) {
+      console.error("Maximum redirect reached for: ", location.toString());
+      return {
+        html: "",
+        links: [],
+      };
+    } else {
+      console.error(error);
+      return {
+        html: "",
+        links: [],
+      };
+    }
   }
-  // First, check if the parent of the path redirects to the same page
-  if (baseUrl.pathname.endsWith("/")) {
-    baseUrl.pathname = baseUrl.pathname.slice(0, -1);
+
+  const html = await response.text();
+  let links: string[] = [];
+
+  if (url.includes("github.com")) {
+    return {
+      html,
+      links,
+    };
   }
-  let realBaseUrl = new URL(baseUrl);
-  while (true) {
-    let parentUrl = new URL(realBaseUrl);
-    parentUrl.pathname = parentUrl.pathname.split("/").slice(0, -1).join("/");
+
+  const $ = cheerio.load(html);
+
+  $("a").each((_, element) => {
+    const href = $(element).attr("href");
+    if (!href) {
+      return;
+    }
+
+    const parsedUrl = new URL(href, url);
     if (
-      parentUrl.pathname === realBaseUrl.pathname ||
-      parentUrl.pathname === ""
+      parsedUrl.hostname === baseUrl.hostname
+      // parsedUrl.pathname.startsWith(baseUrl.pathname)
     ) {
-      break;
+      links.push(parsedUrl.pathname);
     }
-    const response = await fetch(parentUrl);
-    const redirected = response.url.toString() === baseUrl.toString() + "/";
-    if (!redirected) {
-      break;
-    }
-    realBaseUrl = parentUrl;
+  });
+
+  links = [...new Set(links)].filter((link) => {
+    return (
+      !link.includes("#") &&
+      !IGNORE_PATHS_ENDING_IN.some((ending) => link.endsWith(ending))
+    );
+  });
+
+  return {
+    html,
+    links,
+  };
+}
+
+function splitUrl(url: URL) {
+  const baseUrl = `${url.protocol}//${url.hostname}`;
+  const basePath = url.pathname;
+  return {
+    baseUrl,
+    basePath,
+  };
+}
+
+export type PageData = {
+  url: string;
+  path: string;
+  html: string;
+};
+
+export async function* crawlPage(url: URL): AsyncGenerator<PageData> {
+  const { baseUrl, basePath } = splitUrl(url);
+  let paths: string[] = [basePath];
+
+  if (url.hostname === "github.com") {
+    const githubLinks = await crawlGithubRepo(url);
+    paths = [...paths, ...githubLinks];
   }
-  const visited = new Set<string>();
-  for await (const count of crawlLinks(
-    realBaseUrl.pathname || "/",
-    realBaseUrl,
-    visited,
-  )) {
-    yield count;
+
+  let index = 0;
+
+  while (index < paths.length) {
+    const promises = paths
+      .slice(index, index + 50)
+      .map((path) => getLinksFromUrl(baseUrl, path));
+
+    const results = await Promise.all(promises);
+
+    for (const { html, links } of results) {
+      if (html !== "") {
+        yield {
+          url: url.toString(),
+          path: paths[index],
+          html: html,
+        };
+      }
+
+      for (let link of links) {
+        if (!paths.includes(link)) {
+          paths.push(link);
+        }
+      }
+
+      index++;
+    }
+
+    paths = paths.filter((path) =>
+      results.some(
+        (result) => result.html !== "" && result.links.includes(path),
      ),
+    );
   }
-  return [...visited];
 }
-
-// class NoEscapeTurndownService extends TurndownService {
-//   escape(str: string): string {
-//     return str;
-//   }
-// }
-
-// async function convertURLToMarkdown(url: string): Promise<string> {
-//   try {
-//     const response = await fetch(url);
-//     const htmlContent = await response.text();
-//     const turndown = new NoEscapeTurndownService({
-//       codeBlockStyle: "fenced",
-//       headingStyle: "atx",
-//     }).use([turndownPluginGfm.tables, turndownPluginGfm.strikethrough]);
-//     const markdown = turndown.turndown(htmlContent);
-//     return markdown;
-//   } catch (err) {
-//     console.error(err);
-//     throw new Error("Error converting URL to markdown");
-//   }
-// }
-
-// convertURLToMarkdown("https://python-socketio.readthedocs.io/en/stable").then(
-//   (md) => {
-//     console.log(md);
-//   }
-// );
-
-let visited = new Set();
-const url = new URL("https://python-socketio.readthedocs.io/en/stable");
-const url2 = new URL("https://platform.openai.com/docs/api-reference");
-// crawlLinks(url.pathname, url, visited).then(() => {
-//   console.log(visited);
-// });
-
-// async function hcCrawlLinks() {
-//   const results: any[] = [];
-//   const crawler = await HCCrawler.launch({
-//     // Function to be evaluated in browsers
-//     evaluatePage: () => ({
-//       title: $("title").text(),
-//     }),
-//     // Function to be called with evaluated results from browsers
-//     onSuccess: (result: any) => {
-//       console.log(result);
-//       results.push(result.url);
-//     },
-//   });
-//   // Queue a request
-//   await crawler.queue(url.toString());
-//   await crawler.onIdle(); // Resolved when no queue is left
-//   await crawler.close(); // Close the crawler
-
-//   return results;
-// }
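The depth-first `crawlLinks` recursion is gone: `crawlPage` now runs a breadth-first loop in which `paths` doubles as work queue and visited set, `getLinksFromUrl` is fanned out over batches of 50 paths via `Promise.all`, and every successfully fetched page is yielded as a `PageData`. A sketch of how a consumer drives the generator — the URL and logging are illustrative:

    import { crawlPage } from "./crawl";

    async function crawlAll() {
      for await (const page of crawlPage(new URL("https://docs.example.com"))) {
        // One PageData per fetched subpage; pages whose fetch failed
        // (html === "") are filtered out inside crawlPage itself.
        console.log(page.path, page.html.length);
      }
    }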
diff --git a/core/indexing/docs/db.ts b/core/indexing/docs/db.ts
index 73651452e9..151b220fb4 100644
--- a/core/indexing/docs/db.ts
+++ b/core/indexing/docs/db.ts
@@ -140,3 +140,9 @@ export async function listDocs(): Promise<
   const docs = db.all(`SELECT title, baseUrl FROM docs`);
   return docs;
 }
+
+export async function hasDoc(baseUrl: string) {
+  const db = await getDBDocs();
+  const doc = await db.get(`SELECT title FROM docs WHERE baseUrl = ?`, baseUrl);
+  return !!doc;
+}
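`hasDoc` replaces the listDocs-and-scan pattern in `indexDocs` with a single parameterized lookup. A minimal usage sketch, assuming the same sqlite wrapper; the helper name is hypothetical:

    import { hasDoc } from "./db";

    async function alreadyIndexed(baseUrl: URL): Promise<boolean> {
      // One SELECT ... WHERE baseUrl = ? instead of loading the whole
      // docs table through listDocs() and scanning it in JS.
      return hasDoc(baseUrl.toString());
    }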
diff --git a/core/indexing/docs/index.ts b/core/indexing/docs/index.ts
index 67478a0ceb..89906f0f16 100644
--- a/core/indexing/docs/index.ts
+++ b/core/indexing/docs/index.ts
@@ -4,17 +4,16 @@ import {
   IndexingProgressUpdate,
 } from "../..";

-import { crawlSubpages } from "./crawl";
-import { addDocs, listDocs } from "./db";
-import { urlToArticle, chunkArticle } from "./article";
+import { crawlPage } from "./crawl";
+import { addDocs, hasDoc } from "./db";
+import { pageToArticle, chunkArticle, Article } from "./article";

 export async function* indexDocs(
   title: string,
   baseUrl: URL,
   embeddingsProvider: EmbeddingsProvider,
 ): AsyncGenerator<IndexingProgressUpdate> {
-  const existingDocs = await listDocs();
-  if (existingDocs.find((doc) => doc.baseUrl === baseUrl.toString())) {
+  if (await hasDoc(baseUrl.toString())) {
     yield {
       progress: 1,
       desc: "Already indexed",
@@ -27,36 +26,26 @@ export async function* indexDocs(
     desc: "Finding subpages",
   };

-  const subpathGenerator = crawlSubpages(baseUrl);
-  let { value, done } = await subpathGenerator.next();
-
-  while (true) {
-    if (done) {
-      break;
-    }
+  const articles: Article[] = [];
+
+  for await (const page of crawlPage(baseUrl)) {
+    const article = await pageToArticle(page);
+    if (!article) continue;
+
+    articles.push(article);
+
     yield {
       progress: 0,
-      desc: `Finding subpages (${value})`,
+      desc: `Finding subpages (${page.path})`,
     };
-    const next = await subpathGenerator.next();
-    value = next.value;
-    done = next.done;
   }

-  let subpaths = value as string[];
-
   const chunks: Chunk[] = [];
   const embeddings: number[][] = [];

-  let articles = await Promise.all(
-    subpaths.map(subpath => urlToArticle(subpath, baseUrl)),
-  );
-
   for (const article of articles) {
     if (!article) continue;

     yield {
-      progress: Math.max(1, Math.floor(100 / (subpaths.length + 1))),
+      progress: Math.max(1, Math.floor(100 / (articles.length + 1))),
       desc: `${article.subpath}`,
     };
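Taken together, `indexDocs` now streams pages from `crawlPage`, converts each to an `Article` as it arrives, and only then chunks and embeds; progress messages show the subpage path rather than a running page count. A condensed sketch of the new flow — the chunking and embedding steps live in the patch's elided context, so they are only summarized in comments:

    const articles: Article[] = [];
    for await (const page of crawlPage(baseUrl)) {
      const article = await pageToArticle(page);
      if (!article) continue;
      articles.push(article); // yield a "Finding subpages (<path>)" update here
    }
    for (const article of articles) {
      // chunkArticle(...), embeddingsProvider.embed(...), and addDocs(...)
      // follow, as in the unchanged remainder of the function.
    }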
diff --git a/extensions/vscode/package-lock.json b/extensions/vscode/package-lock.json
index be905ca168..e11749691e 100644
--- a/extensions/vscode/package-lock.json
+++ b/extensions/vscode/package-lock.json
@@ -1,12 +1,12 @@
 {
   "name": "continue",
-  "version": "0.8.18",
+  "version": "0.9.88",
   "lockfileVersion": 3,
   "requires": true,
   "packages": {
     "": {
       "name": "continue",
-      "version": "0.8.18",
+      "version": "0.9.88",
       "license": "Apache-2.0",
       "dependencies": {
         "@electron/rebuild": "^3.2.10",
@@ -80,6 +80,7 @@
         "vsce": "^2.15.0"
       },
       "engines": {
+        "node": ">=20.11.0",
         "vscode": "^1.70.0"
       }
     },
@@ -134,6 +135,9 @@
         "esbuild": "^0.19.11",
         "jest": "^29.7.0",
         "ts-jest": "^29.1.1"
+      },
+      "engines": {
+        "node": ">=20.11.0"
       }
     },
     "node_modules/@75lb/deep-merge": {
diff --git a/extensions/vscode/yarn.lock b/extensions/vscode/yarn.lock
index 66932aca88..58e092a54d 100644
--- a/extensions/vscode/yarn.lock
+++ b/extensions/vscode/yarn.lock
@@ -368,15 +368,15 @@
     tar "^6.0.5"
     yargs "^17.0.1"

-"@esbuild/linux-x64@0.17.19":
+"@esbuild/darwin-arm64@0.17.19":
   version "0.17.19"
-  resolved "https://registry.npmjs.org/@esbuild/linux-x64/-/linux-x64-0.17.19.tgz"
-  integrity sha512-68ngA9lg2H6zkZcyp22tsVt38mlhWde8l3eJLWkyLrp4HwMUr3c1s/M2t7+kHIhvMjglIBrFpncX1SzMckomGw==
+  resolved "https://registry.npmjs.org/@esbuild/darwin-arm64/-/darwin-arm64-0.17.19.tgz"
+  integrity sha512-80wEoCfF/hFKM6WE1FyBHc9SfUblloAWx6FJkFWTWiCoht9Mc0ARGEM47e67W9rI09YoUxJL68WHfDRYEAvOhg==

-"@esbuild/linux-x64@0.18.20":
+"@esbuild/darwin-arm64@0.18.20":
   version "0.18.20"
-  resolved "https://registry.npmjs.org/@esbuild/linux-x64/-/linux-x64-0.18.20.tgz"
-  integrity sha512-UYqiqemphJcNsFEskc73jQ7B9jgwjWrSayxawS6UVFZGWrAAtkzjxSqnoclCXxWtfwLdzU+vTpcNYhpn43uP1w==
+  resolved "https://registry.npmjs.org/@esbuild/darwin-arm64/-/darwin-arm64-0.18.20.tgz"
+  integrity sha512-bxRHW5kHU38zS2lPTPOyuyTm+S+eobPUnTNkdJEfAddYgEcll4xkT8DB9d2008DtTbl7uJag2HuE5NZAZgnNEA==

 "@eslint-community/eslint-utils@^4.2.0":
   version "4.4.0"
@@ -691,10 +691,10 @@
   resolved "https://registry.npmjs.org/@jsdevtools/ono/-/ono-7.1.3.tgz"
   integrity sha512-4JQNk+3mVzK3xh2rqd6RB4J46qUR19azEHBneZyTZM+c456qOrbbM/5xcR8huNCCcbVt7+UmizG6GuUvPvKUYg==

-"@lancedb/vectordb-linux-x64-gnu@0.4.12":
+"@lancedb/vectordb-darwin-arm64@*", "@lancedb/vectordb-darwin-arm64@0.4.12":
   version "0.4.12"
-  resolved "https://registry.npmjs.org/@lancedb/vectordb-linux-x64-gnu/-/vectordb-linux-x64-gnu-0.4.12.tgz"
-  integrity sha512-gJqYR0aymrS+C60xc4EQPzmQ5/69XfeFv2ofBvAj7qW+c6BcnoAcfVl+7s1IrcWeGz251sm5cD5Lx4AzJd89dA==
+  resolved "https://registry.npmjs.org/@lancedb/vectordb-darwin-arm64/-/vectordb-darwin-arm64-0.4.12.tgz"
+  integrity sha512-38/rkJRlWXkPWXuj9onzvbrhnIWcIUQjgEp5G9v5ixPosBowm7A4j8e2Q8CJMsVSNcVX2JLqwWVldiWegZFuYw==

 "@lukeed/csprng@^1.0.0":
   version "1.1.0"
@@ -3444,6 +3444,11 @@ fs.realpath@^1.0.0:
   resolved "https://registry.npmjs.org/fs.realpath/-/fs.realpath-1.0.0.tgz"
   integrity sha512-OO0pH2lK6a0hZnAdau5ItzHPI6pUlvI7jMVnxUQRtw4owF2wk8lOSabtGDCTP4Ggrg2MbGnWO9X8K1t4+fGMDw==

+fsevents@^2.3.2, fsevents@~2.3.2:
+  version "2.3.3"
+  resolved "https://registry.npmjs.org/fsevents/-/fsevents-2.3.3.tgz"
+  integrity sha512-5xoDfX+fL7faATnagmWPpbFtwh/R77WmMMqqHGS65C3vvB0YHrgF+B1YmZ3441tMj5n63k0212XNoJwzlhffQw==
+
 function-bind@^1.1.2:
   version "1.1.2"
   resolved "https://registry.npmjs.org/function-bind/-/function-bind-1.1.2.tgz"
diff --git a/gui/package-lock.json b/gui/package-lock.json
index 7162eef5d5..1a4bab7580 100644
--- a/gui/package-lock.json
+++ b/gui/package-lock.json
@@ -67,6 +67,9 @@
         "tailwindcss": "^3.2.7",
         "typescript": "^4.9.3",
         "vite": "^4.1.0"
+      },
+      "engines": {
+        "node": ">=20.11.0"
       }
     },
     "../core": {
@@ -120,6 +123,9 @@
         "esbuild": "^0.19.11",
         "jest": "^29.7.0",
"ts-jest": "^29.1.1" + }, + "engines": { + "node": ">=20.11.0" } }, "node_modules/@alloc/quick-lru": {