continue/core/indexing/docs/crawl.ts

213 lines
4.7 KiB
TypeScript

import { Octokit } from "@octokit/rest";
import cheerio from "cheerio";
import fetch from "node-fetch";
import { URL } from "node:url";
const IGNORE_PATHS_ENDING_IN = [
"favicon.ico",
"robots.txt",
".rst.txt",
"genindex",
"py-modindex",
"search.html",
"search",
"genindex.html",
"changelog",
"changelog.html",
];
const markdownRegex = new RegExp(/\.(md|mdx)$/);
async function getDefaultBranch(owner: string, repo: string): Promise<string> {
const octokit = new Octokit({ auth: undefined });
const repoInfo = await octokit.repos.get({
owner,
repo,
});
return repoInfo.data.default_branch;
}
async function crawlGithubRepo(baseUrl: URL) {
const octokit = new Octokit({
auth: undefined,
});
const [_, owner, repo] = baseUrl.pathname.split("/");
const branch = await getDefaultBranch(owner, repo);
console.log("Github repo detected. Crawling", branch, "branch");
const tree = await octokit.request(
"GET /repos/{owner}/{repo}/git/trees/{tree_sha}",
{
owner,
repo,
tree_sha: branch,
headers: {
"X-GitHub-Api-Version": "2022-11-28",
},
recursive: "true",
},
);
const paths = tree.data.tree
.filter(
(file: any) =>
file.type === "blob" && markdownRegex.test(file.path ?? ""),
)
.map((file: any) => baseUrl.pathname + "/tree/main/" + file.path);
return paths;
}
async function getLinksFromUrl(url: string, path: string) {
const baseUrl = new URL(url);
const location = new URL(path, url);
let response;
try {
response = await fetch(location.toString());
} catch (error: unknown) {
if (error instanceof Error && error.message.includes("maximum redirect")) {
console.error("Maximum redirect reached for: ", location.toString());
return {
html: "",
links: [],
};
}
console.error(error);
return {
html: "",
links: [],
};
}
const html = await response.text();
let links: string[] = [];
if (url.includes("github.com")) {
return {
html,
links,
};
}
const $ = cheerio.load(html);
$("a").each((_, element) => {
const href = $(element).attr("href");
if (!href) {
return;
}
const parsedUrl = new URL(href, url);
if (
parsedUrl.hostname === baseUrl.hostname
// parsedUrl.pathname.startsWith(baseUrl.pathname)
) {
links.push(parsedUrl.pathname);
}
});
links = [...new Set(links)].filter((link) => {
return (
!link.includes("#") &&
!IGNORE_PATHS_ENDING_IN.some((ending) => link.endsWith(ending))
);
});
return {
html,
links,
};
}
function splitUrl(url: URL) {
const baseUrl = `${url.protocol}//${url.hostname}${
url.port ? ":" + url.port : ""
}`;
const basePath = url.pathname;
return {
baseUrl,
basePath,
};
}
export type PageData = {
url: string;
path: string;
html: string;
};
export async function* crawlPage(
url: URL,
maxDepth: number = 3,
): AsyncGenerator<PageData> {
console.log(
`Starting crawl from: ${url.toString()} - Max Depth: ${maxDepth}`,
);
const { baseUrl, basePath } = splitUrl(url);
let paths: { path: string; depth: number }[] = [{ path: basePath, depth: 0 }];
if (url.hostname === "github.com") {
const githubLinks = await crawlGithubRepo(url);
const githubLinkObjects = githubLinks.map((link) => ({
path: link,
depth: 0,
}));
paths = [...paths, ...githubLinkObjects];
}
let index = 0;
while (index < paths.length) {
const batch = paths.slice(index, index + 50);
try {
const promises = batch.map(({ path, depth }) =>
getLinksFromUrl(baseUrl, path).then((links) => ({
links,
path,
depth,
})),
); // Adjust for depth tracking
const results = await Promise.all(promises);
for (const {
links: { html, links: linksArray },
path,
depth,
} of results) {
if (html !== "" && depth <= maxDepth) {
// Check depth
yield {
url: url.toString(),
path,
html,
};
}
// Ensure we only add links if within depth limit
if (depth < maxDepth) {
for (let link of linksArray) {
if (!paths.some((p) => p.path === link)) {
paths.push({ path: link, depth: depth + 1 }); // Increment depth for new paths
}
}
}
}
} catch (e) {
if (e instanceof TypeError) {
console.warn("Error while crawling page: ", e); // Likely an invalid url, continue with process
} else {
console.error("Error while crawling page: ", e);
}
}
index += batch.length; // Proceed to next batch
}
console.log("Crawl completed");
}