import { Octokit } from "@octokit/rest";
import * as cheerio from "cheerio";
import fetch from "node-fetch";
import { URL } from "node:url";

// Index, search, and changelog pages that should not be crawled as content.
const IGNORE_PATHS_ENDING_IN = [
  "favicon.ico",
  "robots.txt",
  ".rst.txt",
  "genindex",
  "py-modindex",
  "search.html",
  "search",
  "genindex.html",
  "changelog",
  "changelog.html",
];

// Matches Markdown and MDX file extensions.
const markdownRegex = /\.(md|mdx)$/;
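
/**
 * Looks up the default branch (e.g. "main" or "master") of a GitHub
 * repository via the REST API.
 */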
async function getDefaultBranch(owner: string, repo: string): Promise<string> {
  // Unauthenticated client; subject to GitHub's lower rate limits.
  const octokit = new Octokit({ auth: undefined });

  const repoInfo = await octokit.repos.get({
    owner,
    repo,
  });

  return repoInfo.data.default_branch;
}
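
/**
 * Lists every Markdown/MDX file in a GitHub repository by walking the git
 * tree of its default branch, returning paths of the form
 * "/<owner>/<repo>/tree/<branch>/<file>".
 */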
async function crawlGithubRepo(baseUrl: URL) {
  const octokit = new Octokit({
    auth: undefined,
  });

  const [_, owner, repo] = baseUrl.pathname.split("/");

  const branch = await getDefaultBranch(owner, repo);
  console.log("GitHub repo detected. Crawling", branch, "branch");

  // Fetch the repository's full file tree in a single recursive request.
  const tree = await octokit.request(
    "GET /repos/{owner}/{repo}/git/trees/{tree_sha}",
    {
      owner,
      repo,
      tree_sha: branch,
      headers: {
        "X-GitHub-Api-Version": "2022-11-28",
      },
      recursive: "true",
    },
  );

  // Keep only Markdown/MDX blobs and build their paths under the default
  // branch rather than assuming it is named "main".
  const paths = tree.data.tree
    .filter(
      (file: any) =>
        file.type === "blob" && markdownRegex.test(file.path ?? ""),
    )
    .map((file: any) => `${baseUrl.pathname}/tree/${branch}/${file.path}`);

  return paths;
}
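
/**
 * Fetches a page and extracts the pathnames of all same-host links.
 * Returns empty results on fetch errors; skips link extraction for GitHub
 * pages, whose files are enumerated via the API instead.
 */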
async function getLinksFromUrl(url: string, path: string) {
  const baseUrl = new URL(url);
  const location = new URL(path, url);
  let response;

  try {
    response = await fetch(location.toString());
  } catch (error: unknown) {
    if (error instanceof Error && error.message.includes("maximum redirect")) {
      console.error("Maximum redirect reached for:", location.toString());
    } else {
      console.error(error);
    }
    return {
      html: "",
      links: [],
    };
  }

  const html = await response.text();
  let links: string[] = [];

  // GitHub repos are enumerated via the API, so don't scrape their HTML for links.
  if (url.includes("github.com")) {
    return {
      html,
      links,
    };
  }

  const $ = cheerio.load(html);

  $("a").each((_, element) => {
    const href = $(element).attr("href");
    if (!href) {
      return;
    }

    // Resolve relative hrefs and keep only links on the same host.
    const parsedUrl = new URL(href, url);
    if (parsedUrl.hostname === baseUrl.hostname) {
      links.push(parsedUrl.pathname);
    }
  });

  // Deduplicate, then drop anchors and index/search/changelog pages.
  links = [...new Set(links)].filter((link) => {
    return (
      !link.includes("#") &&
      !IGNORE_PATHS_ENDING_IN.some((ending) => link.endsWith(ending))
    );
  });

  return {
    html,
    links,
  };
}
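
/**
 * Splits a URL into its origin ("protocol://host[:port]") and pathname.
 */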
function splitUrl(url: URL) {
  const baseUrl = `${url.protocol}//${url.hostname}${
    url.port ? ":" + url.port : ""
  }`;
  const basePath = url.pathname;
  return {
    baseUrl,
    basePath,
  };
}
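
/** A crawled page: the crawl's root URL, the page's path, and its raw HTML. */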
export type PageData = {
  url: string;
  path: string;
  html: string;
};
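
/**
 * Breadth-first crawl starting from `url`, yielding each page as it is
 * fetched. Pages are fetched in batches of 50, and links are followed
 * until `maxDepth`. GitHub URLs are seeded with every Markdown file in
 * the repository.
 */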
export async function* crawlPage(
  url: URL,
  maxDepth: number = 3,
): AsyncGenerator<PageData> {
  console.log(
    `Starting crawl from: ${url.toString()} - Max Depth: ${maxDepth}`,
  );

  const { baseUrl, basePath } = splitUrl(url);
  let paths: { path: string; depth: number }[] = [{ path: basePath, depth: 0 }];

  if (url.hostname === "github.com") {
    // Seed the queue with every Markdown file in the repository.
    const githubLinks = await crawlGithubRepo(url);
    const githubLinkObjects = githubLinks.map((link) => ({
      path: link,
      depth: 0,
    }));
    paths = [...paths, ...githubLinkObjects];
  }

  let index = 0;
  while (index < paths.length) {
    // Fetch up to 50 pages concurrently per batch.
    const batch = paths.slice(index, index + 50);

    try {
      const promises = batch.map(({ path, depth }) =>
        getLinksFromUrl(baseUrl, path).then((links) => ({
          links,
          path,
          depth,
        })),
      );

      const results = await Promise.all(promises);
      for (const {
        links: { html, links: linksArray },
        path,
        depth,
      } of results) {
        // Yield the page only if it was fetched successfully and is in range.
        if (html !== "" && depth <= maxDepth) {
          yield {
            url: url.toString(),
            path,
            html,
          };
        }

        // Only enqueue newly discovered links while within the depth limit.
        if (depth < maxDepth) {
          for (const link of linksArray) {
            if (!paths.some((p) => p.path === link)) {
              paths.push({ path: link, depth: depth + 1 });
            }
          }
        }
      }
    } catch (e) {
      if (e instanceof TypeError) {
        // Likely an invalid URL; log and continue with the rest of the crawl.
        console.warn("Error while crawling page:", e);
      } else {
        console.error("Error while crawling page:", e);
      }
    }

    // Advance to the next batch.
    index += batch.length;
  }
  console.log("Crawl completed");
}
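
// Example usage (illustrative sketch; the docs URL is a placeholder):
//
//   for await (const page of crawlPage(new URL("https://docs.example.com"), 2)) {
//     console.log(page.path, page.html.length);
//   }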