🚧 ChunkCodebaseIndex

This commit is contained in:
Nate Sesti 2024-01-28 17:47:01 -08:00
parent 11cf98d255
commit 1d289383c6
5 changed files with 141 additions and 10 deletions

View File

@ -1,2 +1,3 @@
**/*.run.xml
archive/**/*
archive/**/*
extensions/vscode/models/**/*

View File

@ -6,7 +6,7 @@ import { MAX_CHUNK_SIZE } from "../llm/constants";
import { getBasename } from "../util";
import { getLanceDbPath } from "../util/paths";
import { chunkDocument } from "./chunk/chunk";
import { DatabaseConnection, SqliteDb } from "./refreshIndex";
import { DatabaseConnection, SqliteDb, tagToString } from "./refreshIndex";
import {
CodebaseIndex,
IndexResultType,
@ -15,10 +15,6 @@ import {
RefreshIndexResults,
} from "./types";
/**
 * Serialises an index tag into a single string key.
 *
 * @param tag - The tag whose `directory`, `branch` and `artifactId` are combined.
 * @returns The three fields, in that order, separated by `::`.
 */
export function tagToString(tag: IndexTag): string {
  const separator = "::";
  return "".concat(
    tag.directory,
    separator,
    tag.branch,
    separator,
    tag.artifactId
  );
}
// LanceDB converts to lowercase, so names must all be lowercase
interface LanceDbRow {
uuid: string;

View File

@ -0,0 +1,127 @@
import { IndexingProgressUpdate } from "../..";
import { MAX_CHUNK_SIZE } from "../../llm/constants";
import { getBasename } from "../../util";
import { DatabaseConnection, SqliteDb, tagToString } from "../refreshIndex";
import {
CodebaseIndex,
IndexResultType,
IndexTag,
MarkCompleteCallback,
RefreshIndexResults,
} from "../types";
import { chunkDocument } from "./chunk";
/**
 * Codebase index that splits files into chunks and stores them in SQLite.
 *
 * Chunks live in the `chunks` table; the `chunk_tags` table records which
 * tags (as produced by `tagToString`) each chunk belongs to.
 */
export class ChunkCodebaseIndex implements CodebaseIndex {
  artifactId: string = "chunks";
  readFile: (filepath: string) => Promise<string>;

  /**
   * @param readFile - Callback used to load the contents of a file by path.
   */
  constructor(readFile: (filepath: string) => Promise<string>) {
    this.readFile = readFile;
  }

  /**
   * Creates the `chunks` and `chunk_tags` tables if they do not exist yet.
   */
  private async _createTables(db: DatabaseConnection) {
    await db.exec(`CREATE TABLE IF NOT EXISTS chunks (
      id INTEGER PRIMARY KEY AUTOINCREMENT,
      cacheKey TEXT NOT NULL,
      path TEXT NOT NULL,
      idx INTEGER NOT NULL,
      startLine INTEGER NOT NULL,
      endLine INTEGER NOT NULL,
      content TEXT NOT NULL
    )`);

    await db.exec(`CREATE TABLE IF NOT EXISTS chunk_tags (
      id INTEGER PRIMARY KEY AUTOINCREMENT,
      tag TEXT NOT NULL,
      chunkId INTEGER NOT NULL,
      FOREIGN KEY (chunkId) REFERENCES chunks (id)
    )`);
  }

  /**
   * Applies the four categories of refresh results to the chunk tables.
   *
   * - `compute`: reads and chunks each file, inserts the chunks and tags them.
   * - `addTag`: tags the already-stored chunks that match the item's cache key.
   * - `removeTag`: removes this tag from the chunks matching the item's cache key.
   * - `del`: removes the chunks matching the item's cache key and their tag rows.
   *
   * `markComplete` is invoked once per processed item. A progress update is
   * yielded after each file in `compute` has been chunked.
   *
   * @param tag - Tag identifying the index being updated.
   * @param results - Items to compute, tag, untag and delete.
   * @param markComplete - Callback invoked after each item has been handled.
   */
  async *update(
    tag: IndexTag,
    results: RefreshIndexResults,
    markComplete: MarkCompleteCallback
  ): AsyncGenerator<IndexingProgressUpdate, any, unknown> {
    const db = await SqliteDb.get();
    await this._createTables(db);
    const tagString = tagToString(tag);

    // Compute chunks for new files
    const contents = await Promise.all(
      results.compute.map(({ path }) => this.readFile(path))
    );

    // The loop header is the only place `i` advances; an extra increment in
    // the body would silently skip every second file.
    for (let i = 0; i < results.compute.length; i++) {
      const item = results.compute[i];

      // Insert chunks
      for await (const chunk of chunkDocument(
        item.path,
        contents[i],
        MAX_CHUNK_SIZE,
        item.cacheKey
      )) {
        const { lastID } = await db.run(
          `INSERT INTO chunks (cacheKey, path, idx, startLine, endLine, content) VALUES (?, ?, ?, ?, ?, ?)`,
          [
            chunk.digest,
            chunk.filepath,
            chunk.index,
            chunk.startLine,
            chunk.endLine,
            chunk.content,
          ]
        );

        await db.run(`INSERT INTO chunk_tags (chunkId, tag) VALUES (?, ?)`, [
          lastID,
          tagString,
        ]);
      }

      yield {
        progress: i / results.compute.length,
        desc: `Chunking ${getBasename(item.path)}`,
      };
      markComplete([item], IndexResultType.Compute);
    }

    // Add tag
    for (const item of results.addTag) {
      const chunksWithPath = await db.all(
        `SELECT * FROM chunks WHERE cacheKey = ?`,
        [item.cacheKey]
      );

      for (const chunk of chunksWithPath) {
        await db.run(`INSERT INTO chunk_tags (chunkId, tag) VALUES (?, ?)`, [
          chunk.id,
          tagString,
        ]);
      }

      markComplete([item], IndexResultType.AddTag);
    }

    // Remove tag: only untag the chunks belonging to this item, not every
    // chunk that carries the tag.
    for (const item of results.removeTag) {
      await db.run(
        `DELETE FROM chunk_tags
         WHERE tag = ?
           AND chunkId IN (SELECT id FROM chunks WHERE cacheKey = ?)`,
        [tagString, item.cacheKey]
      );
      markComplete([item], IndexResultType.RemoveTag);
    }

    // Delete
    for (const item of results.del) {
      // Remove the tag rows first, while the chunk ids can still be looked up.
      // `lastID` of a DELETE statement is not a chunk id, so it cannot be used
      // to find the rows to clean up.
      await db.run(
        `DELETE FROM chunk_tags
         WHERE chunkId IN (SELECT id FROM chunks WHERE cacheKey = ?)`,
        [item.cacheKey]
      );

      await db.run(`DELETE FROM chunks WHERE cacheKey = ?`, [item.cacheKey]);

      markComplete([item], IndexResultType.Delete);
    }
  }
}

View File

@ -15,6 +15,10 @@ import {
export type DatabaseConnection = Database<sqlite3.Database>;
/**
 * Builds the string identifier for an index tag.
 *
 * @param tag - Tag providing the `directory`, `branch` and `artifactId` parts.
 * @returns `directory`, `branch` and `artifactId` joined with `::` separators.
 */
export function tagToString(tag: IndexTag): string {
  const { directory, branch, artifactId } = tag;
  const key = `${directory}::${branch}::${artifactId}`;
  return key;
}
export class SqliteDb {
static db: DatabaseConnection | null = null;

View File

@ -1,4 +1,5 @@
import { LanceDbIndex } from "core/indexing/LanceDbIndex";
import { ChunkCodebaseIndex } from "core/indexing/chunk/ChunkCodebaseIndex";
import { getComputeDeleteAddRemove } from "core/indexing/refreshIndex";
import { CodebaseIndex, IndexTag, LastModifiedMap } from "core/indexing/types";
import * as vscode from "vscode";
@ -23,11 +24,13 @@ const vscodeGetStats = async (
};
/**
 * Creates the codebase indexes that should be refreshed for the workspace.
 *
 * Loads the configuration through a fresh `VsCodeIde` and returns a
 * `LanceDbIndex` (using the configured embeddings provider) followed by a
 * `ChunkCodebaseIndex`. Both read file contents through the IDE.
 *
 * @returns The indexes to build, in the order they are listed here.
 */
async function getIndexesToBuild(): Promise<CodebaseIndex[]> {
  const ide = new VsCodeIde();
  const config = await configHandler.loadConfig(ide);

  // Wrap the method so it is always invoked with `ide` as its receiver.
  const readFile = (filepath: string) => ide.readFile(filepath);

  const indexes: CodebaseIndex[] = [
    new LanceDbIndex(config.embeddingsProvider, readFile),
    new ChunkCodebaseIndex(readFile),
  ];
  return indexes;
}
@ -66,7 +69,7 @@ export async function vsCodeIndexCodebase(workspaceDirs: string[]) {
(filepath) => ideProtocolClient.readFile(filepath)
);
// console.log("RESULTS: ", results);
console.log("RESULTS: ", codebaseIndex.artifactId, results);
for await (let { progress, desc } of codebaseIndex.update(
tag,