continue/core/context/retrieval/pipelines/BaseRetrievalPipeline.ts

100 lines
2.7 KiB
TypeScript

import {
BranchAndDir,
Chunk,
EmbeddingsProvider,
IDE,
Reranker,
} from "../../../index.js";
import { chunkDocument } from "../../../indexing/chunk/chunk.js";
import { LanceDbIndex } from "../../../indexing/LanceDbIndex.js";
import { MAX_CHUNK_SIZE } from "../../../llm/constants.js";
import { retrieveFts } from "../fullTextSearch.js";
import { recentlyEditedFilesCache } from "../recentlyEditedFilesCache.js";
export interface RetrievalPipelineOptions {
ide: IDE;
embeddingsProvider: EmbeddingsProvider;
reranker: Reranker | undefined;
input: string;
nRetrieve: number;
nFinal: number;
tags: BranchAndDir[];
filterDirectory?: string;
}
export interface IRetrievalPipeline {
run(options: RetrievalPipelineOptions): Promise<Chunk[]>;
}
export default class BaseRetrievalPipeline implements IRetrievalPipeline {
private lanceDbIndex: LanceDbIndex;
constructor(protected readonly options: RetrievalPipelineOptions) {
this.lanceDbIndex = new LanceDbIndex(options.embeddingsProvider, (path) =>
options.ide.readFile(path),
);
}
protected async retrieveAndChunkRecentlyEditedFiles(
n: number,
): Promise<Chunk[]> {
const recentlyEditedFilesSlice = Array.from(
recentlyEditedFilesCache.keys(),
).slice(0, n);
// If the number of recently edited files is less than the retrieval limit,
// include additional open files. This is useful in the case where a user
// has many tabs open and reloads their IDE. They now have 0 recently edited files,
// but many open tabs that represent what they were working on prior to reload.
if (recentlyEditedFilesSlice.length < n) {
const openFiles = await this.options.ide.getOpenFiles();
recentlyEditedFilesSlice.push(
...openFiles.slice(0, n - recentlyEditedFilesSlice.length),
);
}
const chunks: Chunk[] = [];
for (const filepath of recentlyEditedFilesSlice) {
const contents = await this.options.ide.readFile(filepath);
const fileChunks = chunkDocument({
filepath,
contents,
maxChunkSize: MAX_CHUNK_SIZE,
digest: filepath,
});
for await (const chunk of fileChunks) {
chunks.push(chunk);
}
}
return chunks;
}
protected async retrieveFts(input: string, n: number): Promise<Chunk[]> {
return retrieveFts(
input,
n,
this.options.tags,
this.options.filterDirectory,
);
}
protected async retrieveEmbeddings(
input: string,
n: number,
): Promise<Chunk[]> {
return this.lanceDbIndex.retrieve(
input,
n,
this.options.tags,
this.options.filterDirectory,
);
}
run(): Promise<Chunk[]> {
throw new Error("Not implemented");
}
}