feat: Add Dimension Mismatch Handling for ChromaDB (#157) (#207)

* Fix Issue #157

Add chroma_utils.py to manage chromaDB and added docs for explanation

* Add logging and debugging process

- Enhanced the `reinitialize_chroma_collections` function in `chroma_utils.py` to properly check if collections exist before attempting to delete them, preventing potential errors when collections don't exist.
- Improved error handling in the `_handle_dimension_mismatch` method in `embedding_service.py` by adding more robust exception handling and verification steps after reinitialization.
- Enhanced the collection initialization process in `embedding_service.py` to provide more detailed error messages and better handle cases where collections still have incorrect dimensions after reinitialization.
- Added additional verification steps to ensure that collection dimensions match the expected dimension after creation or retrieval.
- Improved logging throughout the code to provide more context in error messages, making debugging easier.
This commit is contained in:
Xinghan Pan 2025-04-22 10:27:34 +08:00 committed by GitHub
parent e1ae6f5039
commit 5868f94622
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
4 changed files with 396 additions and 35 deletions

View File

@ -1,5 +1,12 @@
import chromadb
import os
import sys
# Add project root to path to import from lpm_kernel
sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))))
from lpm_kernel.api.services.user_llm_config_service import UserLLMConfigService
from lpm_kernel.file_data.chroma_utils import detect_embedding_model_dimension, reinitialize_chroma_collections
def init_chroma_db():
chroma_path = os.getenv("CHROMA_PERSIST_DIRECTORY", "./data/chroma_db")
@ -7,36 +14,69 @@ def init_chroma_db():
# ensure the directory is correct
os.makedirs(chroma_path, exist_ok=True)
# Get embedding model dimension from user config
try:
user_llm_config_service = UserLLMConfigService()
user_llm_config = user_llm_config_service.get_available_llm()
if user_llm_config and user_llm_config.embedding_model_name:
# Detect dimension based on model name
dimension = detect_embedding_model_dimension(user_llm_config.embedding_model_name)
print(f"Detected embedding dimension: {dimension} for model: {user_llm_config.embedding_model_name}")
else:
# Default to OpenAI dimension if no config found
dimension = 1536
print(f"No embedding model configured, using default dimension: {dimension}")
except Exception as e:
# Default to OpenAI dimension if error occurs
dimension = 1536
print(f"Error detecting embedding dimension, using default: {dimension}. Error: {e}")
try:
client = chromadb.PersistentClient(path=chroma_path)
collections_to_init = ["documents", "document_chunks"]
dimension_mismatch_detected = False
# collection: init documents level
try:
documents_collection = client.get_collection(name="documents")
print(f"Collection 'documents' already exists")
except ValueError:
documents_collection = client.create_collection(
name="documents",
metadata={
"hnsw:space": "cosine",
"dimension": 1536
}
)
print(f"Successfully created collection 'documents'")
# collection: init chunk level
try:
chunks_collection = client.get_collection(name="document_chunks")
print(f"Collection 'document_chunks' already exists")
except ValueError:
chunks_collection = client.create_collection(
name="document_chunks",
metadata={
"hnsw:space": "cosine",
"dimension": 1536
}
)
print(f"Successfully created collection 'document_chunks'")
# Check all collections for dimension mismatches first
for collection_name in collections_to_init:
try:
collection = client.get_collection(name=collection_name)
print(f"Collection '{collection_name}' already exists")
# Check if existing collection has the correct dimension
if collection.metadata.get("dimension") != dimension:
print(f"Warning: Existing '{collection_name}' collection has dimension {collection.metadata.get('dimension')}, but current model requires {dimension}")
dimension_mismatch_detected = True
except ValueError:
# Collection doesn't exist yet, will be created later
pass
# Handle dimension mismatch if detected in any collection
if dimension_mismatch_detected:
print("Automatically reinitializing ChromaDB collections with the new dimension...")
if reinitialize_chroma_collections(dimension):
print("Successfully reinitialized ChromaDB collections with the new dimension")
else:
print("Failed to reinitialize ChromaDB collections, you may need to manually delete the data/chroma_db directory")
# Create or get collections with the correct dimension
for collection_name in collections_to_init:
try:
collection = client.get_collection(name=collection_name)
# Verify dimension after possible reinitialization
if collection.metadata.get("dimension") != dimension:
print(f"Error: Collection '{collection_name}' still has incorrect dimension after reinitialization: {collection.metadata.get('dimension')} vs {dimension}")
except ValueError:
# Create collection if it doesn't exist
collection = client.create_collection(
name=collection_name,
metadata={
"hnsw:space": "cosine",
"dimension": dimension
}
)
print(f"Successfully created collection '{collection_name}' with dimension {dimension}")
print(f"ChromaDB initialized at {chroma_path}")
except Exception as e:

View File

@ -0,0 +1,61 @@
# Embedding Model Switching Guide
## Understanding Embedding Dimensions
When using different embedding models (like switching from OpenAI to Ollama models), you may encounter dimension mismatch issues. This happens because different models produce embedding vectors with different dimensions:
| Model | Dimension |
|-------|----------|
| OpenAI text-embedding-ada-002 | 1536 |
| OpenAI text-embedding-3-small | 1536 |
| OpenAI text-embedding-3-large | 3072 |
| Ollama snowflake-arctic-embed | 768 |
| Ollama nomic-embed-text | 768 |
| Ollama mxbai-embed-large | 1024 |
## Handling Dimension Mismatches
Second Me now includes automatic detection and handling of embedding dimension mismatches. When you switch between embedding models with different dimensions, the system will:
1. Detect the dimension of the new embedding model
2. Check if the existing ChromaDB collections have a different dimension
3. If a mismatch is detected, automatically reinitialize the collections with the new dimension
4. Provide clear error messages and logging information about the process
## Recommended Workflow for Switching Models
When switching between embedding models with different dimensions, follow these steps:
1. Update your embedding model configuration in Settings
2. Restart the application to ensure proper initialization
3. If you encounter any issues, you can manually reset the vector database:
- Delete the contents of the `data/chroma_db` directory
- Restart the application
## Troubleshooting
The system now automatically handles dimension mismatches when switching between embedding models. You'll see log messages like:
```
Warning: Existing 'documents' collection has dimension X, but current model requires Y
Automatically reinitializing ChromaDB collections with the new dimension...
Successfully reinitialized ChromaDB collections with the new dimension
```
This indicates that the system has detected and resolved a dimension mismatch automatically. If you still encounter issues after the automatic handling:
1. Check the application logs for any error messages
2. If problems persist, you can manually reset the vector database:
- Stop the application
- Delete the contents of the `data/chroma_db` directory
- Restart the application
## Technical Details
The dimension mismatch handling is implemented in:
- `lpm_kernel/file_data/chroma_utils.py`: Contains utilities for detecting model dimensions and reinitializing collections
- `lpm_kernel/file_data/embedding_service.py`: Handles dimension checking during initialization
- `docker/app/init_chroma.py`: Performs dimension validation during initial setup
The system maintains a mapping of known embedding models to their dimensions and will default to 1536 (OpenAI's dimension) for unknown models.

View File

@ -0,0 +1,155 @@
from typing import Optional, Dict, Any, List, Tuple
import os
import chromadb
import logging
from lpm_kernel.configs.logging import get_train_process_logger
logger = get_train_process_logger()
def get_embedding_dimension(embedding: List[float]) -> int:
    """
    Return the dimension (number of components) of an embedding vector.

    Args:
        embedding: The embedding vector to measure.

    Returns:
        The length of the vector.
    """
    dimension = len(embedding)
    return dimension
def detect_embedding_model_dimension(model_name: str) -> int:
    """
    Detect the dimension of an embedding model based on its name.

    This is a fallback method used when a sample embedding is not available
    to measure the dimension directly.

    Args:
        model_name: The name of the embedding model.

    Returns:
        The known dimension for the model. Unknown models fall back to 1536
        (OpenAI's dimension); this function never returns None.
    """
    # Common embedding model dimensions
    model_dimensions = {
        # OpenAI models
        "text-embedding-ada-002": 1536,
        "text-embedding-3-small": 1536,
        "text-embedding-3-large": 3072,
        # Ollama models
        "snowflake-arctic-embed": 768,
        "snowflake-arctic-embed:110m": 768,
        "nomic-embed-text": 768,
        "nomic-embed-text:v1.5": 768,
        "mxbai-embed-large": 1024,
        "mxbai-embed-large:v1": 1024,
    }
    # Try to find exact match
    if model_name in model_dimensions:
        return model_dimensions[model_name]
    # Try to find partial match (handles prefixed names like "ollama/nomic-embed-text")
    for model, dimension in model_dimensions.items():
        if model in model_name:
            return dimension
    # Default to OpenAI dimension if unknown
    logger.warning(f"Unknown embedding model: {model_name}, defaulting to 1536 dimensions")
    return 1536
def reinitialize_chroma_collections(dimension: int = 1536) -> bool:
    """
    Reinitialize ChromaDB collections with a new dimension.

    Drops and recreates both the "documents" and "document_chunks"
    collections so their stored embedding dimension matches the currently
    configured embedding model, then verifies the result. The per-collection
    logic is shared via a loop so the two collections cannot drift apart.

    Args:
        dimension: The new dimension for the collections.

    Returns:
        True if successful, False otherwise.
    """
    collection_names = ("documents", "document_chunks")
    try:
        chroma_path = os.getenv("CHROMA_PERSIST_DIRECTORY", "./data/chroma_db")
        client = chromadb.PersistentClient(path=chroma_path)

        for name in collection_names:
            # Delete the collection if it exists.
            try:
                # Check if collection exists before attempting to delete
                try:
                    client.get_collection(name=name)
                    client.delete_collection(name=name)
                    logger.info(f"Deleted '{name}' collection")
                except ValueError:
                    # get_collection raises ValueError when the collection is absent
                    logger.info(f"'{name}' collection does not exist, will create new")
            except Exception as e:
                logger.error(f"Error deleting '{name}' collection: {str(e)}", exc_info=True)
                return False

            # Recreate the collection with the new dimension.
            try:
                client.create_collection(
                    name=name,
                    metadata={
                        "hnsw:space": "cosine",
                        "dimension": dimension,
                    },
                )
                logger.info(f"Created '{name}' collection with dimension {dimension}")
            except Exception as e:
                logger.error(f"Error creating '{name}' collection: {str(e)}", exc_info=True)
                return False

        # Verify collections were created with correct dimension
        try:
            for name in collection_names:
                actual_dimension = client.get_collection(name=name).metadata.get("dimension")
                if actual_dimension != dimension:
                    logger.error(f"Verification failed: '{name}' collection has incorrect dimension: {actual_dimension} vs {dimension}")
                    return False
            logger.info(f"Verification successful: Both collections have correct dimension: {dimension}")
        except Exception as e:
            logger.error(f"Error verifying collections: {str(e)}", exc_info=True)
            return False

        return True
    except Exception as e:
        logger.error(f"Error reinitializing ChromaDB collections: {str(e)}", exc_info=True)
        return False

View File

@ -12,19 +12,85 @@ logger = get_train_process_logger()
class EmbeddingService:
def __init__(self):
from lpm_kernel.file_data.chroma_utils import detect_embedding_model_dimension
from lpm_kernel.api.services.user_llm_config_service import UserLLMConfigService
chroma_path = os.getenv("CHROMA_PERSIST_DIRECTORY", "./data/chroma_db")
self.client = chromadb.PersistentClient(path=chroma_path)
self.llm_client = LLMClient()
# Get embedding model dimension from user config
try:
user_llm_config_service = UserLLMConfigService()
user_llm_config = user_llm_config_service.get_available_llm()
if user_llm_config and user_llm_config.embedding_model_name:
# Detect dimension based on model name
self.dimension = detect_embedding_model_dimension(user_llm_config.embedding_model_name)
logger.info(f"Detected embedding dimension: {self.dimension} for model: {user_llm_config.embedding_model_name}")
else:
# Default to OpenAI dimension if no config found
self.dimension = 1536
logger.info(f"No embedding model configured, using default dimension: {self.dimension}")
except Exception as e:
# Default to OpenAI dimension if error occurs
self.dimension = 1536
logger.error(f"Error detecting embedding dimension, using default: {self.dimension}. Error: {str(e)}", exc_info=True)
# document level collection
self.document_collection = self.client.get_or_create_collection(
name="documents", metadata={"hnsw:space": "cosine", "dimension": 1536}
)
# Check for dimension mismatches in all collections first
collections_to_init = ["documents", "document_chunks"]
dimension_mismatch_detected = False
# First pass: check all collections for dimension mismatches
for collection_name in collections_to_init:
try:
collection = self.client.get_collection(name=collection_name)
if collection.metadata.get("dimension") != self.dimension:
logger.warning(f"Dimension mismatch in '{collection_name}' collection: {collection.metadata.get('dimension')} vs {self.dimension}")
dimension_mismatch_detected = True
except ValueError:
# Collection doesn't exist yet, will be created later
pass
# Handle dimension mismatch if detected in any collection
if dimension_mismatch_detected:
self._handle_dimension_mismatch()
# Second pass: create or get collections with the correct dimension
try:
self.document_collection = self.client.get_collection(name="documents")
# Verify dimension after possible reinitialization
doc_dimension = self.document_collection.metadata.get("dimension")
if doc_dimension != self.dimension:
logger.error(f"Collection 'documents' still has incorrect dimension after reinitialization: {doc_dimension} vs {self.dimension}")
# Try to reinitialize again if dimension is still incorrect
raise RuntimeError(f"Failed to set correct dimension for 'documents' collection: {doc_dimension} vs {self.dimension}")
except ValueError:
# Collection doesn't exist, create it with the correct dimension
try:
self.document_collection = self.client.create_collection(
name="documents", metadata={"hnsw:space": "cosine", "dimension": self.dimension}
)
logger.info(f"Created 'documents' collection with dimension {self.dimension}")
except Exception as e:
logger.error(f"Failed to create 'documents' collection: {str(e)}", exc_info=True)
raise RuntimeError(f"Failed to create 'documents' collection: {str(e)}")
# chunk level collection
self.chunk_collection = self.client.get_or_create_collection(
name="document_chunks", metadata={"hnsw:space": "cosine", "dimension": 1536}
)
try:
self.chunk_collection = self.client.get_collection(name="document_chunks")
# Verify dimension after possible reinitialization
chunk_dimension = self.chunk_collection.metadata.get("dimension")
if chunk_dimension != self.dimension:
logger.error(f"Collection 'document_chunks' still has incorrect dimension after reinitialization: {chunk_dimension} vs {self.dimension}")
# Try to reinitialize again if dimension is still incorrect
raise RuntimeError(f"Failed to set correct dimension for 'document_chunks' collection: {chunk_dimension} vs {self.dimension}")
except ValueError:
# Collection doesn't exist, create it with the correct dimension
try:
self.chunk_collection = self.client.create_collection(
name="document_chunks", metadata={"hnsw:space": "cosine", "dimension": self.dimension}
)
logger.info(f"Created 'document_chunks' collection with dimension {self.dimension}")
def generate_document_embedding(self, document: DocumentDTO) -> List[float]:
"""Process document level embedding and store in ChromaDB"""
@ -233,6 +299,45 @@ class EmbeddingService:
)
raise
def _handle_dimension_mismatch(self):
    """
    Handle a dimension mismatch between the current embedding model and the
    ChromaDB collections.

    Reinitializes both collections ("documents" and "document_chunks") with
    ``self.dimension``, refreshes the cached collection references, and
    verifies the resulting dimensions.

    Raises:
        RuntimeError: If reinitialization fails, the refreshed collections
            cannot be fetched, or their dimensions are still incorrect.
    """
    from lpm_kernel.file_data.chroma_utils import reinitialize_chroma_collections

    logger.warning(f"Detected dimension mismatch in ChromaDB collections. Reinitializing with dimension {self.dimension}")
    # Log the operation for better debugging
    logger.info(f"Calling reinitialize_chroma_collections with dimension {self.dimension}")
    try:
        if not reinitialize_chroma_collections(self.dimension):
            logger.error("Failed to reinitialize ChromaDB collections")
            raise RuntimeError("Failed to handle dimension mismatch in ChromaDB collections")
        logger.info(f"Successfully reinitialized ChromaDB collections with dimension {self.dimension}")

        # Refresh cached collection references so later calls use the new collections.
        try:
            self.document_collection = self.client.get_collection(name="documents")
            self.chunk_collection = self.client.get_collection(name="document_chunks")
        except Exception as e:
            logger.error(f"Error refreshing collection references: {str(e)}", exc_info=True)
            raise RuntimeError(f"Failed to refresh ChromaDB collections after reinitialization: {str(e)}") from e

        # Double-check dimensions after refresh.
        doc_dimension = self.document_collection.metadata.get("dimension")
        chunk_dimension = self.chunk_collection.metadata.get("dimension")
        if doc_dimension != self.dimension or chunk_dimension != self.dimension:
            logger.error(f"Dimension mismatch after refresh: documents={doc_dimension}, chunks={chunk_dimension}, expected={self.dimension}")
            raise RuntimeError(f"Failed to handle dimension mismatch: collections have incorrect dimensions after reinitialization")
    except RuntimeError:
        # Already one of our own descriptive errors — re-raise as-is instead of
        # double-wrapping it (the original code nested the messages twice).
        raise
    except Exception as e:
        logger.error(f"Error during dimension mismatch handling: {str(e)}", exc_info=True)
        raise RuntimeError(f"Failed to handle dimension mismatch in ChromaDB collections: {str(e)}") from e
def search_similar_chunks(
self, query: str, limit: int = 5
) -> List[Tuple[ChunkDTO, float]]: