* Fix Issue #157: add chroma_utils.py to manage ChromaDB and add docs for explanation * Add logging and debugging process - Enhanced the `reinitialize_chroma_collections` function in `chroma_utils.py` to properly check if collections exist before attempting to delete them, preventing potential errors when collections don't exist. - Improved error handling in the `_handle_dimension_mismatch` method in `embedding_service.py` by adding more robust exception handling and verification steps after reinitialization. - Enhanced the collection initialization process in `embedding_service.py` to provide more detailed error messages and better handle cases where collections still have incorrect dimensions after reinitialization. - Added additional verification steps to ensure that collection dimensions match the expected dimension after creation or retrieval. - Improved logging throughout the code to provide more context in error messages, making debugging easier.
This commit is contained in:
parent
e1ae6f5039
commit
5868f94622
|
@ -1,5 +1,12 @@
|
|||
import chromadb
|
||||
import os
|
||||
import sys
|
||||
|
||||
# Add project root to path to import from lpm_kernel
|
||||
sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))))
|
||||
|
||||
from lpm_kernel.api.services.user_llm_config_service import UserLLMConfigService
|
||||
from lpm_kernel.file_data.chroma_utils import detect_embedding_model_dimension, reinitialize_chroma_collections
|
||||
|
||||
def init_chroma_db():
|
||||
chroma_path = os.getenv("CHROMA_PERSIST_DIRECTORY", "./data/chroma_db")
|
||||
|
@ -7,36 +14,69 @@ def init_chroma_db():
|
|||
# ensure the directory is correct
|
||||
os.makedirs(chroma_path, exist_ok=True)
|
||||
|
||||
# Get embedding model dimension from user config
|
||||
try:
|
||||
user_llm_config_service = UserLLMConfigService()
|
||||
user_llm_config = user_llm_config_service.get_available_llm()
|
||||
|
||||
if user_llm_config and user_llm_config.embedding_model_name:
|
||||
# Detect dimension based on model name
|
||||
dimension = detect_embedding_model_dimension(user_llm_config.embedding_model_name)
|
||||
print(f"Detected embedding dimension: {dimension} for model: {user_llm_config.embedding_model_name}")
|
||||
else:
|
||||
# Default to OpenAI dimension if no config found
|
||||
dimension = 1536
|
||||
print(f"No embedding model configured, using default dimension: {dimension}")
|
||||
except Exception as e:
|
||||
# Default to OpenAI dimension if error occurs
|
||||
dimension = 1536
|
||||
print(f"Error detecting embedding dimension, using default: {dimension}. Error: {e}")
|
||||
|
||||
try:
|
||||
client = chromadb.PersistentClient(path=chroma_path)
|
||||
collections_to_init = ["documents", "document_chunks"]
|
||||
dimension_mismatch_detected = False
|
||||
|
||||
# collection: init documents level
|
||||
try:
|
||||
documents_collection = client.get_collection(name="documents")
|
||||
print(f"Collection 'documents' already exists")
|
||||
except ValueError:
|
||||
documents_collection = client.create_collection(
|
||||
name="documents",
|
||||
metadata={
|
||||
"hnsw:space": "cosine",
|
||||
"dimension": 1536
|
||||
}
|
||||
)
|
||||
print(f"Successfully created collection 'documents'")
|
||||
|
||||
# collection: init chunk level
|
||||
try:
|
||||
chunks_collection = client.get_collection(name="document_chunks")
|
||||
print(f"Collection 'document_chunks' already exists")
|
||||
except ValueError:
|
||||
chunks_collection = client.create_collection(
|
||||
name="document_chunks",
|
||||
metadata={
|
||||
"hnsw:space": "cosine",
|
||||
"dimension": 1536
|
||||
}
|
||||
)
|
||||
print(f"Successfully created collection 'document_chunks'")
|
||||
# Check all collections for dimension mismatches first
|
||||
for collection_name in collections_to_init:
|
||||
try:
|
||||
collection = client.get_collection(name=collection_name)
|
||||
print(f"Collection '{collection_name}' already exists")
|
||||
|
||||
# Check if existing collection has the correct dimension
|
||||
if collection.metadata.get("dimension") != dimension:
|
||||
print(f"Warning: Existing '{collection_name}' collection has dimension {collection.metadata.get('dimension')}, but current model requires {dimension}")
|
||||
dimension_mismatch_detected = True
|
||||
except ValueError:
|
||||
# Collection doesn't exist yet, will be created later
|
||||
pass
|
||||
|
||||
# Handle dimension mismatch if detected in any collection
|
||||
if dimension_mismatch_detected:
|
||||
print("Automatically reinitializing ChromaDB collections with the new dimension...")
|
||||
if reinitialize_chroma_collections(dimension):
|
||||
print("Successfully reinitialized ChromaDB collections with the new dimension")
|
||||
else:
|
||||
print("Failed to reinitialize ChromaDB collections, you may need to manually delete the data/chroma_db directory")
|
||||
|
||||
# Create or get collections with the correct dimension
|
||||
for collection_name in collections_to_init:
|
||||
try:
|
||||
collection = client.get_collection(name=collection_name)
|
||||
# Verify dimension after possible reinitialization
|
||||
if collection.metadata.get("dimension") != dimension:
|
||||
print(f"Error: Collection '{collection_name}' still has incorrect dimension after reinitialization: {collection.metadata.get('dimension')} vs {dimension}")
|
||||
except ValueError:
|
||||
# Create collection if it doesn't exist
|
||||
collection = client.create_collection(
|
||||
name=collection_name,
|
||||
metadata={
|
||||
"hnsw:space": "cosine",
|
||||
"dimension": dimension
|
||||
}
|
||||
)
|
||||
print(f"Successfully created collection '{collection_name}' with dimension {dimension}")
|
||||
|
||||
|
||||
print(f"ChromaDB initialized at {chroma_path}")
|
||||
except Exception as e:
|
||||
|
|
|
@ -0,0 +1,61 @@
|
|||
# Embedding Model Switching Guide
|
||||
|
||||
## Understanding Embedding Dimensions
|
||||
|
||||
When using different embedding models (like switching from OpenAI to Ollama models), you may encounter dimension mismatch issues. This happens because different models produce embedding vectors with different dimensions:
|
||||
|
||||
| Model | Dimension |
|
||||
|-------|----------|
|
||||
| OpenAI text-embedding-ada-002 | 1536 |
|
||||
| OpenAI text-embedding-3-small | 1536 |
|
||||
| OpenAI text-embedding-3-large | 3072 |
|
||||
| Ollama snowflake-arctic-embed | 768 |
|
||||
| Ollama nomic-embed-text | 768 |
|
||||
| Ollama mxbai-embed-large | 1024 |
|
||||
|
||||
## Handling Dimension Mismatches
|
||||
|
||||
Second Me now includes automatic detection and handling of embedding dimension mismatches. When you switch between embedding models with different dimensions, the system will:
|
||||
|
||||
1. Detect the dimension of the new embedding model
|
||||
2. Check if the existing ChromaDB collections have a different dimension
|
||||
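3. If a mismatch is detected, automatically reinitialize the collections with the new dimension (the existing `documents` and `document_chunks` collections are deleted and recreated, so the embeddings stored in them are removed)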
|
||||
4. Provide clear error messages and logging information about the process
|
||||
|
||||
## Recommended Workflow for Switching Models
|
||||
|
||||
When switching between embedding models with different dimensions, follow these steps:
|
||||
|
||||
1. Update your embedding model configuration in Settings
|
||||
2. Restart the application to ensure proper initialization
|
||||
3. If you encounter any issues, you can manually reset the vector database:
|
||||
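- Delete the contents of the `data/chroma_db` directory (or of the directory set in the `CHROMA_PERSIST_DIRECTORY` environment variable, if you have overridden it)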
|
||||
- Restart the application
|
||||
|
||||
## Troubleshooting
|
||||
|
||||
The system now automatically handles dimension mismatches when switching between embedding models. You'll see log messages like:
|
||||
|
||||
```
|
||||
Warning: Existing 'documents' collection has dimension X, but current model requires Y
|
||||
Automatically reinitializing ChromaDB collections with the new dimension...
|
||||
Successfully reinitialized ChromaDB collections with the new dimension
|
||||
```
|
||||
|
||||
This indicates that the system has detected and resolved a dimension mismatch automatically. If you still encounter issues after the automatic handling:
|
||||
|
||||
1. Check the application logs for any error messages
|
||||
2. If problems persist, you can manually reset the vector database:
|
||||
- Stop the application
|
||||
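- Delete the contents of the `data/chroma_db` directory (or of the directory set in the `CHROMA_PERSIST_DIRECTORY` environment variable, if you have overridden it)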
|
||||
- Restart the application
|
||||
|
||||
## Technical Details
|
||||
|
||||
The dimension mismatch handling is implemented in:
|
||||
|
||||
- `lpm_kernel/file_data/chroma_utils.py`: Contains utilities for detecting model dimensions and reinitializing collections
|
||||
- `lpm_kernel/file_data/embedding_service.py`: Handles dimension checking during initialization
|
||||
- `docker/app/init_chroma.py`: Performs dimension validation during initial setup
|
||||
|
||||
The system maintains a mapping of known embedding models to their dimensions. For model names it does not recognize, it logs a warning and defaults to 1536, the dimension of OpenAI's `text-embedding-ada-002` and `text-embedding-3-small`.
|
|
@ -0,0 +1,155 @@
|
|||
from typing import Optional, Dict, Any, List, Tuple
|
||||
import os
|
||||
import chromadb
|
||||
import logging
|
||||
from lpm_kernel.configs.logging import get_train_process_logger
|
||||
|
||||
logger = get_train_process_logger()
|
||||
|
||||
|
||||
def get_embedding_dimension(embedding: List[float]) -> int:
    """
    Report how many components an embedding vector contains.

    Args:
        embedding: The embedding vector to measure

    Returns:
        The number of elements in the vector
    """
    component_count = len(embedding)
    return component_count
|
||||
|
||||
|
||||
def detect_embedding_model_dimension(model_name: str) -> int:
    """
    Detect the dimension of an embedding model based on its name

    This is a fallback method when we can't get a sample embedding. The name is
    compared case-insensitively and with surrounding whitespace removed, first
    as an exact match and then as a substring match against the known models.

    Args:
        model_name: The name of the embedding model

    Returns:
        The dimension of the matched model. When no known model matches (or the
        name is not a string), a warning is logged and 1536 is returned; this
        function never returns None.
    """
    # Common embedding model dimensions (keys must stay lowercase, because the
    # incoming name is lowercased before lookup)
    model_dimensions = {
        # OpenAI models
        "text-embedding-ada-002": 1536,
        "text-embedding-3-small": 1536,
        "text-embedding-3-large": 3072,

        # Ollama models
        # NOTE(review): the untagged "snowflake-arctic-embed" name may resolve to
        # a larger variant with a different dimension on Ollama - confirm.
        "snowflake-arctic-embed": 768,
        "snowflake-arctic-embed:110m": 768,
        "nomic-embed-text": 768,
        "nomic-embed-text:v1.5": 768,
        "mxbai-embed-large": 1024,
        "mxbai-embed-large:v1": 1024,
    }

    # Treat a missing / non-string name as "unknown" instead of raising TypeError
    # in the substring checks below.
    normalized_name = model_name.strip().lower() if isinstance(model_name, str) else ""

    # Try to find exact match
    if normalized_name in model_dimensions:
        return model_dimensions[normalized_name]

    # Try to find partial match, preferring the longest (most specific) known
    # name so the result does not depend on dictionary insertion order.
    for known_model in sorted(model_dimensions, key=len, reverse=True):
        if known_model in normalized_name:
            return model_dimensions[known_model]

    # Default to OpenAI dimension if unknown
    logger.warning(f"Unknown embedding model: {model_name}, defaulting to 1536 dimensions")
    return 1536
|
||||
|
||||
|
||||
def reinitialize_chroma_collections(dimension: int = 1536) -> bool:
    """
    Drop and recreate the 'documents' and 'document_chunks' ChromaDB collections

    Each collection is deleted if it exists, created again with cosine space and
    the given "dimension" metadata, and finally both are read back to confirm
    that the stored "dimension" metadata equals the requested value.

    Args:
        dimension: The dimension recorded in the metadata of the new collections

    Returns:
        True if both collections were recreated and verified, False otherwise
        (every failure is logged and reported as False rather than raised)
    """
    collection_names = ("documents", "document_chunks")

    try:
        persist_dir = os.getenv("CHROMA_PERSIST_DIRECTORY", "./data/chroma_db")
        client = chromadb.PersistentClient(path=persist_dir)

        for name in collection_names:
            # Remove the old collection, if there is one
            try:
                try:
                    # NOTE(review): relies on a missing collection surfacing as
                    # ValueError - confirm for the installed chromadb version.
                    client.get_collection(name=name)
                    client.delete_collection(name=name)
                    logger.info(f"Deleted '{name}' collection")
                except ValueError:
                    logger.info(f"'{name}' collection does not exist, will create new")
            except Exception as exc:
                logger.error(f"Error deleting '{name}' collection: {str(exc)}", exc_info=True)
                return False

            # Create it again with the requested dimension
            try:
                client.create_collection(
                    name=name,
                    metadata={"hnsw:space": "cosine", "dimension": dimension},
                )
                logger.info(f"Created '{name}' collection with dimension {dimension}")
            except Exception as exc:
                logger.error(f"Error creating '{name}' collection: {str(exc)}", exc_info=True)
                return False

        # Read both collections back before checking either one
        try:
            recreated = [client.get_collection(name=name) for name in collection_names]

            for name, collection in zip(collection_names, recreated):
                stored_dimension = collection.metadata.get("dimension")
                if stored_dimension != dimension:
                    logger.error(f"Verification failed: '{name}' collection has incorrect dimension: {stored_dimension} vs {dimension}")
                    return False

            logger.info(f"Verification successful: Both collections have correct dimension: {dimension}")
        except Exception as exc:
            logger.error(f"Error verifying collections: {str(exc)}", exc_info=True)
            return False

        return True
    except Exception as exc:
        logger.error(f"Error reinitializing ChromaDB collections: {str(exc)}", exc_info=True)
        return False
|
|
@ -12,19 +12,85 @@ logger = get_train_process_logger()
|
|||
|
||||
class EmbeddingService:
|
||||
def __init__(self):
|
||||
from lpm_kernel.file_data.chroma_utils import detect_embedding_model_dimension
|
||||
from lpm_kernel.api.services.user_llm_config_service import UserLLMConfigService
|
||||
|
||||
chroma_path = os.getenv("CHROMA_PERSIST_DIRECTORY", "./data/chroma_db")
|
||||
self.client = chromadb.PersistentClient(path=chroma_path)
|
||||
self.llm_client = LLMClient()
|
||||
|
||||
# Get embedding model dimension from user config
|
||||
try:
|
||||
user_llm_config_service = UserLLMConfigService()
|
||||
user_llm_config = user_llm_config_service.get_available_llm()
|
||||
|
||||
if user_llm_config and user_llm_config.embedding_model_name:
|
||||
# Detect dimension based on model name
|
||||
self.dimension = detect_embedding_model_dimension(user_llm_config.embedding_model_name)
|
||||
logger.info(f"Detected embedding dimension: {self.dimension} for model: {user_llm_config.embedding_model_name}")
|
||||
else:
|
||||
# Default to OpenAI dimension if no config found
|
||||
self.dimension = 1536
|
||||
logger.info(f"No embedding model configured, using default dimension: {self.dimension}")
|
||||
except Exception as e:
|
||||
# Default to OpenAI dimension if error occurs
|
||||
self.dimension = 1536
|
||||
logger.error(f"Error detecting embedding dimension, using default: {self.dimension}. Error: {str(e)}", exc_info=True)
|
||||
|
||||
# document level collection
|
||||
self.document_collection = self.client.get_or_create_collection(
|
||||
name="documents", metadata={"hnsw:space": "cosine", "dimension": 1536}
|
||||
)
|
||||
# Check for dimension mismatches in all collections first
|
||||
collections_to_init = ["documents", "document_chunks"]
|
||||
dimension_mismatch_detected = False
|
||||
|
||||
# First pass: check all collections for dimension mismatches
|
||||
for collection_name in collections_to_init:
|
||||
try:
|
||||
collection = self.client.get_collection(name=collection_name)
|
||||
if collection.metadata.get("dimension") != self.dimension:
|
||||
logger.warning(f"Dimension mismatch in '{collection_name}' collection: {collection.metadata.get('dimension')} vs {self.dimension}")
|
||||
dimension_mismatch_detected = True
|
||||
except ValueError:
|
||||
# Collection doesn't exist yet, will be created later
|
||||
pass
|
||||
|
||||
# Handle dimension mismatch if detected in any collection
|
||||
if dimension_mismatch_detected:
|
||||
self._handle_dimension_mismatch()
|
||||
|
||||
# Second pass: create or get collections with the correct dimension
|
||||
try:
|
||||
self.document_collection = self.client.get_collection(name="documents")
|
||||
# Verify dimension after possible reinitialization
|
||||
doc_dimension = self.document_collection.metadata.get("dimension")
|
||||
if doc_dimension != self.dimension:
|
||||
logger.error(f"Collection 'documents' still has incorrect dimension after reinitialization: {doc_dimension} vs {self.dimension}")
|
||||
# Try to reinitialize again if dimension is still incorrect
|
||||
raise RuntimeError(f"Failed to set correct dimension for 'documents' collection: {doc_dimension} vs {self.dimension}")
|
||||
except ValueError:
|
||||
# Collection doesn't exist, create it with the correct dimension
|
||||
try:
|
||||
self.document_collection = self.client.create_collection(
|
||||
name="documents", metadata={"hnsw:space": "cosine", "dimension": self.dimension}
|
||||
)
|
||||
logger.info(f"Created 'documents' collection with dimension {self.dimension}")
|
||||
except Exception as e:
|
||||
logger.error(f"Failed to create 'documents' collection: {str(e)}", exc_info=True)
|
||||
raise RuntimeError(f"Failed to create 'documents' collection: {str(e)}")
|
||||
|
||||
# chunk level collection
|
||||
self.chunk_collection = self.client.get_or_create_collection(
|
||||
name="document_chunks", metadata={"hnsw:space": "cosine", "dimension": 1536}
|
||||
)
|
||||
try:
|
||||
self.chunk_collection = self.client.get_collection(name="document_chunks")
|
||||
# Verify dimension after possible reinitialization
|
||||
chunk_dimension = self.chunk_collection.metadata.get("dimension")
|
||||
if chunk_dimension != self.dimension:
|
||||
logger.error(f"Collection 'document_chunks' still has incorrect dimension after reinitialization: {chunk_dimension} vs {self.dimension}")
|
||||
# Try to reinitialize again if dimension is still incorrect
|
||||
raise RuntimeError(f"Failed to set correct dimension for 'document_chunks' collection: {chunk_dimension} vs {self.dimension}")
|
||||
except ValueError:
|
||||
# Collection doesn't exist, create it with the correct dimension
|
||||
try:
|
||||
self.chunk_collection = self.client.create_collection(
|
||||
name="document_chunks", metadata={"hnsw:space": "cosine", "dimension": self.dimension}
|
||||
)
|
||||
logger.info(f"Created 'document_chunks' collection with dimension {self.dimension}")
|
||||
|
||||
def generate_document_embedding(self, document: DocumentDTO) -> List[float]:
|
||||
"""Process document level embedding and store in ChromaDB"""
|
||||
|
@ -233,6 +299,45 @@ class EmbeddingService:
|
|||
)
|
||||
raise
|
||||
|
||||
def _handle_dimension_mismatch(self):
    """
    Handle dimension mismatch between current embedding model and ChromaDB collections

    Calls reinitialize_chroma_collections with self.dimension, then re-fetches
    self.document_collection and self.chunk_collection from self.client and
    checks that their "dimension" metadata equals self.dimension.

    Each failure is logged once and raised once; the steps are kept in separate
    try blocks so a RuntimeError raised here is not caught and re-wrapped by an
    enclosing handler.

    Raises:
        RuntimeError: if reinitialization raises or reports failure, if the
            collections cannot be re-fetched, or if either collection still
            reports a different dimension afterwards.
    """
    from lpm_kernel.file_data.chroma_utils import reinitialize_chroma_collections

    logger.warning(f"Detected dimension mismatch in ChromaDB collections. Reinitializing with dimension {self.dimension}")
    # Log the operation for better debugging
    logger.info(f"Calling reinitialize_chroma_collections with dimension {self.dimension}")

    try:
        success = reinitialize_chroma_collections(self.dimension)
    except Exception as e:
        logger.error(f"Error during dimension mismatch handling: {str(e)}", exc_info=True)
        raise RuntimeError(f"Failed to handle dimension mismatch in ChromaDB collections: {str(e)}") from e

    if not success:
        logger.error("Failed to reinitialize ChromaDB collections")
        raise RuntimeError("Failed to handle dimension mismatch in ChromaDB collections")

    logger.info(f"Successfully reinitialized ChromaDB collections with dimension {self.dimension}")

    # Refresh collection references
    try:
        self.document_collection = self.client.get_collection(name="documents")
        self.chunk_collection = self.client.get_collection(name="document_chunks")

        doc_dimension = self.document_collection.metadata.get("dimension")
        chunk_dimension = self.chunk_collection.metadata.get("dimension")
    except Exception as e:
        logger.error(f"Error refreshing collection references: {str(e)}", exc_info=True)
        raise RuntimeError(f"Failed to refresh ChromaDB collections after reinitialization: {str(e)}") from e

    # Double-check dimensions after refresh
    if doc_dimension != self.dimension or chunk_dimension != self.dimension:
        logger.error(f"Dimension mismatch after refresh: documents={doc_dimension}, chunks={chunk_dimension}, expected={self.dimension}")
        raise RuntimeError("Failed to handle dimension mismatch: collections have incorrect dimensions after reinitialization")
|
||||
|
||||
def search_similar_chunks(
|
||||
self, query: str, limit: int = 5
|
||||
) -> List[Tuple[ChunkDTO, float]]:
|
||||
|
|
Loading…
Reference in New Issue