feat: Add Dimension Mismatch Handling for ChromaDB (#157) (#207)

* Fix Issue #157

Add chroma_utils.py to manage chromaDB and added docs for explanation

* Add logging and debugging process

- Enhanced the `reinitialize_chroma_collections` function in `chroma_utils.py` to properly check if collections exist before attempting to delete them, preventing potential errors when collections don't exist.
- Improved error handling in the `_handle_dimension_mismatch` method in `embedding_service.py` by adding more robust exception handling and verification steps after reinitialization.
- Enhanced the collection initialization process in `embedding_service.py` to provide more detailed error messages and better handle cases where collections still have incorrect dimensions after reinitialization.
- Added additional verification steps to ensure that collection dimensions match the expected dimension after creation or retrieval.
- Improved logging throughout the code to provide more context in error messages, making debugging easier.
This commit is contained in:
Xinghan Pan 2025-04-22 10:27:34 +08:00 committed by GitHub
parent e1ae6f5039
commit 5868f94622
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
4 changed files with 396 additions and 35 deletions

View File

@ -1,5 +1,12 @@
import chromadb
import os
import sys
# Add project root to path to import from lpm_kernel
sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))))
from lpm_kernel.api.services.user_llm_config_service import UserLLMConfigService
from lpm_kernel.file_data.chroma_utils import detect_embedding_model_dimension, reinitialize_chroma_collections
def init_chroma_db():
chroma_path = os.getenv("CHROMA_PERSIST_DIRECTORY", "./data/chroma_db")
@ -7,36 +14,69 @@ def init_chroma_db():
# ensure the directory is correct
os.makedirs(chroma_path, exist_ok=True)
# Get embedding model dimension from user config
try:
user_llm_config_service = UserLLMConfigService()
user_llm_config = user_llm_config_service.get_available_llm()
if user_llm_config and user_llm_config.embedding_model_name:
# Detect dimension based on model name
dimension = detect_embedding_model_dimension(user_llm_config.embedding_model_name)
print(f"Detected embedding dimension: {dimension} for model: {user_llm_config.embedding_model_name}")
else:
# Default to OpenAI dimension if no config found
dimension = 1536
print(f"No embedding model configured, using default dimension: {dimension}")
except Exception as e:
# Default to OpenAI dimension if error occurs
dimension = 1536
print(f"Error detecting embedding dimension, using default: {dimension}. Error: {e}")
try:
client = chromadb.PersistentClient(path=chroma_path)
collections_to_init = ["documents", "document_chunks"]
dimension_mismatch_detected = False
# collection: init documents level
try:
documents_collection = client.get_collection(name="documents")
print(f"Collection 'documents' already exists")
except ValueError:
documents_collection = client.create_collection(
name="documents",
metadata={
"hnsw:space": "cosine",
"dimension": 1536
}
)
print(f"Successfully created collection 'documents'")
# collection: init chunk level
try:
chunks_collection = client.get_collection(name="document_chunks")
print(f"Collection 'document_chunks' already exists")
except ValueError:
chunks_collection = client.create_collection(
name="document_chunks",
metadata={
"hnsw:space": "cosine",
"dimension": 1536
}
)
print(f"Successfully created collection 'document_chunks'")
# Check all collections for dimension mismatches first
for collection_name in collections_to_init:
try:
collection = client.get_collection(name=collection_name)
print(f"Collection '{collection_name}' already exists")
# Check if existing collection has the correct dimension
if collection.metadata.get("dimension") != dimension:
print(f"Warning: Existing '{collection_name}' collection has dimension {collection.metadata.get('dimension')}, but current model requires {dimension}")
dimension_mismatch_detected = True
except ValueError:
# Collection doesn't exist yet, will be created later
pass
# Handle dimension mismatch if detected in any collection
if dimension_mismatch_detected:
print("Automatically reinitializing ChromaDB collections with the new dimension...")
if reinitialize_chroma_collections(dimension):
print("Successfully reinitialized ChromaDB collections with the new dimension")
else:
print("Failed to reinitialize ChromaDB collections, you may need to manually delete the data/chroma_db directory")
# Create or get collections with the correct dimension
for collection_name in collections_to_init:
try:
collection = client.get_collection(name=collection_name)
# Verify dimension after possible reinitialization
if collection.metadata.get("dimension") != dimension:
print(f"Error: Collection '{collection_name}' still has incorrect dimension after reinitialization: {collection.metadata.get('dimension')} vs {dimension}")
except ValueError:
# Create collection if it doesn't exist
collection = client.create_collection(
name=collection_name,
metadata={
"hnsw:space": "cosine",
"dimension": dimension
}
)
print(f"Successfully created collection '{collection_name}' with dimension {dimension}")
print(f"ChromaDB initialized at {chroma_path}")
except Exception as e:

View File

@ -0,0 +1,61 @@
# Embedding Model Switching Guide
## Understanding Embedding Dimensions
When using different embedding models (like switching from OpenAI to Ollama models), you may encounter dimension mismatch issues. This happens because different models produce embedding vectors with different dimensions:
| Model | Dimension |
|-------|----------|
| OpenAI text-embedding-ada-002 | 1536 |
| OpenAI text-embedding-3-small | 1536 |
| OpenAI text-embedding-3-large | 3072 |
| Ollama snowflake-arctic-embed | 768 |
| Ollama nomic-embed-text | 768 |
| Ollama mxbai-embed-large | 1024 |
## Handling Dimension Mismatches
Second Me now includes automatic detection and handling of embedding dimension mismatches. When you switch between embedding models with different dimensions, the system will:
1. Detect the dimension of the new embedding model
2. Check if the existing ChromaDB collections have a different dimension
3. If a mismatch is detected, automatically reinitialize the collections with the new dimension
4. Provide clear error messages and logging information about the process
## Recommended Workflow for Switching Models
When switching between embedding models with different dimensions, follow these steps:
1. Update your embedding model configuration in Settings
2. Restart the application to ensure proper initialization
3. If you encounter any issues, you can manually reset the vector database:
- Delete the contents of the `data/chroma_db` directory
- Restart the application
## Troubleshooting
The system now automatically handles dimension mismatches when switching between embedding models. You'll see log messages like:
```
Warning: Existing 'documents' collection has dimension X, but current model requires Y
Automatically reinitializing ChromaDB collections with the new dimension...
Successfully reinitialized ChromaDB collections with the new dimension
```
This indicates that the system has detected and resolved a dimension mismatch automatically. If you still encounter issues after the automatic handling:
1. Check the application logs for any error messages
2. If problems persist, you can manually reset the vector database:
- Stop the application
- Delete the contents of the `data/chroma_db` directory
- Restart the application
## Technical Details
The dimension mismatch handling is implemented in:
- `lpm_kernel/file_data/chroma_utils.py`: Contains utilities for detecting model dimensions and reinitializing collections
- `lpm_kernel/file_data/embedding_service.py`: Handles dimension checking during initialization
- `docker/app/init_chroma.py`: Performs dimension validation during initial setup
The system maintains a mapping of known embedding models to their dimensions and will default to 1536 (OpenAI's dimension) for unknown models.

View File

@ -0,0 +1,155 @@
from typing import Optional, Dict, Any, List, Tuple
import os
import chromadb
import logging
from lpm_kernel.configs.logging import get_train_process_logger
logger = get_train_process_logger()
def get_embedding_dimension(embedding: List[float]) -> int:
    """
    Return the dimension (number of components) of an embedding vector.

    Args:
        embedding: The embedding vector to measure.

    Returns:
        The length of the vector.
    """
    dimension = len(embedding)
    return dimension
def detect_embedding_model_dimension(model_name: str) -> int:
    """
    Detect the dimension of an embedding model based on its name.

    This is a fallback method used when a sample embedding is not available
    to measure the dimension directly.

    Args:
        model_name: The name of the embedding model.

    Returns:
        The known dimension for the model. Unknown models fall back to 1536
        (OpenAI's dimension); this function never returns None.
    """
    # Common embedding model dimensions
    model_dimensions = {
        # OpenAI models
        "text-embedding-ada-002": 1536,
        "text-embedding-3-small": 1536,
        "text-embedding-3-large": 3072,
        # Ollama models
        "snowflake-arctic-embed": 768,
        "snowflake-arctic-embed:110m": 768,
        "nomic-embed-text": 768,
        "nomic-embed-text:v1.5": 768,
        "mxbai-embed-large": 1024,
        "mxbai-embed-large:v1": 1024,
    }
    # Try to find exact match
    if model_name in model_dimensions:
        return model_dimensions[model_name]
    # Try to find partial match (handles prefixed names like "ollama/nomic-embed-text")
    for model, dimension in model_dimensions.items():
        if model in model_name:
            return dimension
    # Default to OpenAI dimension if unknown
    logger.warning(f"Unknown embedding model: {model_name}, defaulting to 1536 dimensions")
    return 1536
def reinitialize_chroma_collections(dimension: int = 1536) -> bool:
    """
    Reinitialize ChromaDB collections with a new dimension.

    Drops and recreates both the "documents" and "document_chunks"
    collections so their stored embedding dimension matches the currently
    configured embedding model, then verifies the result. The per-collection
    logic is shared via a loop so the two collections cannot drift apart.

    Args:
        dimension: The new dimension for the collections.

    Returns:
        True if successful, False otherwise.
    """
    collection_names = ("documents", "document_chunks")
    try:
        chroma_path = os.getenv("CHROMA_PERSIST_DIRECTORY", "./data/chroma_db")
        client = chromadb.PersistentClient(path=chroma_path)

        for name in collection_names:
            # Delete the collection if it exists.
            try:
                # Check if collection exists before attempting to delete
                try:
                    client.get_collection(name=name)
                    client.delete_collection(name=name)
                    logger.info(f"Deleted '{name}' collection")
                except ValueError:
                    # get_collection raises ValueError when the collection is absent
                    logger.info(f"'{name}' collection does not exist, will create new")
            except Exception as e:
                logger.error(f"Error deleting '{name}' collection: {str(e)}", exc_info=True)
                return False

            # Recreate the collection with the new dimension.
            try:
                client.create_collection(
                    name=name,
                    metadata={
                        "hnsw:space": "cosine",
                        "dimension": dimension,
                    },
                )
                logger.info(f"Created '{name}' collection with dimension {dimension}")
            except Exception as e:
                logger.error(f"Error creating '{name}' collection: {str(e)}", exc_info=True)
                return False

        # Verify collections were created with correct dimension
        try:
            for name in collection_names:
                actual_dimension = client.get_collection(name=name).metadata.get("dimension")
                if actual_dimension != dimension:
                    logger.error(f"Verification failed: '{name}' collection has incorrect dimension: {actual_dimension} vs {dimension}")
                    return False
            logger.info(f"Verification successful: Both collections have correct dimension: {dimension}")
        except Exception as e:
            logger.error(f"Error verifying collections: {str(e)}", exc_info=True)
            return False

        return True
    except Exception as e:
        logger.error(f"Error reinitializing ChromaDB collections: {str(e)}", exc_info=True)
        return False

View File

@ -12,19 +12,85 @@ logger = get_train_process_logger()
class EmbeddingService:
def __init__(self):
from lpm_kernel.file_data.chroma_utils import detect_embedding_model_dimension
from lpm_kernel.api.services.user_llm_config_service import UserLLMConfigService
chroma_path = os.getenv("CHROMA_PERSIST_DIRECTORY", "./data/chroma_db")
self.client = chromadb.PersistentClient(path=chroma_path)
self.llm_client = LLMClient()
# Get embedding model dimension from user config
try:
user_llm_config_service = UserLLMConfigService()
user_llm_config = user_llm_config_service.get_available_llm()
if user_llm_config and user_llm_config.embedding_model_name:
# Detect dimension based on model name
self.dimension = detect_embedding_model_dimension(user_llm_config.embedding_model_name)
logger.info(f"Detected embedding dimension: {self.dimension} for model: {user_llm_config.embedding_model_name}")
else:
# Default to OpenAI dimension if no config found
self.dimension = 1536
logger.info(f"No embedding model configured, using default dimension: {self.dimension}")
except Exception as e:
# Default to OpenAI dimension if error occurs
self.dimension = 1536
logger.error(f"Error detecting embedding dimension, using default: {self.dimension}. Error: {str(e)}", exc_info=True)
# document level collection
self.document_collection = self.client.get_or_create_collection(
name="documents", metadata={"hnsw:space": "cosine", "dimension": 1536}
)
# Check for dimension mismatches in all collections first
collections_to_init = ["documents", "document_chunks"]
dimension_mismatch_detected = False
# First pass: check all collections for dimension mismatches
for collection_name in collections_to_init:
try:
collection = self.client.get_collection(name=collection_name)
if collection.metadata.get("dimension") != self.dimension:
logger.warning(f"Dimension mismatch in '{collection_name}' collection: {collection.metadata.get('dimension')} vs {self.dimension}")
dimension_mismatch_detected = True
except ValueError:
# Collection doesn't exist yet, will be created later
pass
# Handle dimension mismatch if detected in any collection
if dimension_mismatch_detected:
self._handle_dimension_mismatch()
# Second pass: create or get collections with the correct dimension
try:
self.document_collection = self.client.get_collection(name="documents")
# Verify dimension after possible reinitialization
doc_dimension = self.document_collection.metadata.get("dimension")
if doc_dimension != self.dimension:
logger.error(f"Collection 'documents' still has incorrect dimension after reinitialization: {doc_dimension} vs {self.dimension}")
# Try to reinitialize again if dimension is still incorrect
raise RuntimeError(f"Failed to set correct dimension for 'documents' collection: {doc_dimension} vs {self.dimension}")
except ValueError:
# Collection doesn't exist, create it with the correct dimension
try:
self.document_collection = self.client.create_collection(
name="documents", metadata={"hnsw:space": "cosine", "dimension": self.dimension}
)
logger.info(f"Created 'documents' collection with dimension {self.dimension}")
except Exception as e:
logger.error(f"Failed to create 'documents' collection: {str(e)}", exc_info=True)
raise RuntimeError(f"Failed to create 'documents' collection: {str(e)}")
# chunk level collection
self.chunk_collection = self.client.get_or_create_collection(
name="document_chunks", metadata={"hnsw:space": "cosine", "dimension": 1536}
)
try:
self.chunk_collection = self.client.get_collection(name="document_chunks")
# Verify dimension after possible reinitialization
chunk_dimension = self.chunk_collection.metadata.get("dimension")
if chunk_dimension != self.dimension:
logger.error(f"Collection 'document_chunks' still has incorrect dimension after reinitialization: {chunk_dimension} vs {self.dimension}")
# Try to reinitialize again if dimension is still incorrect
raise RuntimeError(f"Failed to set correct dimension for 'document_chunks' collection: {chunk_dimension} vs {self.dimension}")
except ValueError:
# Collection doesn't exist, create it with the correct dimension
try:
self.chunk_collection = self.client.create_collection(
name="document_chunks", metadata={"hnsw:space": "cosine", "dimension": self.dimension}
)
logger.info(f"Created 'document_chunks' collection with dimension {self.dimension}")
def generate_document_embedding(self, document: DocumentDTO) -> List[float]:
"""Process document level embedding and store in ChromaDB"""
@ -233,6 +299,45 @@ class EmbeddingService:
)
raise
def _handle_dimension_mismatch(self):
    """
    Handle a dimension mismatch between the current embedding model and the
    ChromaDB collections.

    Reinitializes both collections ("documents" and "document_chunks") with
    ``self.dimension``, refreshes the cached collection references, and
    verifies the resulting dimensions.

    Raises:
        RuntimeError: If reinitialization fails, the refreshed collections
            cannot be fetched, or their dimensions are still incorrect.
    """
    from lpm_kernel.file_data.chroma_utils import reinitialize_chroma_collections

    logger.warning(f"Detected dimension mismatch in ChromaDB collections. Reinitializing with dimension {self.dimension}")
    # Log the operation for better debugging
    logger.info(f"Calling reinitialize_chroma_collections with dimension {self.dimension}")
    try:
        if not reinitialize_chroma_collections(self.dimension):
            logger.error("Failed to reinitialize ChromaDB collections")
            raise RuntimeError("Failed to handle dimension mismatch in ChromaDB collections")
        logger.info(f"Successfully reinitialized ChromaDB collections with dimension {self.dimension}")

        # Refresh cached collection references so later calls use the new collections.
        try:
            self.document_collection = self.client.get_collection(name="documents")
            self.chunk_collection = self.client.get_collection(name="document_chunks")
        except Exception as e:
            logger.error(f"Error refreshing collection references: {str(e)}", exc_info=True)
            raise RuntimeError(f"Failed to refresh ChromaDB collections after reinitialization: {str(e)}") from e

        # Double-check dimensions after refresh.
        doc_dimension = self.document_collection.metadata.get("dimension")
        chunk_dimension = self.chunk_collection.metadata.get("dimension")
        if doc_dimension != self.dimension or chunk_dimension != self.dimension:
            logger.error(f"Dimension mismatch after refresh: documents={doc_dimension}, chunks={chunk_dimension}, expected={self.dimension}")
            raise RuntimeError(f"Failed to handle dimension mismatch: collections have incorrect dimensions after reinitialization")
    except RuntimeError:
        # Already one of our own descriptive errors — re-raise as-is instead of
        # double-wrapping it (the original code nested the messages twice).
        raise
    except Exception as e:
        logger.error(f"Error during dimension mismatch handling: {str(e)}", exc_info=True)
        raise RuntimeError(f"Failed to handle dimension mismatch in ChromaDB collections: {str(e)}") from e
def search_similar_chunks(
self, query: str, limit: int = 5
) -> List[Tuple[ChunkDTO, float]]: