Added CUDA support (#228)

* Add CUDA support

- CUDA detection (see the sketch after this list)
- Memory handling
- Ollama model release after training
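A minimal sketch of the CUDA detection described above, assuming PyTorch is installed; detect_cuda is a hypothetical helper, and the info fields mirror the cuda_info shape the frontend expects later in this diff.

import torch

def detect_cuda():
    """Return (available, info) describing the CUDA devices PyTorch can see."""
    if not torch.cuda.is_available():
        return False, {}
    return True, {
        "device_count": torch.cuda.device_count(),
        "current_device": torch.cuda.current_device(),
        "device_name": torch.cuda.get_device_name(0),
    }

available, info = detect_cuda()
print("CUDA available:", available, info)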

* Fix logging issue

Added a CUDA support flag so the log accurately reflects the CUDA toggle.

* Update llama.cpp rebuild

Changed the llama.cpp handling to check whether CUDA support is enabled and, if so, rebuild only during the first build rather than on every run.

* Improved VRAM management

Enabled memory pinning and optimizer-state offload.
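A rough illustration of the memory-pinning part in plain PyTorch (the optimizer-state offload itself is configured through the Accelerate settings visible further down in the training diff); make_loader is a hypothetical helper, not code from this commit.

import torch
from torch.utils.data import DataLoader, TensorDataset

def make_loader(dataset: TensorDataset, batch_size: int = 8) -> DataLoader:
    # pin_memory=True keeps host batches in page-locked RAM so that
    # .to(device, non_blocking=True) can overlap the copy with compute.
    return DataLoader(dataset, batch_size=batch_size, shuffle=True,
                      pin_memory=torch.cuda.is_available())

device = "cuda" if torch.cuda.is_available() else "cpu"
loader = make_loader(TensorDataset(torch.randn(32, 4), torch.zeros(32)))
for xb, yb in loader:
    xb = xb.to(device, non_blocking=True)  # asynchronous host-to-device copy when pinned
    yb = yb.to(device, non_blocking=True)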

* Fix CUDA check

Rewrote the llama.cpp rebuild logic and added a manual y/n prompt so the user can choose whether to enable CUDA support.

* Added fast restart and fixed CUDA check command

Added make docker-restart-backend-fast to restart the backend and pick up code changes without triggering a full llama.cpp rebuild.

Fixed the make docker-check-cuda command to correctly report CUDA support.

* Added docker-compose.gpu.yml

Added docker-compose.gpu.yml to fix an error on machines without an NVIDIA GPU, and made sure "\n" is added before the .env modification.

* Fixed CUDA toggle

The previous push accidentally broke the CUDA toggle.

* Code review fixes

Fixed errors resulting from removed code:
- Added return save_path to the end of the save_hf_model function
- Rolled back the download_file_with_progress function

* Update Makefile

Use CUDA by default when running docker-restart-backend-fast.

* Minor cleanup

Removed an unnecessary Makefile command and fixed GPU logging.

* Delete .gpu_selected

* Simplified CUDA training code

- Removed the dtype setting to let torch handle it automatically
- Removed VRAM logging
- Removed unnecessary/old comments

* Fixed GPU/CPU selection

Made the "make docker-use-gpu/cpu" commands work with the .gpu_selected flag, and changed "make docker-restart-backend-fast" to respect the flag instead of always using the GPU.

* Fix Ollama embedding error

Added a custom exception class for Ollama embeddings, which appeared to pass keyword arguments while the base Python exception class only accepts positional ones.
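A minimal sketch of the kind of wrapper this describes; the class name and fields are hypothetical and may differ from the actual fix.

class OllamaEmbeddingError(Exception):
    """Exception that tolerates the keyword arguments Ollama passes back."""

    def __init__(self, message: str = "Ollama embedding request failed", **kwargs):
        # The base Exception only accepts positional args, so stash the extras.
        super().__init__(message)
        self.details = kwargs

try:
    raise OllamaEmbeddingError("embedding failed", status_code=500, model="nomic-embed-text")
except OllamaEmbeddingError as e:
    print(e, e.details)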

* Fixed model selection & memory error

Fixed training defaulting to the 0.5B model regardless of selection, and fixed the "free(): double free detected in tcache 2" error caused by the CUDA flag being passed incorrectly.
Author: Zachary Pitroda, 2025-04-24 22:20:36 -04:00 (committed by GitHub)
parent 71d54a5b0b
commit 053090937d
36 changed files with 2375 additions and 507 deletions

.env

@ -65,3 +65,5 @@ DOCUMENT_CHUNK_OVERLAP=200
# Embedding configurations
EMBEDDING_MAX_TEXT_LENGTH=3072
DOCKER_BACKEND_DOCKERFILE=Dockerfile.backend.cuda

.gitignore

@ -75,3 +75,8 @@ run/*
.backend.pid
.frontend.pid
logs/train/
llama_cpp_backup/llama.cpp.zip
scripts/check_cuda_status.ps1
scripts/test_cuda_detection.bat
.env
.gpu_selected

Dockerfile.backend

@ -1,3 +1,6 @@
# Base image selection is handled by the Makefile or build script
# For CUDA support: FROM nvidia/cuda:12.8.1-devel-ubuntu22.04
# For CPU-only: FROM python:3.12
FROM python:3.12
# Set working directory
@ -19,28 +22,31 @@ RUN mkdir -p /app/dependencies /app/data/sqlite /app/data/chroma_db /app/logs /a
COPY dependencies/graphrag-1.2.1.dev27.tar.gz /app/dependencies/
COPY dependencies/llama.cpp.zip /app/dependencies/
# Build llama.cpp
# Copy GPU checker script (only used for status reporting, not rebuilding)
COPY docker/app/check_gpu_support.sh /app/
COPY docker/app/check_torch_cuda.py /app/
RUN chmod +x /app/check_gpu_support.sh
# Unpack prebuilt llama.cpp CPU binary (no build or GPU detection here)
RUN LLAMA_LOCAL_ZIP="dependencies/llama.cpp.zip" \
&& echo "Using local llama.cpp archive..." \
&& unzip -q "$LLAMA_LOCAL_ZIP" \
&& cd llama.cpp \
&& mkdir -p build && cd build \
&& cmake .. \
&& cmake --build . --config Release \
&& if [ ! -f "bin/llama-server" ]; then \
echo "Build failed: llama-server executable not found" && exit 1; \
else \
echo "Successfully built llama-server"; \
fi
&& mkdir -p build \
&& cp -r bin build/ 2>/dev/null || true \
&& chmod +x /app/llama.cpp/build/bin/llama-server /app/llama.cpp/build/bin/llama-cli 2>/dev/null || true
# Mark as CPU-only build for runtime reference
RUN mkdir -p /app/data && \
echo "{ \"gpu_optimized\": false, \"optimized_on\": \"$(date -u +\"%Y-%m-%dT%H:%M:%SZ\")\" }" > /app/data/gpu_optimized.json && \
echo "Created CPU-only marker file"
#
# Copy project configuration - Files that occasionally change
COPY pyproject.toml README.md /app/
RUN poetry install --no-interaction --no-root
RUN pip install --force-reinstall dependencies/graphrag-1.2.1.dev27.tar.gz
# Copy source code - Files that frequently change
COPY docker/ /app/docker/
COPY lpm_kernel/ /app/lpm_kernel/
@ -61,5 +67,5 @@ ENV PYTHONUNBUFFERED=1 \
# Expose ports
EXPOSE 8002 8080
# Set the startup command
CMD ["bash", "-c", "echo \"Checking SQLite database...\" && if [ ! -s /app/data/sqlite/lpm.db ]; then echo \"SQLite database not found or empty, initializing...\" && mkdir -p /app/data/sqlite && sqlite3 /app/data/sqlite/lpm.db \".read /app/docker/sqlite/init.sql\" && echo \"SQLite database initialized successfully\" && echo \"Tables created:\" && sqlite3 /app/data/sqlite/lpm.db \".tables\"; else echo \"SQLite database already exists, skipping initialization\"; fi && echo \"Checking ChromaDB...\" && if [ ! -d /app/data/chroma_db/documents ] || [ ! -d /app/data/chroma_db/document_chunks ]; then echo \"ChromaDB collections not found, initializing...\" && python /app/docker/app/init_chroma.py && echo \"ChromaDB initialized successfully\"; else echo \"ChromaDB already exists, skipping initialization\"; fi && echo \"Starting application at $(date)\" >> /app/logs/backend.log && cd /app && python -m flask run --host=0.0.0.0 --port=${LOCAL_APP_PORT:-8002} >> /app/logs/backend.log 2>&1"]
# Set the startup command - CUDA check/rebuild removed since it's now handled at build time
CMD ["bash", "-c", "echo 'Checking SQLite database...' && if [ ! -s /app/data/sqlite/lpm.db ]; then echo 'SQLite database not found or empty, initializing...' && mkdir -p /app/data/sqlite && sqlite3 /app/data/sqlite/lpm.db '.read /app/docker/sqlite/init.sql' && echo 'SQLite database initialized successfully' && echo 'Tables created:' && sqlite3 /app/data/sqlite/lpm.db '.tables'; else echo 'SQLite database already exists, skipping initialization'; fi && echo 'Checking ChromaDB...' && if [ ! -d /app/data/chroma_db/documents ] || [ ! -d /app/data/chroma_db/document_chunks ]; then echo 'ChromaDB collections not found, initializing...' && python /app/docker/app/init_chroma.py && echo 'ChromaDB initialized successfully'; else echo 'ChromaDB already exists, skipping initialization'; fi && echo 'Starting application at ' $(date) >> /app/logs/backend.log && cd /app && python -m flask run --host=0.0.0.0 --port=${LOCAL_APP_PORT:-8002} >> /app/logs/backend.log 2>&1"]

Dockerfile.backend.cuda (new file)

@ -0,0 +1,111 @@
FROM nvidia/cuda:12.8.1-devel-ubuntu24.04
# Set working directory
WORKDIR /app
# Add build argument to conditionally skip llama.cpp build
ARG SKIP_LLAMA_BUILD=false
# Install system dependencies with noninteractive mode to avoid prompts
RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y \
build-essential cmake git curl wget lsof vim unzip sqlite3 \
python3-pip python3-venv python3-full python3-poetry pipx \
&& apt-get clean \
&& rm -rf /var/lib/apt/lists/* \
&& ln -sf /usr/bin/python3 /usr/bin/python
# Create a virtual environment to avoid PEP 668 restrictions
RUN python -m venv /app/venv
ENV PATH="/app/venv/bin:$PATH"
ENV VIRTUAL_ENV="/app/venv"
# Use the virtual environment's pip to install packages
RUN pip install --upgrade pip \
&& pip install poetry \
&& poetry config virtualenvs.create false
# Create directories
RUN mkdir -p /app/dependencies /app/data/sqlite /app/data/chroma_db /app/logs /app/run /app/resources
# Copy dependency files - Files that rarely change
COPY dependencies/graphrag-1.2.1.dev27.tar.gz /app/dependencies/
COPY dependencies/llama.cpp.zip /app/dependencies/
# Copy GPU checker script
COPY docker/app/check_gpu_support.sh /app/
COPY docker/app/check_torch_cuda.py /app/
RUN chmod +x /app/check_gpu_support.sh
# Unpack llama.cpp and build with CUDA support (conditionally, based on SKIP_LLAMA_BUILD)
RUN if [ "$SKIP_LLAMA_BUILD" = "false" ]; then \
echo "=====================================================================" && \
echo "STARTING LLAMA.CPP BUILD WITH CUDA SUPPORT - THIS WILL TAKE SOME TIME" && \
echo "=====================================================================" && \
LLAMA_LOCAL_ZIP="dependencies/llama.cpp.zip" && \
echo "Using local llama.cpp archive..." && \
unzip -q "$LLAMA_LOCAL_ZIP" && \
cd llama.cpp && \
mkdir -p build && \
cd build && \
echo "Starting CMake configuration with CUDA support..." && \
cmake -DGGML_CUDA=ON \
-DCMAKE_BUILD_TYPE=Release \
-DBUILD_SHARED_LIBS=OFF \
-DLLAMA_NATIVE=OFF \
-DCMAKE_CUDA_FLAGS="-Wno-deprecated-gpu-targets" \
.. && \
echo "Starting build process (this will take several minutes)..." && \
cmake --build . --config Release -j --verbose && \
echo "Build completed successfully" && \
chmod +x /app/llama.cpp/build/bin/llama-server /app/llama.cpp/build/bin/llama-cli && \
echo "====================================================================" && \
echo "CUDA BUILD COMPLETED SUCCESSFULLY! GPU ACCELERATION IS NOW AVAILABLE" && \
echo "===================================================================="; \
else \
echo "=====================================================================" && \
echo "SKIPPING LLAMA.CPP BUILD (SKIP_LLAMA_BUILD=$SKIP_LLAMA_BUILD)" && \
echo "Using existing llama.cpp build from Docker volume" && \
echo "=====================================================================" && \
LLAMA_LOCAL_ZIP="dependencies/llama.cpp.zip" && \
echo "Just unpacking llama.cpp archive (no build)..." && \
unzip -q "$LLAMA_LOCAL_ZIP" && \
cd llama.cpp && \
mkdir -p build; \
fi
# Mark as GPU-optimized build for runtime reference
RUN mkdir -p /app/data && \
echo "{ \"gpu_optimized\": true, \"optimized_on\": \"$(date -u +\"%Y-%m-%dT%H:%M:%SZ\")\" }" > /app/data/gpu_optimized.json && \
echo "Created GPU-optimized marker file"
# Copy project configuration - Files that occasionally change
COPY pyproject.toml README.md /app/
# Fix for potential package installation issues with Poetry
RUN pip install --upgrade setuptools wheel
RUN poetry install --no-interaction --no-root || poetry install --no-interaction --no-root --without dev
RUN pip install --force-reinstall dependencies/graphrag-1.2.1.dev27.tar.gz
# Copy source code - Files that frequently change
COPY docker/ /app/docker/
COPY lpm_kernel/ /app/lpm_kernel/
# Check module import
RUN python -c "import lpm_kernel; print('Module import check passed')"
# Set environment variables
ENV PYTHONUNBUFFERED=1 \
PYTHONPATH=/app \
BASE_DIR=/app/data \
LOCAL_LOG_DIR=/app/logs \
RUN_DIR=/app/run \
RESOURCES_DIR=/app/resources \
APP_ROOT=/app \
FLASK_APP=lpm_kernel.app \
LD_LIBRARY_PATH=/usr/local/cuda/lib64:$LD_LIBRARY_PATH
# Expose ports
EXPOSE 8002 8080
# Set the startup command
CMD ["bash", "-c", "echo 'Checking SQLite database...' && if [ ! -s /app/data/sqlite/lpm.db ]; then echo 'SQLite database not found or empty, initializing...' && mkdir -p /app/data/sqlite && sqlite3 /app/data/sqlite/lpm.db '.read /app/docker/sqlite/init.sql' && echo 'SQLite database initialized successfully' && echo 'Tables created:' && sqlite3 /app/data/sqlite/lpm.db '.tables'; else echo 'SQLite database already exists, skipping initialization'; fi && echo 'Checking ChromaDB...' && if [ ! -d /app/data/chroma_db/documents ] || [ ! -d /app/data/chroma_db/document_chunks ]; then echo 'ChromaDB collections not found, initializing...' && python /app/docker/app/init_chroma.py && echo 'ChromaDB initialized successfully'; else echo 'ChromaDB already exists, skipping initialization'; fi && echo 'Starting application at ' $(date) >> /app/logs/backend.log && cd /app && python -m flask run --host=0.0.0.0 --port=${LOCAL_APP_PORT:-8002} >> /app/logs/backend.log 2>&1"]

Makefile

@ -1,4 +1,13 @@
.PHONY: install test format lint all setup start stop restart restart-backend restart-force help docker-build docker-up docker-down docker-build-backend docker-build-frontend docker-restart-backend docker-restart-frontend docker-restart-all
.PHONY: install test format lint all setup start stop restart restart-backend restart-force help docker-build docker-up docker-down docker-build-backend docker-build-frontend docker-restart-backend docker-restart-backend-fast docker-restart-backend-smart docker-restart-frontend docker-restart-all docker-check-cuda docker-use-gpu docker-use-cpu
# Check for GPU flag file and set Docker Compose file accordingly
ifeq ($(wildcard .gpu_selected),)
# No GPU flag file found, use CPU configuration
DOCKER_COMPOSE_FILE := docker-compose.yml
else
# GPU flag file found, use GPU configuration
DOCKER_COMPOSE_FILE := docker-compose-gpu.yml
endif
# Detect operating system and set environment
ifeq ($(OS),Windows_NT)
@ -39,6 +48,9 @@ else
COLOR_RED := \033[1;31m
endif
# Default Docker Compose configuration (non-GPU)
DOCKER_COMPOSE_FILE := docker-compose.yml
# Show help message
help:
ifeq ($(WINDOWS),1)
@ -69,8 +81,12 @@ ifeq ($(WINDOWS),1)
@echo make docker-build-backend - Build only backend Docker image
@echo make docker-build-frontend - Build only frontend Docker image
@echo make docker-restart-backend - Restart only backend container
@echo make docker-restart-backend-fast - Restart backend+cuda without rebuilding llama.cpp
@echo make docker-restart-frontend - Restart only frontend container
@echo make docker-restart-all - Restart all Docker containers
@echo make docker-check-cuda - Check CUDA support in containers
@echo make docker-use-gpu - Switch to GPU configuration
@echo make docker-use-cpu - Switch to CPU-only configuration
@echo.
@echo All Available Commands:
@echo make help - Show this help message
@ -106,9 +122,13 @@ else
@echo " make docker-down - Stop all Docker containers"
@echo " make docker-build-backend - Build only backend Docker image"
@echo " make docker-build-frontend - Build only frontend Docker image"
@echo " make docker-restart-backend - Restart only backend container"
@echo " make docker-restart-backend - Restart only backend container (with rebuild)"
@echo " make docker-restart-backend-fast - Restart backend+cuda without rebuilding llama.cpp"
@echo " make docker-restart-frontend - Restart only frontend container"
@echo " make docker-restart-all - Restart all Docker containers"
@echo " make docker-check-cuda - Check CUDA support in containers"
@echo " make docker-use-gpu - Switch to GPU configuration"
@echo " make docker-use-cpu - Switch to CPU-only configuration"
@echo ""
@echo "$(COLOR_BOLD)All Available Commands:$(COLOR_RESET)"
@echo " make help - Show this help message"
@ -124,6 +144,27 @@ else
fi
endif
# Configuration switchers for Docker
docker-use-gpu:
@echo "Switching to GPU configuration..."
ifeq ($(WINDOWS),1)
@echo GPU mode enabled. Docker commands will use docker-compose-gpu.yml
@echo gpu > .gpu_selected
else
@echo "$(COLOR_GREEN)GPU mode enabled. Docker commands will use docker-compose-gpu.yml$(COLOR_RESET)"
@echo "gpu" > .gpu_selected
endif
docker-use-cpu:
@echo "Switching to CPU-only configuration..."
ifeq ($(WINDOWS),1)
@echo CPU-only mode enabled. Docker commands will use docker-compose.yml
@rm -f .gpu_selected
else
@echo "$(COLOR_GREEN)CPU-only mode enabled. Docker commands will use docker-compose.yml$(COLOR_RESET)"
@rm -f .gpu_selected
endif
setup:
./scripts/setup.sh
@ -156,37 +197,99 @@ DOCKER_COMPOSE_CMD := $(shell if command -v docker-compose >/dev/null 2>&1; then
endif
docker-build:
$(DOCKER_COMPOSE_CMD) build
ifeq ($(WINDOWS),1)
@echo "Prompting for CUDA preference..."
@scripts\prompt_cuda.bat
else
@echo "Prompting for CUDA preference..."
@chmod +x ./scripts/prompt_cuda.sh
@./scripts/prompt_cuda.sh
endif
$(DOCKER_COMPOSE_CMD) -f $(DOCKER_COMPOSE_FILE) build
docker-up:
$(DOCKER_COMPOSE_CMD) up -d
@echo "Building and starting Docker containers..."
ifeq ($(WINDOWS),1)
@echo "Prompting for CUDA preference..."
@scripts\prompt_cuda.bat
@echo "Checking CUDA preference..."
@cmd /c "if exist .gpu_selected ( echo CUDA support detected, using GPU configuration... & docker compose -f docker-compose-gpu.yml build --no-cache & docker compose -f docker-compose-gpu.yml up -d ) else ( echo No CUDA support selected, using CPU-only configuration... & docker compose -f docker-compose.yml build --no-cache & docker compose -f docker-compose.yml up -d )"
else
@echo "Prompting for CUDA preference..."
@chmod +x ./scripts/prompt_cuda.sh
@./scripts/prompt_cuda.sh
@echo "Checking CUDA preference..."
@if [ -f .gpu_selected ]; then \
echo "CUDA support detected, using GPU configuration..."; \
$(DOCKER_COMPOSE_CMD) -f docker-compose-gpu.yml build; \
$(DOCKER_COMPOSE_CMD) -f docker-compose-gpu.yml up -d; \
else \
echo "No CUDA support selected, using CPU-only configuration..."; \
$(DOCKER_COMPOSE_CMD) -f docker-compose.yml build; \
$(DOCKER_COMPOSE_CMD) -f docker-compose.yml up -d; \
fi
endif
@echo "Container startup complete"
@echo "Check CUDA support with: make docker-check-cuda"
docker-down:
$(DOCKER_COMPOSE_CMD) down
$(DOCKER_COMPOSE_CMD) -f $(DOCKER_COMPOSE_FILE) down
docker-build-backend:
$(DOCKER_COMPOSE_CMD) build backend
$(DOCKER_COMPOSE_CMD) -f $(DOCKER_COMPOSE_FILE) build backend
docker-build-frontend:
$(DOCKER_COMPOSE_CMD) build frontend
$(DOCKER_COMPOSE_CMD) -f $(DOCKER_COMPOSE_FILE) build frontend
# Standard backend restart with complete rebuild
docker-restart-backend:
$(DOCKER_COMPOSE_CMD) stop backend
$(DOCKER_COMPOSE_CMD) rm -f backend
$(DOCKER_COMPOSE_CMD) build backend || { echo "$(COLOR_RED)❌ Backend build failed! Aborting operation...$(COLOR_RESET)"; exit 1; }
$(DOCKER_COMPOSE_CMD) up -d backend
$(DOCKER_COMPOSE_CMD) -f $(DOCKER_COMPOSE_FILE) stop backend
$(DOCKER_COMPOSE_CMD) -f $(DOCKER_COMPOSE_FILE) rm -f backend
$(DOCKER_COMPOSE_CMD) -f $(DOCKER_COMPOSE_FILE) build backend || { echo "$(COLOR_RED)❌ Backend build failed! Aborting operation...$(COLOR_RESET)"; exit 1; }
$(DOCKER_COMPOSE_CMD) -f $(DOCKER_COMPOSE_FILE) up -d backend
# Fast backend restart: preserves llama.cpp build
docker-restart-backend-fast:
@echo "Smart restarting backend container (preserving llama.cpp build)..."
@echo "Stopping backend container..."
$(DOCKER_COMPOSE_CMD) -f $(DOCKER_COMPOSE_FILE) stop backend
@echo "Removing backend container..."
$(DOCKER_COMPOSE_CMD) -f $(DOCKER_COMPOSE_FILE) rm -f backend
@echo "Building backend image with build-arg to skip llama.cpp build..."
ifeq ($(wildcard .gpu_selected),)
@echo "Using CPU configuration (docker-compose.yml)..."
else
@echo "Using GPU configuration (docker-compose-gpu.yml)..."
endif
$(DOCKER_COMPOSE_CMD) -f $(DOCKER_COMPOSE_FILE) build --build-arg SKIP_LLAMA_BUILD=true backend || { echo "$(COLOR_RED)❌ Backend build failed! Aborting operation...$(COLOR_RESET)"; exit 1; }
@echo "Starting backend container..."
$(DOCKER_COMPOSE_CMD) -f $(DOCKER_COMPOSE_FILE) up -d backend
@echo "Backend container smart-restarted successfully"
@echo "Check CUDA support with: make docker-check-cuda"
docker-restart-frontend:
$(DOCKER_COMPOSE_CMD) stop frontend
$(DOCKER_COMPOSE_CMD) rm -f frontend
$(DOCKER_COMPOSE_CMD) build frontend || { echo "$(COLOR_RED)❌ Frontend build failed! Aborting operation...$(COLOR_RESET)"; exit 1; }
$(DOCKER_COMPOSE_CMD) up -d frontend
$(DOCKER_COMPOSE_CMD) -f $(DOCKER_COMPOSE_FILE) stop frontend
$(DOCKER_COMPOSE_CMD) -f $(DOCKER_COMPOSE_FILE) rm -f frontend
$(DOCKER_COMPOSE_CMD) -f $(DOCKER_COMPOSE_FILE) build frontend || { echo "$(COLOR_RED)❌ Frontend build failed! Aborting operation...$(COLOR_RESET)"; exit 1; }
$(DOCKER_COMPOSE_CMD) -f $(DOCKER_COMPOSE_FILE) up -d frontend
docker-restart-all:
$(DOCKER_COMPOSE_CMD) stop
$(DOCKER_COMPOSE_CMD) rm -f
$(DOCKER_COMPOSE_CMD) build || { echo "$(COLOR_RED)❌ Build failed! Aborting operation...$(COLOR_RESET)"; exit 1; }
$(DOCKER_COMPOSE_CMD) up -d
$(DOCKER_COMPOSE_CMD) -f $(DOCKER_COMPOSE_FILE) stop
$(DOCKER_COMPOSE_CMD) -f $(DOCKER_COMPOSE_FILE) rm -f
$(DOCKER_COMPOSE_CMD) -f $(DOCKER_COMPOSE_FILE) build || { echo "$(COLOR_RED)❌ Build failed! Aborting operation...$(COLOR_RESET)"; exit 1; }
$(DOCKER_COMPOSE_CMD) -f $(DOCKER_COMPOSE_FILE) up -d
# New command to check CUDA support in containers
docker-check-cuda:
@echo "Checking CUDA support in Docker containers..."
ifeq ($(WINDOWS),1)
@echo Running CUDA support check in backend container
@docker exec second-me-backend /app/check_gpu_support.sh || echo No GPU support detected in backend container
else
@echo "$(COLOR_CYAN)Running CUDA support check in backend container:$(COLOR_RESET)"
@docker exec second-me-backend /app/check_gpu_support.sh || echo "$(COLOR_RED)No GPU support detected in backend container$(COLOR_RESET)"
endif
install:
poetry install

docker-compose-gpu.yml (new file)

@ -0,0 +1,74 @@
services:
backend:
build:
context: .
dockerfile: ${DOCKER_BACKEND_DOCKERFILE:-Dockerfile.backend.cuda}
container_name: second-me-backend
restart: unless-stopped
ports:
- "8002:8002"
- "8080:8080"
volumes:
- ./data:/app/data
- ./logs:/app/logs
- ./run:/app/run
- ./resources:/app/resources
- ./docker:/app/docker
- ./.env:/app/.env
- llama-cpp-build:/app/llama.cpp/build # Persist the llama.cpp build
environment:
# Environment variables
- LOCAL_APP_PORT=8002
- IN_DOCKER_ENV=1
- PLATFORM=${PLATFORM:-linux}
- USE_CUDA=1
extra_hosts:
- "host.docker.internal:host-gateway"
deploy:
resources:
limits:
# Set container memory limit to 64GB
memory: 64G
reservations:
# Memory reservation
memory: 6G
devices:
- driver: nvidia
count: all
capabilities: [gpu]
networks:
- second-me-network
frontend:
build:
context: .
dockerfile: Dockerfile.frontend
container_name: second-me-frontend
restart: unless-stopped
ports:
- "3000:3000"
volumes:
- ./logs:/app/logs
- ./resources:/app/resources
environment:
- VITE_API_BASE_URL=http://backend:8002
depends_on:
- backend
deploy:
resources:
limits:
# Set container memory limit to 2GB
memory: 2G
reservations:
# Memory reservation
memory: 1G
networks:
- second-me-network
networks:
second-me-network:
driver: bridge
volumes:
llama-cpp-build:
driver: local

docker-compose.yml

@ -15,6 +15,7 @@ services:
- ./resources:/app/resources
- ./docker:/app/docker
- ./.env:/app/.env
- llama-cpp-build:/app/llama.cpp/build # Persist the llama.cpp build
environment:
# Environment variables
- LOCAL_APP_PORT=8002
@ -62,3 +63,7 @@ services:
networks:
second-me-network:
driver: bridge
volumes:
llama-cpp-build:
driver: local

docker/app/check_gpu_support.sh (new file)

@ -0,0 +1,57 @@
#!/bin/bash
# Helper script to check if GPU support is available at runtime
echo "=== GPU Support Check ==="
# Check if llama-server binary exists and is linked to CUDA libraries
if [ -f "/app/llama.cpp/build/bin/llama-server" ]; then
echo "llama-server binary found, checking for CUDA linkage..."
CUDA_LIBS=$(ldd /app/llama.cpp/build/bin/llama-server | grep -i "cuda\|nvidia")
if [ -n "$CUDA_LIBS" ]; then
echo "✅ llama-server is built with CUDA support:"
echo "$CUDA_LIBS"
echo "GPU acceleration is available"
# Check for GPU optimization marker file (optional, not required)
GPU_MARKER_FILE="/app/data/gpu_optimized.json"
if [ -f "$GPU_MARKER_FILE" ]; then
GPU_OPTIMIZED=$(grep -o '"gpu_optimized": *true' "$GPU_MARKER_FILE" || echo "false")
OPTIMIZED_DATE=$(grep -o '"optimized_on": *"[^"]*"' "$GPU_MARKER_FILE" | cut -d'"' -f4)
if [[ "$GPU_OPTIMIZED" == *"true"* ]]; then
echo "📝 GPU-optimized build marker found (built on: $OPTIMIZED_DATE)"
else
echo "📝 GPU marker file found but not marked as optimized (built on: $OPTIMIZED_DATE)"
fi
else
echo "📝 No GPU optimization marker file found, but CUDA support is detected in binary"
fi
# Check if NVIDIA GPU is accessible at runtime
if nvidia-smi &>/dev/null; then
echo "🔍 NVIDIA GPU is available at runtime"
echo "=== GPU ACCELERATION IS READY TO USE ==="
exit 0
else
echo "⚠️ WARNING: llama-server has CUDA support, but NVIDIA GPU is not accessible"
echo "Check that Docker is running with GPU access (--gpus all)"
exit 1
fi
else
echo "❌ llama-server is not linked with CUDA libraries"
echo "Container was built without CUDA support"
fi
else
echo "❌ llama-server binary not found at /app/llama.cpp/build/bin/llama-server"
fi
# Final check for GPU hardware
if nvidia-smi &>/dev/null; then
echo "🔍 NVIDIA GPU is available at runtime, but llama-server doesn't support CUDA"
echo "To enable GPU support, rebuild using: make docker-up (and select CUDA support when prompted)"
exit 1
else
echo "❌ No NVIDIA GPU detected at runtime"
exit 1
fi

docker/app/check_torch_cuda.py (new file)

@ -0,0 +1,52 @@
#!/usr/bin/env python3
import torch
import subprocess
import sys
import os
print("=== PyTorch CUDA Version Information ===")
print(f"PyTorch version: {torch.__version__}")
if torch.cuda.is_available():
print(f"CUDA available: Yes")
print(f"CUDA version used by PyTorch: {torch.version.cuda}")
print(f"cuDNN version: {torch.backends.cudnn.version() if torch.backends.cudnn.is_available() else 'Not available'}")
print(f"GPU device name: {torch.cuda.get_device_name(0)}")
# Try to check system CUDA version
try:
nvcc_output = subprocess.check_output(["nvcc", "--version"]).decode("utf-8")
print("\nSystem NVCC version:")
print(nvcc_output)
except:
print("\nNVCC not found in PATH")
# Check CUDA libraries
try:
print("\nChecking required CUDA libraries:")
for lib in ["libcudart.so", "libcublas.so", "libcublasLt.so"]:
print(f"\nSearching for {lib}:")
find_result = subprocess.run(f"find /usr -name '{lib}*'", shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
if find_result.returncode == 0 and find_result.stdout:
print(find_result.stdout.decode("utf-8"))
else:
print(f"No {lib} found in /usr")
except Exception as e:
print(f"Error checking libraries: {e}")
# Check LD_LIBRARY_PATH
print("\nLD_LIBRARY_PATH:")
print(os.environ.get("LD_LIBRARY_PATH", "Not set"))
else:
print("CUDA not available")
# Check system CUDA installation
print("\n=== System CUDA Information ===")
try:
nvidia_smi = subprocess.check_output(["nvidia-smi"]).decode("utf-8")
print("NVIDIA-SMI output:")
print(nvidia_smi)
except:
print("nvidia-smi not found or not working")


@ -0,0 +1,132 @@
#!/bin/bash
# Script to rebuild llama.cpp with CUDA support at runtime
# This ensures the build happens with full knowledge of the GPU environment
set -e # Exit on error but don't print each command (for cleaner logs)
cd /app
echo "========== STARTING LLAMA.CPP CUDA REBUILD PROCESS =========="
echo "Current directory: $(pwd)"
# First check if CUDA is actually available in the container
echo "Verifying NVIDIA drivers and CUDA availability..."
if ! command -v nvidia-smi &> /dev/null; then
echo "WARNING: NVIDIA drivers not found. Cannot build with CUDA support!"
echo "Make sure the container has access to the GPU and NVIDIA Container Toolkit is installed."
echo "Consider running Docker with: --gpus all"
exit 0 # Exit without error as there's no point trying to build with CUDA when no GPU is detected
fi
# Run nvidia-smi to check GPU access
echo "Detected NVIDIA GPU:"
nvidia-smi || {
echo "ERROR: nvidia-smi command failed. GPU is not properly accessible from the container."
echo "Make sure you're running Docker with GPU access enabled (--gpus all)"
exit 0 # Exit without error since there's no GPU access
}
# Install build dependencies
echo "Installing build dependencies..."
apt-get update && apt-get install -y --no-install-recommends \
build-essential \
wget \
cmake \
git \
ca-certificates \
gnupg \
libopenblas-dev
# Clean up apt cache to free space
apt-get clean
rm -rf /var/lib/apt/lists/*
# Install CUDA using NVIDIA's official Debian 12 network installation method
echo "Installing CUDA using NVIDIA's official method for Debian 12..."
wget https://developer.download.nvidia.com/compute/cuda/repos/debian12/x86_64/cuda-keyring_1.1-1_all.deb
dpkg -i cuda-keyring_1.1-1_all.deb
rm cuda-keyring_1.1-1_all.deb
apt-get update
# Install CUDA packages needed for building llama.cpp with CUDA support
apt-get install -y --fix-missing --no-install-recommends cuda-compiler-12-8
apt-get clean
rm -rf /var/lib/apt/lists/*
apt-get update
apt-get install -y --fix-missing --no-install-recommends cuda-runtime-12-8
apt-get clean
rm -rf /var/lib/apt/lists/*
apt-get update
apt-get install -y --fix-missing --no-install-recommends cuda-libraries-dev-12-8
apt-get clean
rm -rf /var/lib/apt/lists/*
# Set up environment for build
export PATH=/usr/local/cuda-12.8/bin:${PATH}
export LD_LIBRARY_PATH=/usr/local/cuda-12.8/lib64:${LD_LIBRARY_PATH}
export CUDA_HOME=/usr/local/cuda-12.8
# Set CUDACXX environment variable explicitly to help CMake find the CUDA compiler
export CUDACXX=/usr/local/cuda-12.8/bin/nvcc
export CMAKE_CUDA_COMPILER=/usr/local/cuda-12.8/bin/nvcc
# Verify CUDA compiler is available
echo "Verifying CUDA compiler (nvcc) is available:"
which nvcc || echo "ERROR: nvcc not found in PATH!"
nvcc --version || echo "ERROR: nvcc not working properly!"
echo "CUDA environment:"
echo "- CUDA_HOME: $CUDA_HOME"
echo "- CUDACXX: $CUDACXX"
echo "- CMAKE_CUDA_COMPILER: $CMAKE_CUDA_COMPILER"
echo "- PATH includes CUDA: $PATH"
echo "- LD_LIBRARY_PATH: $LD_LIBRARY_PATH"
# Show available disk space
echo "Available disk space:"
df -h
# Use local build approach to avoid volume mount issues
echo "Building llama.cpp with CUDA in a local directory..."
cd /tmp
rm -rf llama_build
mkdir -p llama_build
cd llama_build
# Clone a fresh copy of llama.cpp - this avoids volume mount issues
echo "Cloning fresh copy of llama.cpp..."
git clone https://github.com/ggerganov/llama.cpp.git .
# Configure and build with CUDA support
mkdir -p build
cd build
echo "Configuring with CMake..."
cmake -DGGML_CUDA=ON \
-DCMAKE_CUDA_ARCHITECTURES=all \
-DCMAKE_BUILD_TYPE=Release \
-DBUILD_SHARED_LIBS=OFF \
-DLLAMA_NATIVE=OFF \
-DCMAKE_CUDA_FLAGS="-Wno-deprecated-gpu-targets" \
..
echo "Building llama.cpp with CUDA support..."
cmake --build . --config Release --target all -j $(nproc)
if [ -f "bin/llama-server" ]; then
echo "Build successful! Copying binaries to /app/llama.cpp/build/bin/"
mkdir -p /app/llama.cpp/build/bin
cp bin/llama-server /app/llama.cpp/build/bin/
cp bin/llama-cli /app/llama.cpp/build/bin/ 2>/dev/null || true
chmod +x /app/llama.cpp/build/bin/llama-server /app/llama.cpp/build/bin/llama-cli
# Create GPU optimized marker
echo "{ \"gpu_optimized\": true, \"optimized_on\": \"$(date -u +\"%Y-%m-%dT%H:%M:%SZ\")\" }" > /app/data/gpu_optimized.json
echo "Testing CUDA support in built binary..."
LD_LIBRARY_PATH=/usr/local/cuda/lib64:$LD_LIBRARY_PATH /app/llama.cpp/build/bin/llama-server --version
echo ""
echo "========== CUDA BUILD COMPLETED SUCCESSFULLY =========="
else
echo "ERROR: Build failed - llama-server executable not found!"
exit 1
fi

logs/logs.lnk (new, empty file)


@ -4,7 +4,7 @@ import { useState, useEffect, useRef } from 'react';
import { useRouter } from 'next/navigation';
import InfoModal from '@/components/InfoModal';
import type { TrainingConfig } from '@/service/train';
import { startTrain, stopTrain, retrain, getTrainingParams, resetProgress } from '@/service/train';
import { startTrain, stopTrain, retrain, getTrainingParams, checkCudaAvailability, resetProgress } from '@/service/train';
import { useTrainingStore } from '@/store/useTrainingStore';
import { getMemoryList } from '@/service/memory';
import { message, Modal } from 'antd';
@ -90,6 +90,7 @@ export default function TrainingPage() {
const firstLoadRef = useRef<boolean>(true);
const pollingStopRef = useRef<boolean>(false);
const [cudaAvailable, setCudaAvailable] = useState<boolean>(false);
const [isResume, setIsResume] = useState(
trainingProgress.status === 'suspended' || trainingProgress.status === 'failed'
);
@ -104,6 +105,29 @@ export default function TrainingPage() {
fetchModelConfig();
}, []);
useEffect(() => {
// Check CUDA availability once on load
checkCudaAvailability()
.then(res => {
if (res.data.code === 0) {
const { cuda_available, cuda_info } = res.data.data;
setCudaAvailable(cuda_available);
if (cuda_available) {
console.log('CUDA is available:', cuda_info);
} else {
console.log('CUDA is not available on this system');
}
} else {
message.error(res.data.message || 'Failed to check CUDA availability');
}
})
.catch(err => {
console.error('CUDA availability check failed', err);
message.error('CUDA availability check failed');
});
}, []);
// Start polling training progress
const startPolling = () => {
if (pollingStopRef.current) {
@ -280,39 +304,23 @@ export default function TrainingPage() {
const eventSource = new EventSource('/api/trainprocess/logs');
eventSource.onmessage = (event) => {
try {
const data = JSON.parse(event.data);
// Don't try to parse as JSON, just use the raw text data directly
const logMessage = event.data;
setTrainingDetails((prev) => {
const newLogs = [
...prev.slice(-500), // Keep more log entries (500 instead of 100)
{
message: logMessage,
timestamp: new Date().toLocaleTimeString([], { hour: '2-digit', minute: '2-digit', second: '2-digit' })
}
];
setTrainingDetails((prev) => {
const newLogs = [
...prev.slice(-100),
{
message: data.message,
timestamp: new Date().toISOString()
}
];
// Save logs to localStorage for persistence between page refreshes
localStorage.setItem('trainingLogs', JSON.stringify(newLogs));
// Save logs to localStorage
// localStorage.setItem('trainingLogs', JSON.stringify(newLogs));
return newLogs;
});
} catch {
setTrainingDetails((prev) => {
const newLogs = [
...prev.slice(-100),
{
message: event.data,
timestamp: new Date().toISOString()
}
];
// Save logs to localStorage
// localStorage.setItem('trainingLogs', JSON.stringify(newLogs));
return newLogs;
});
}
return newLogs;
});
};
eventSource.onerror = (error) => {
@ -535,6 +543,7 @@ export default function TrainingPage() {
trainActionLoading={trainActionLoading}
trainingParams={trainingParams}
updateTrainingParams={updateTrainingParams}
cudaAvailable={cudaAvailable}
/>
{/* Only show training progress after training starts */}


@ -26,6 +26,14 @@ interface ModelConfig {
[key: string]: any;
}
interface TrainingParams {
data_synthesis_mode: string;
learning_rate: number;
number_of_epochs: number;
concurrency_threads: number;
use_cuda: boolean;
}
interface TrainingConfigurationProps {
baseModelOptions: BaseModelOption[];
modelConfig: ModelConfig | null;
@ -40,6 +48,7 @@ interface TrainingConfigurationProps {
trainActionLoading: boolean;
setSelectedInfo: React.Dispatch<React.SetStateAction<boolean>>;
trainingParams: TrainingConfig;
cudaAvailable: boolean;
}
const synthesisModeOptions = [
@ -61,7 +70,8 @@ const TrainingConfiguration: React.FC<TrainingConfigurationProps> = ({
changeBaseModel,
trainActionLoading,
handleTrainingAction,
setSelectedInfo
setSelectedInfo,
cudaAvailable
}) => {
const [disabledChangeParams, setDisabledChangeParams] = useState<boolean>(false);
const [openThinkingModel, setOpenThinkingModel] = useState<boolean>(false);
@ -394,6 +404,47 @@ const TrainingConfiguration: React.FC<TrainingConfigurationProps> = ({
Enter an integer between 1 and 10 (recommended: 2)
</div>
</div>
<div className="flex flex-col gap-2 mt-4">
<div className="flex gap-3 items-center">
<div className="font-medium">Enable CUDA GPU Acceleration</div>
<Tooltip title="When enabled, training will use CUDA GPU acceleration if available on your system. This can significantly speed up training but requires compatible NVIDIA hardware and drivers.">
<QuestionCircleOutlined className="cursor-pointer" />
</Tooltip>
</div>
<div className="flex items-center">
<label className="inline-flex items-center cursor-pointer">
<input
type="checkbox"
checked={
disabledChangeParams && nowTrainingParams && !changeBaseModel
? nowTrainingParams.use_cuda
: trainingParams.use_cuda
}
className="sr-only peer"
disabled={disabledChangeParams || !cudaAvailable}
onChange={(e) => {
updateTrainingParams({ ...trainingParams, use_cuda: e.target.checked });
}}
/>
<div className={`relative w-11 h-6 ${!cudaAvailable ? 'bg-gray-300' : 'bg-gray-200'} peer-focus:outline-none peer-focus:ring-4 peer-focus:ring-blue-300 rounded-full peer peer-checked:after:translate-x-full rtl:peer-checked:after:-translate-x-full peer-checked:after:border-white after:content-[''] after:absolute after:top-[2px] after:start-[2px] after:bg-white after:border-gray-300 after:border after:rounded-full after:h-5 after:w-5 after:transition-all ${cudaAvailable ? 'peer-checked:bg-blue-600' : 'peer-checked:bg-gray-400'}`}></div>
<span className={`ms-3 text-sm font-medium ${!cudaAvailable ? 'text-gray-500' : 'text-gray-700'}`}>
{disabledChangeParams && nowTrainingParams && !changeBaseModel
? nowTrainingParams.use_cuda
? 'Enabled'
: 'Disabled'
: trainingParams.use_cuda
? 'Enabled'
: 'Disabled'}
</span>
</label>
</div>
<div className="text-xs text-gray-500">
{cudaAvailable
? 'Enable for faster training on NVIDIA GPUs.'
: 'CUDA acceleration is not available on this system.'}
</div>
</div>
</div>
</div>


@ -11,6 +11,7 @@ const TrainingLog: React.FC<TrainingLogProps> = ({ trainingDetails }: TrainingLo
const consoleEndRef = useRef<HTMLDivElement>(null);
const [isUserScrolling, setIsUserScrolling] = useState(false);
const userScrollTimeout = useRef<NodeJS.Timeout | null>(null);
const [isAutoScrollEnabled, setIsAutoScrollEnabled] = useState(true);
// Smooth scroll console to bottom
const smoothScrollConsole = () => {
@ -29,17 +30,37 @@ const TrainingLog: React.FC<TrainingLogProps> = ({ trainingDetails }: TrainingLo
useEffect(() => {
// Set up scroll event listener to detect user scrolling
const handleUserScroll = () => {
setIsUserScrolling(true);
if (!consoleEndRef.current) return;
const consoleContainer = consoleEndRef.current.closest('.overflow-y-auto');
if (!(consoleContainer instanceof HTMLElement)) return;
// Check if scrolled away from bottom
const isScrolledToBottom =
Math.abs((consoleContainer.scrollHeight - consoleContainer.scrollTop) - consoleContainer.clientHeight) < 50;
// If scrolled away from bottom, consider it manual scrolling
if (!isScrolledToBottom) {
setIsUserScrolling(true);
// Clear any existing timeout
if (userScrollTimeout.current) {
clearTimeout(userScrollTimeout.current);
}
// Clear any existing timeout
if (userScrollTimeout.current) {
clearTimeout(userScrollTimeout.current);
}
// Reset the flag after a short delay
userScrollTimeout.current = setTimeout(() => {
// Reset the flag after a delay
userScrollTimeout.current = setTimeout(() => {
setIsUserScrolling(false);
}, 5000); // 5 seconds delay before allowing auto-scroll again
} else {
// If at bottom, not considered manual scrolling
setIsUserScrolling(false);
}, 2000); // 2 seconds delay before allowing auto-scroll again
if (userScrollTimeout.current) {
clearTimeout(userScrollTimeout.current);
userScrollTimeout.current = null;
}
}
};
// Find the console container and attach the scroll listener
@ -65,7 +86,16 @@ const TrainingLog: React.FC<TrainingLogProps> = ({ trainingDetails }: TrainingLo
if (trainingDetails.length > 0) {
smoothScrollConsole();
}
}, [trainingDetails]);
}, [trainingDetails, isAutoScrollEnabled]);
const toggleAutoScroll = () => {
setIsAutoScrollEnabled(!isAutoScrollEnabled);
if (!isAutoScrollEnabled) {
// If we're re-enabling auto-scroll, scroll to bottom immediately
setIsUserScrolling(false);
setTimeout(smoothScrollConsole, 50);
}
};
return (
<div className="mt-4">


@ -138,3 +138,17 @@ export const getTrainingParams = () => {
url: `/api/trainprocess/training_params`
});
};
export const checkCudaAvailability = () => {
return Request<CommonResponse<{
cuda_available: boolean;
cuda_info: {
device_count?: number;
current_device?: number;
device_name?: string;
};
}>>({
method: 'get',
url: '/api/kernel2/cuda/available'
});
};
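For reference, the backend route this calls (GET /api/kernel2/cuda/available) could look roughly like the following Flask sketch; it is an assumption based on the response shape above, not the repository's actual handler.

import torch
from flask import Blueprint, jsonify

cuda_bp = Blueprint("cuda", __name__)

@cuda_bp.route("/api/kernel2/cuda/available", methods=["GET"])
def cuda_available():
    available = torch.cuda.is_available()
    info = {}
    if available:
        info = {
            "device_count": torch.cuda.device_count(),
            "current_device": torch.cuda.current_device(),
            "device_name": torch.cuda.get_device_name(0),
        }
    # Mirrors the CommonResponse shape the frontend expects: code 0 on success.
    return jsonify({"code": 0, "message": "success",
                    "data": {"cuda_available": available, "cuda_info": info}})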


@ -169,6 +169,18 @@ Domain Timelines:
content = response.choices[0].message.content
shift_pattern = r"\{.*\}"
shift_perspective_result = self.__parse_json_response(content, shift_pattern)
# Check if result is None and provide default values to avoid TypeError
if shift_perspective_result is None:
logger.warning(f"Failed to parse perspective shift result, using default values: {content}")
# Create a default mapping with expected parameters
shift_perspective_result = {
"domainDesc": f"You have knowledge and experience related to {shade_info.name}.",
"domainContent": shade_info.content_third_view,
"domainTimeline": []
}
# Now it's safe to pass shift_perspective_result as kwargs
shade_info.add_second_view(**shift_perspective_result)
return shade_info


@ -53,8 +53,18 @@ class PreferenceQAGenerator:
bio: Biography or context information to use in prompt generation.
preference_language: Language for prompts ("Chinese/中文" or otherwise English).
"""
# Ensure the filename is actually a string
if filename is None:
raise ValueError("Filename cannot be None")
self.filename = filename
self.is_cot = is_cot
# Convert is_cot to bool if it's a string
if isinstance(is_cot, str):
self.is_cot = is_cot.lower() == 'true'
else:
self.is_cot = bool(is_cot)
logger.info(f"PreferenceQAGenerator initialized with is_cot={self.is_cot}")
with open(self.filename, "r", encoding="utf-8") as f:
self.pre_msg = json.load(f)

download_model.sh

@ -1 +1,12 @@
python lpm_kernel/L2/utils.py Qwen2.5-0.5B-Instruct
#!/bin/bash
# Script to download model from Hugging Face
# Usage: ./download_model.sh [model_name]
# If no model name is provided, will attempt to get it from config
if [ "$1" != "" ]; then
# Use provided model name
python lpm_kernel/L2/utils.py "$1"
else
# No model name provided, let utils.py determine from config
python lpm_kernel/L2/utils.py
fi


@ -35,11 +35,19 @@ class L2Generator:
Args:
data_path: Path to the raw data directory. Defaults to "../raw_data".
preferred_lang: Preferred language for data processing. Defaults to "English".
is_cot: Whether to use Chain of Thought reasoning. Can be bool or string.
"""
self.data_path = data_path
self.data_processor = L2DataProcessor(data_path, preferred_lang)
self.preferred_lang = preferred_lang
self.is_cot = is_cot
# Convert is_cot to bool if it's a string
if isinstance(is_cot, str):
self.is_cot = is_cot.lower() == 'true'
else:
self.is_cot = bool(is_cot)
logging.info(f"L2Generator initialized with is_cot={self.is_cot}")
def data_preprocess(self, note_list: List[Note], basic_info: Dict):
"""Preprocess the input notes and basic information.
@ -60,39 +68,50 @@ class L2Generator:
graph_path: str,
config_path: str,
):
"""Generate subjective data based on input notes and user information.
"""Generate subjective data for personalization.
This method orchestrates the generation of subjective data including preferences,
diversity, self-Q&A data, and graph indexing.
Args:
note_list: List of Note objects.
basic_info: Dictionary containing basic user information.
note_list: List of Note objects to process.
basic_info: Dictionary containing user information.
data_output_base_dir: Base directory for output data.
topics_path: Path to topics data.
entities_path: Path to entities data.
entities_path: Path to entity data.
graph_path: Path to graph data.
config_path: Path to configuration file.
"""
global_bio = basic_info["globalBio"]
user_name = basic_info["username"]
user_intro = basic_info["aboutMe"]
if not os.path.exists(data_output_base_dir):
os.makedirs(data_output_base_dir)
preference_output_path = "preference.json"
diversity_output_path = "diversity.json"
selfqa_output_path = "selfqa.json"
# Check if the file exists
if not os.path.exists(topics_path):
# Create an empty file
with open(topics_path, "w") as f:
f.write(json.dumps([]))
# Generate subjective data
self.data_processor.gen_subjective_data(
note_list,
data_output_base_dir,
preference_output_path,
diversity_output_path,
selfqa_output_path,
global_bio,
topics_path,
entities_path,
graph_path,
user_name,
config_path,
user_intro,
note_list=note_list,
data_output_base_dir=data_output_base_dir,
preference_output_path="preference.json",
diversity_output_path="diversity.json",
selfqa_output_path="selfqa.json",
global_bio=basic_info["globalBio"],
topics_path=topics_path,
entitys_path=entities_path,
graph_path=graph_path,
user_name=basic_info["username"],
config_path=config_path,
user_intro=basic_info["aboutMe"],
)
# Merge JSON files for training
self.merge_json_files(data_output_base_dir)
# Release Ollama models from memory after data synthesis is complete
self._release_ollama_models()
def gen_preference_data(
self,
@ -187,6 +206,20 @@ class L2Generator:
with open(merged_output_path, 'w', encoding='utf-8') as f:
json.dump(merged_data, f, ensure_ascii=False, indent=2)
def _release_ollama_models(self):
"""Release Ollama models from memory to free up VRAM for training.
This method calls the release function defined in the train module.
It's important to release models after data synthesis and before training
to ensure VRAM is properly freed.
"""
try:
from lpm_kernel.L2.train import release_ollama_models
release_ollama_models()
except Exception as e:
import logging
logging = logging.getLogger(__name__)
logging.warning(f"Failed to release Ollama models: {str(e)}")
def clean_graphrag_keys(self):
GRAPH_CONFIG = os.path.join(

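The release_ollama_models helper imported in the hunk above lives in the train module and is not shown here. One assumed way to implement such a release against Ollama's HTTP API is sketched below (a generate request with keep_alive set to 0 asks Ollama to unload the model); this is an illustration, not the PR's actual code.

import logging
import requests

logger = logging.getLogger(__name__)

def release_ollama_models(base_url: str = "http://localhost:11434") -> None:
    """Ask Ollama to unload any loaded models so their VRAM is freed."""
    try:
        loaded = requests.get(f"{base_url}/api/ps", timeout=5).json().get("models", [])
        for model in loaded:
            # keep_alive=0 tells Ollama to drop the model right after this call.
            requests.post(f"{base_url}/api/generate",
                          json={"model": model["name"], "keep_alive": 0},
                          timeout=30)
            logger.info("Requested unload of Ollama model %s", model["name"])
    except requests.RequestException as e:
        logger.warning("Could not release Ollama models: %s", e)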
lpm_kernel/L2/memory_manager.py (new file)

@ -0,0 +1,149 @@
"""Memory management utilities for PyTorch training.
This module provides lightweight utilities to monitor memory usage
and configure PyTorch's built-in memory management features.
"""
import os
import gc
import logging
import psutil
import torch
from typing import Dict, Any
# Configure logging
logger = logging.getLogger(__name__)
class MemoryManager:
"""Simple memory manager that leverages PyTorch's built-in memory optimizations."""
def __init__(self):
"""Initialize the memory manager."""
self.cuda_available = torch.cuda.is_available()
self.process = psutil.Process(os.getpid())
# Remove redundant environment variable setting - now handled in train_for_user.sh
def get_memory_info(self) -> Dict[str, Any]:
"""Get current memory usage information."""
info = {
"ram_used_percent": psutil.virtual_memory().percent,
"ram_used_gb": psutil.virtual_memory().used / (1024**3),
"ram_available_gb": psutil.virtual_memory().available / (1024**3),
"ram_total_gb": psutil.virtual_memory().total / (1024**3),
}
if self.cuda_available:
try:
info.update({
"vram_used_gb": torch.cuda.memory_allocated() / (1024**3),
"vram_reserved_gb": torch.cuda.memory_reserved() / (1024**3),
"vram_total_gb": torch.cuda.get_device_properties(0).total_memory / (1024**3),
})
except RuntimeError as e:
logger.warning(f"Error getting CUDA memory info: {str(e)}")
self.cuda_available = False
return info
def cleanup_memory(self, force: bool = False) -> None:
"""Free up memory by garbage collection and emptying CUDA cache."""
# Run Python garbage collection
gc.collect()
# Empty CUDA cache if available
if self.cuda_available:
torch.cuda.empty_cache()
# Log memory status after cleanup
if force:
info = self.get_memory_info()
logger.info(
f"Memory after cleanup: RAM: {info['ram_used_gb']:.2f}GB / {info['ram_total_gb']:.2f}GB, "
f"VRAM: {info.get('vram_used_gb', 0):.2f}GB / {info.get('vram_total_gb', 0):.2f}GB"
)
def get_optimal_training_config(self) -> Dict[str, Any]:
"""Get recommended configurations for model training based on hardware capabilities."""
# Default configs that rely on PyTorch's automatic memory management
config = {
"device_map": "auto",
"fp16": False,
"bf16": False,
"gradient_checkpointing": True,
"gradient_accumulation_steps": 1,
}
# Enable mixed precision based on hardware support
if self.cuda_available:
capability = torch.cuda.get_device_capability()
if capability[0] >= 8: # Ampere or newer (supports BF16)
config["bf16"] = True
elif capability[0] >= 7: # Volta or newer (supports FP16)
config["fp16"] = True
# Adjust accumulation steps based on available memory
vram_gb = self.get_memory_info().get("vram_total_gb", 0)
if vram_gb < 8: # Small GPUs
config["gradient_accumulation_steps"] = 4
elif vram_gb < 16: # Medium GPUs
config["gradient_accumulation_steps"] = 2
return config
def optimize_model_for_training(self, model):
"""Apply PyTorch's built-in memory optimizations for training."""
# Enable gradient checkpointing if available
if hasattr(model, "gradient_checkpointing_enable"):
logger.info("Enabling gradient checkpointing for memory efficiency")
model.gradient_checkpointing_enable()
# Enable memory-efficient attention for PyTorch 2.0+
if hasattr(model, "config"):
try:
model.config.use_memory_efficient_attention = True
except:
pass
# Enable flash attention for compatible GPUs
if self.cuda_available and torch.cuda.get_device_capability()[0] >= 8:
try:
model.config.attn_implementation = "flash_attention_2"
except:
pass
return model
def optimize_training_args(self, training_args):
"""Configure training arguments for efficient memory usage."""
if not training_args:
return None
# Get optimal configuration based on hardware
config = self.get_optimal_training_config()
# Apply configurations to training arguments
if not getattr(training_args, "fp16", False) and not getattr(training_args, "bf16", False):
training_args.fp16 = config["fp16"]
training_args.bf16 = config["bf16"]
if not getattr(training_args, "gradient_checkpointing", False):
training_args.gradient_checkpointing = config["gradient_checkpointing"]
if training_args.gradient_accumulation_steps == 1:
training_args.gradient_accumulation_steps = config["gradient_accumulation_steps"]
logger.info("Training configuration optimized for memory efficiency:")
logger.info(f" Mixed precision: FP16={training_args.fp16}, BF16={training_args.bf16}")
logger.info(f" Gradient checkpointing: {training_args.gradient_checkpointing}")
logger.info(f" Gradient accumulation steps: {training_args.gradient_accumulation_steps}")
return training_args
# Global memory manager instance
memory_manager = MemoryManager()
def get_memory_manager() -> MemoryManager:
"""Get the global memory manager instance."""
return memory_manager
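A short usage sketch of the manager defined above, assuming lpm_kernel is importable; illustrative only.

from lpm_kernel.L2.memory_manager import get_memory_manager

mm = get_memory_manager()
print(mm.get_memory_info())              # RAM and, if CUDA is available, VRAM figures
print(mm.get_optimal_training_config())  # e.g. bf16/fp16 flags, gradient accumulation steps
mm.cleanup_memory(force=True)            # gc.collect() plus torch.cuda.empty_cache()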


@ -6,36 +6,123 @@ LoRA architecture during inference.
"""
import argparse
import os
import gc
import sys
import logging
import traceback
import torch
import datetime
from typing import Optional, Dict, Any
from peft import PeftModel
from transformers import AutoModelForCausalLM, AutoTokenizer
import logging
from lpm_kernel.L2.memory_manager import get_memory_manager
# Configure logging
logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO)
def merge_lora_weights(base_model_path, lora_adapter_path, output_model_path):
"""Merge LoRA weights into a base model and save the result.
This function loads a base model and a LoRA adapter, merges them together,
and saves the resulting model to the specified output path.
and saves the resulting model to the specified output path. It leverages
PyTorch's built-in memory management features.
Args:
base_model_path: Path to the base model directory.
lora_adapter_path: Path to the LoRA adapter directory.
output_model_path: Path where the merged model will be saved.
"""
# Load the base model
logging.info(f"Loading base model from {base_model_path}")
base_model = AutoModelForCausalLM.from_pretrained(base_model_path)
tokenizer = AutoTokenizer.from_pretrained(base_model_path)
# Load the LoRA adapter and apply it to the base model
lora_model = PeftModel.from_pretrained(base_model, lora_adapter_path)
# Merge LoRA weights into the base model
merged_model = lora_model.merge_and_unload()
# Save the merged model and tokenizer
merged_model.save_pretrained(output_model_path)
tokenizer.save_pretrained(output_model_path)
# Get memory manager
memory_manager = get_memory_manager()
try:
# Log initial memory state
memory_info = memory_manager.get_memory_info()
logger.info(f"Initial memory state: RAM used: {memory_info['ram_used_gb']:.2f}GB, "
f"available: {memory_info['ram_available_gb']:.2f}GB")
# Determine if CUDA is available and should be used
use_cuda = memory_manager.cuda_available
device = "cuda" if use_cuda else "cpu"
if use_cuda:
logger.info(f"CUDA is available. VRAM used: {memory_info.get('vram_used_gb', 0):.2f}GB")
else:
logger.warning("CUDA not available or not enabled. Using CPU for model operations.")
# Clean up memory before starting
memory_manager.cleanup_memory(force=True)
# Explicitly set device configuration based on available hardware
device_map = "auto" if use_cuda else None
dtype = torch.float16 if use_cuda else torch.float32
logger.info(f"Loading base model from {base_model_path} with device_map={device_map}, dtype={dtype}")
# Use explicit configuration for GPU utilization
base_model = AutoModelForCausalLM.from_pretrained(
base_model_path,
torch_dtype=dtype,
device_map=device_map
)
# Load tokenizer - this doesn't consume much memory
tokenizer = AutoTokenizer.from_pretrained(base_model_path)
# Load the LoRA adapter and apply it to the base model
logger.info(f"Loading LoRA adapter from {lora_adapter_path}")
lora_model = PeftModel.from_pretrained(base_model, lora_adapter_path)
# Merge weights - this is done automatically by PyTorch on appropriate devices
logger.info(f"Merging LoRA weights into base model on {device}")
merged_model = lora_model.merge_and_unload()
# Clean up before saving
memory_manager.cleanup_memory()
# Add inference optimization config to the merged model for faster startup
if use_cuda:
# Set inference-specific configuration in model config
if hasattr(merged_model.config, "torch_dtype"):
merged_model.config.torch_dtype = "float16" # Prefer float16 for inference
if not hasattr(merged_model.config, "pretraining_tp"):
merged_model.config.pretraining_tp = 1 # For tensor parallelism during inference
# Set default inference device
if not hasattr(merged_model.config, "_default_inference_device"):
merged_model.config._default_inference_device = "cuda:0"
logger.info("Added GPU optimization settings to model configuration")
# Save merged model with shard size to prevent OOM errors during save
logger.info(f"Saving merged model to {output_model_path}")
merged_model.save_pretrained(
output_model_path,
safe_serialization=True,
max_shard_size="2GB" # Sharded saving to avoid memory spikes
)
tokenizer.save_pretrained(output_model_path)
# Save a special marker file to indicate this model should use GPU for inference
if use_cuda:
with open(os.path.join(output_model_path, "gpu_optimized.json"), "w") as f:
import json
json.dump({"gpu_optimized": True, "optimized_on": datetime.datetime.now().isoformat()}, f)
logger.info("Added GPU optimization marker file for faster service startup")
logger.info("Model successfully merged and saved!")
except Exception as e:
logger.error(f"Error during model merge: {str(e)}")
logger.error(traceback.format_exc())
# Force cleanup
gc.collect()
if torch.cuda.is_available():
torch.cuda.empty_cache()
raise
def merge_model_weights(

lpm_kernel/L2/train.py

@ -28,10 +28,13 @@ from lpm_kernel.L2.utils import (
create_and_prepare_model,
formatting_prompts_func,
create_chat_data,
release_ollama_models_early,
)
from lpm_kernel.configs.logging import LOGGING_CONFIG
import logging.config
from lpm_kernel.configs.logging import get_train_process_logger
from lpm_kernel.L2.memory_manager import get_memory_manager
logger = get_train_process_logger()
@ -45,6 +48,25 @@ class LogTqdm(tqdm):
# Replace the default tqdm
sys.modules["tqdm"].tqdm = LogTqdm
# Debug callback for logging training progress
class DebugCallback(transformers.TrainerCallback):
def __init__(self):
self.total_time = 0
self.last_time = time.time()
def on_step_end(self, args, state, control, **kwargs):
if state.global_step % 10 == 0:
current_time = time.time()
step_time = current_time - self.last_time
self.total_time += step_time
self.last_time = current_time
# Log step time and training progress
logger.info(f"Step {state.global_step}: {step_time:.2f}s - Total training time: {self.total_time:.2f}s")
def on_epoch_end(self, args, state, control, **kwargs):
logger.info(f"Epoch {state.epoch} completed")
@dataclass
class ModelArguments:
@ -112,6 +134,10 @@ class ModelArguments:
default=False,
metadata={"help": "Enables UnSloth for training."},
)
use_cuda: Optional[bool] = field(
default=False,
metadata={"help": "Enables CUDA GPU acceleration for training and inference when available."},
)
@dataclass
@ -162,80 +188,117 @@ def main(model_args, data_args, training_args):
for handler in logging.getLogger().handlers:
handler.flush()
logger.info("start 1")
set_seed(training_args.seed)
logger.info("start 2")
# model
model, peft_config, tokenizer = create_and_prepare_model(
model_args, data_args, training_args
)
logger.info("start 3")
# gradient ckpt
model.config.use_cache = not training_args.gradient_checkpointing
training_args.gradient_checkpointing = (
training_args.gradient_checkpointing and not model_args.use_unsloth
)
logger.info("start 4")
if training_args.gradient_checkpointing:
training_args.gradient_checkpointing_kwargs = {
"use_reentrant": model_args.use_reentrant
}
# Configure system resources for optimal performance
def configure_system_resources(num_cores=None):
"""
Configure system resources to optimize training performance
Args:
num_cores: Number of CPU cores to use, if None, automatically detect
"""
# Automatically detect available cores if not specified
if num_cores is None:
num_cores = min(os.cpu_count(), 6)  # Limit to 6 cores to match the Docker configuration
logger.info(f"Configuring system to use {num_cores} CPU cores")
# Set environment variables
os.environ["OMP_NUM_THREADS"] = str(num_cores)
os.environ["MKL_NUM_THREADS"] = str(num_cores)
os.environ["NUMEXPR_NUM_THREADS"] = str(num_cores)
# Set PyTorch thread count
torch.set_num_threads(num_cores)
# If supported, set PyTorch multi-thread optimization
if hasattr(torch, "set_num_interop_threads"):
torch.set_num_interop_threads(num_cores)
# Enable memory-optimized garbage collection
# import gc
# gc.enable()
# # Monitor memory usage and clean up periodically
# def schedule_gc():
# gc.collect()
# torch.cuda.empty_cache() if torch.cuda.is_available() else None
# return schedule_gc
# If CUDA is available, set CUDA device
if torch.cuda.is_available():
torch.cuda.set_device(0)
logger.info(f"CUDA is available. Using device: {torch.cuda.get_device_name(0)}")
# Display CUDA memory information
logger.info(f"CUDA memory allocated: {torch.cuda.memory_allocated(0) / 1024**2:.2f} MB")
logger.info(f"CUDA memory reserved: {torch.cuda.memory_reserved(0) / 1024**2:.2f} MB")
# Get memory manager for optimization
memory_manager = get_memory_manager()
memory_manager.cleanup_memory(force=True)
# Call function to configure system resources
configure_system_resources()
# Release Ollama models if they exist to free up VRAM
if torch.cuda.is_available() and model_args.use_cuda:
release_ollama_models_early()
logger.info("Initializing training with memory optimizations")
set_seed(training_args.seed)
# Apply PyTorch memory optimizations to training arguments
logger.info("Applying memory optimizations to training configuration")
training_args = memory_manager.optimize_training_args(training_args)
# --- Accelerate optimizer state offloading logic ---
# Enable optimizer state offload to CPU if VRAM is low and not using DeepSpeed
vram_total = memory_manager.get_memory_info().get("vram_total_gb", 0)
use_accelerate_offload = False
if torch.cuda.is_available() and model_args.use_cuda and vram_total > 0 and vram_total < 16:
# Only set if not already using DeepSpeed
if not hasattr(training_args, "deepspeed") or training_args.deepspeed is None:
logger.info("Enabling Hugging Face Accelerate optimizer state offload to CPU for low VRAM GPUs")
accelerate_config = {
"compute_environment": "LOCAL_MACHINE",
"deepspeed_config": None,
"distributed_type": "NO",
"downcast_bf16": False,
"fsdp_config": {},
"main_training_function": "main",
"mixed_precision": "no",
"num_machines": 1,
"num_processes": 1,
"use_cpu": False,
"zero3_init_flag": False,
"offload_optimizer_device": "cpu",
"offload_param_device": "none"
}
training_args.accelerate_config = accelerate_config
use_accelerate_offload = True
# Model loading with device_map="auto" for automatic offloading
logger.info(f"Loading model with automatic memory management from {model_args.model_name_or_path}")
# Create model arguments dict with automatic offloading
model_kwargs = {
# Don't use "auto" device_map initially to avoid meta tensor issues
"device_map": None,
"trust_remote_code": True
}
# Configure quantization if requested
if model_args.use_4bit_quantization:
from transformers import BitsAndBytesConfig
compute_dtype = getattr(torch, model_args.bnb_4bit_compute_dtype)
quant_storage_dtype = getattr(torch, model_args.bnb_4bit_quant_storage_dtype)
model_kwargs["quantization_config"] = BitsAndBytesConfig(
load_in_4bit=model_args.use_4bit_quantization,
bnb_4bit_quant_type=model_args.bnb_4bit_quant_type,
bnb_4bit_compute_dtype=compute_dtype,
bnb_4bit_use_double_quant=model_args.use_nested_quant,
bnb_4bit_quant_storage=quant_storage_dtype,
)
# For 4-bit models, we can use device_map="auto"
model_kwargs["device_map"] = "auto"
logger.info("Using 4-bit quantization for memory efficiency")
elif model_args.use_8bit_quantization:
from transformers import BitsAndBytesConfig
model_kwargs["quantization_config"] = BitsAndBytesConfig(
load_in_8bit=model_args.use_8bit_quantization
)
# For 8-bit models, we can use device_map="auto"
model_kwargs["device_map"] = "auto"
logger.info("Using 8-bit quantization for memory efficiency")
# Flash attention for memory efficiency when supported
if model_args.use_flash_attn and torch.cuda.is_available() and model_args.use_cuda:
model_kwargs["attn_implementation"] = "flash_attention_2"
logger.info("Using Flash Attention 2 for memory efficiency")
# Load model with built-in memory management features
model, peft_config, tokenizer = create_and_prepare_model(
model_args, data_args, training_args, model_kwargs=model_kwargs
)
# If model has meta tensors, handle them properly
if hasattr(model, "is_meta") and model.is_meta:
logger.info("Model has meta tensors, using to_empty() to properly initialize")
device = "cuda" if torch.cuda.is_available() and model_args.use_cuda else "cpu"
model = model.to_empty(device=device)
# Apply gradient checkpointing for memory efficiency
if training_args.gradient_checkpointing and hasattr(model, "gradient_checkpointing_enable"):
logger.info("Enabling gradient checkpointing for memory efficiency")
model.gradient_checkpointing_enable()
model.config.use_cache = False
# Allow only one full forward/backward pass at a time (if needed for memory)
if torch.cuda.is_available() and memory_manager.get_memory_info().get("vram_total_gb", 0) < 8:
torch.cuda.set_per_process_memory_fraction(0.9)
logger.info("Setting memory fraction limit to avoid OOM errors")
# datasets
train_dataset = create_chat_data(
data_args,
tokenizer,
)
response_template = "\n<|im_start|>assistant\n"
collator = DataCollatorForCompletionOnlyLM(response_template, tokenizer=tokenizer)
training_args.dataset_kwargs = {
@ -243,6 +306,45 @@ def main(model_args, data_args, training_args):
"add_special_tokens": data_args.add_special_tokens,
}
# Use DeepSpeed to handle meta tensors if available
try:
# Only configure DeepSpeed if meta tensors are present and DeepSpeed is available
if hasattr(model, "is_meta") and model.is_meta:
logger.info("Model has meta tensors, checking DeepSpeed availability")
# First verify DeepSpeed is properly installed and importable
try:
import deepspeed
logger.info("DeepSpeed is available, configuring for meta tensor handling")
# Configure with appropriate settings for meta tensors
training_args.deepspeed = {
"zero_stage": 3,
"offload_optimizer": {
"device": "cpu"
},
"offload_param": {
"device": "cpu"
},
"zero3_init_flag": True,
"zero_force_ds_cpu_optimizer": False
}
logger.info("DeepSpeed configured for meta tensor handling")
except ImportError:
logger.warning("DeepSpeed is not available, meta tensors will be handled differently")
# If DeepSpeed isn't available, use alternative approach to handle meta tensors
if torch.cuda.is_available() and model_args.use_cuda:
logger.info("Initializing meta tensors on GPU")
# Use device_map instead of DeepSpeed for meta tensor initialization
from accelerate import init_empty_weights
with init_empty_weights():
model.to_empty(device="cuda")
else:
logger.info("Initializing meta tensors on CPU")
model.to_empty(device="cpu")
except Exception as e:
logger.warning(f"Could not configure meta tensor handling: {e}")
logger.warning(traceback.format_exc())
trainer = SFTTrainer(
model=model,
tokenizer=tokenizer,
@ -252,145 +354,46 @@ def main(model_args, data_args, training_args):
formatting_func=formatting_prompts_func,
data_collator=collator,
)
# Print model details
trainer.accelerator.print(f"{trainer.model}")
trainer.model.print_trainable_parameters()
logger.info("start 6")
# train
checkpoint = None
if training_args.resume_from_checkpoint is not None:
logger.info("start 6.1")
checkpoint = training_args.resume_from_checkpoint
logger.info("start 6.2")
class DebugCallback(transformers.TrainerCallback):
"""
Debug callback to monitor training process
"""
if hasattr(trainer.model, "print_trainable_parameters"):
trainer.model.print_trainable_parameters()
# Memory usage tracking callback
class MemoryMonitorCallback(transformers.TrainerCallback):
def __init__(self):
self.step_times = {}
self.current_step_start = None
def on_train_begin(self, args, state, control, **kwargs):
logger.info("=== Training Begin ===")
logger.info("Checking initial conditions:")
trainer = kwargs.get("trainer")
if trainer:
# Check model status
logger.info(f"Model device: {trainer.model.device}")
logger.info(f"Model dtype: {next(trainer.model.parameters()).dtype}")
# Check data loader
if hasattr(trainer, "train_dataset"):
logger.info(f"Training dataset size: {len(trainer.train_dataset)}")
# Check optimizer
if hasattr(trainer, "optimizer"):
logger.info("Optimizer configuration:")
for i, group in enumerate(trainer.optimizer.param_groups):
logger.info(
f"Group {i}: lr={group['lr']}, weight_decay={group['weight_decay']}"
)
def on_step_begin(self, args, state, control, **kwargs):
self.current_step_start = time.time()
logger.info(f"\n=== Starting Step {state.global_step + 1} ===")
# Check system status every 10 steps
if state.global_step % 10 == 0:
process = psutil.Process()
with process.oneshot():
logger.info(f"CPU Usage: {process.cpu_percent()}%")
logger.info(
f"Memory Usage: {process.memory_info().rss / 1024**2:.2f}MB"
)
logger.info(f"Thread Count: {process.num_threads()}")
self.memory_manager = get_memory_manager()
def on_step_end(self, args, state, control, **kwargs):
if self.current_step_start:
step_time = time.time() - self.current_step_start
self.step_times[state.global_step] = step_time
avg_time = sum(self.step_times.values()) / len(self.step_times)
logger.info(
f"Step {state.global_step + 1} completed in {step_time:.2f}s (avg: {avg_time:.2f}s)"
)
# Check if step time is much longer than average
if step_time > avg_time * 2 and len(self.step_times) > 1:
logger.warning(
f"Step {state.global_step + 1} took {step_time:.2f}s, which is much longer than average!"
)
trainer = kwargs.get("trainer")
if trainer and hasattr(trainer, "optimizer"):
# Check gradient status
grad_norms = []
for name, param in trainer.model.named_parameters():
if param.grad is not None:
grad_norms.append(param.grad.norm().item())
if grad_norms:
avg_grad_norm = sum(grad_norms) / len(grad_norms)
logger.info(f"Average gradient norm: {avg_grad_norm:.5f}")
else:
logger.warning("No gradients found in this step!")
def on_log(self, args, state, control, logs=None, **kwargs):
if logs:
logger.info(f"=== Logs for Step {state.global_step} ===")
for key, value in logs.items():
logger.info(f"{key}: {value}")
def on_train_end(self, args, state, control, **kwargs):
logger.info("=== Training Ended ===")
logger.info(f"Total steps completed: {state.global_step}")
if self.step_times:
avg_time = sum(self.step_times.values()) / len(self.step_times)
logger.info(f"Average step time: {avg_time:.2f}s")
# Check memory every 5 steps
if state.global_step % 5 == 0 and torch.cuda.is_available():
info = self.memory_manager.get_memory_info()
vram_usage_pct = info.get("vram_used_gb", 0) / info.get("vram_total_gb", 1) * 100
if vram_usage_pct > 90:
logger.info(f"VRAM usage high ({vram_usage_pct:.1f}%), cleaning cache")
self.memory_manager.cleanup_memory()
def on_save(self, args, state, control, **kwargs):
# Free up memory before saving
self.memory_manager.cleanup_memory(force=True)
# Add memory monitoring
trainer.add_callback(MemoryMonitorCallback())
# Add existing debug callback
trainer.add_callback(DebugCallback())
# Add more detailed logs
logger.info("Starting training preparation...")
# Resume from checkpoint if specified
checkpoint = None
if training_args.resume_from_checkpoint is not None:
checkpoint = training_args.resume_from_checkpoint
# Training with automatic memory management
try:
logger.info("Initializing training process...")
# Check model loading and structure
logger.info("Analyzing model structure...")
model = trainer.model
def print_model_structure(model, prefix=""):
logger.info(f"{prefix}Model class: {model.__class__.__name__}")
for name, child in model.named_children():
logger.info(f"{prefix}Child: {name} ({child.__class__.__name__})")
if len(list(child.named_children())) > 0:
print_model_structure(child, prefix + " ")
# print_model_structure(model)
# Check model size
total_params = sum(p.numel() for p in trainer.model.parameters())
trainable_params = sum(
p.numel() for p in trainer.model.parameters() if p.requires_grad
)
logger.info(f"Total parameters: {total_params:,}")
logger.info(f"Trainable parameters: {trainable_params:,}")
# Check optimizer settings
logger.info("Checking optimizer settings...")
# Check data loader
train_dataloader = trainer.get_train_dataloader()
logger.info(f"Train dataloader created with {len(train_dataloader)} batches")
process = psutil.Process()
memory_info = process.memory_info()
logger.info(f"Memory usage details:")
logger.info(f"RSS (Resident Set Size): {memory_info.rss / 1024**2:.2f}MB")
logger.info(f"VMS (Virtual Memory Size): {memory_info.vms / 1024**2:.2f}MB")
# Start training
logger.info("Starting actual training process...")
logger.info("Starting training with memory-optimized configuration")
trainer.train(resume_from_checkpoint=checkpoint)
except Exception as e:
logger.error(f"Error during training: {str(e)}")
@ -398,9 +401,13 @@ def main(model_args, data_args, training_args):
logger.error(f"Traceback: {traceback.format_exc()}")
raise
logger.info("start 7")
# Save the model
if trainer.is_fsdp_enabled:
trainer.accelerator.state.fsdp_plugin.set_state_dict_type("FULL_STATE_DICT")
# Clean up before saving
memory_manager.cleanup_memory(force=True)
trainer.save_model()
logger.info("Training completed successfully")

54
lpm_kernel/L2/train_for_user.sh Executable file → Normal file

@ -6,6 +6,7 @@ NUM_TRAIN_EPOCHS="3"
CONCURRENCY_THREADS="2"
DATA_SYNTHESIS_MODE="low"
HALF=False
USE_CUDA=False # Default to False, will be overridden by parameter
IS_COT=False
# Process parameters
@ -15,33 +16,75 @@ while [[ "$#" -gt 0 ]]; do
--epochs) NUM_TRAIN_EPOCHS="$2"; shift ;;
--threads) CONCURRENCY_THREADS="$2"; shift ;;
--mode) DATA_SYNTHESIS_MODE="$2"; shift ;;
--cuda)
# Convert string to lowercase for consistent comparison
cuda_value=$(echo "$2" | tr '[:upper:]' '[:lower:]')
if [[ "$cuda_value" == "true" || "$cuda_value" == "1" || "$cuda_value" == "yes" ]]; then
USE_CUDA=True
echo "CUDA enabled by user configuration."
else
USE_CUDA=False
echo "CUDA disabled by user configuration."
fi
shift ;;
--is_cot) IS_COT="$2"; shift ;;
*) echo "Unknown parameter: $1"; exit 1 ;;
esac
shift
done
# Explicitly log the CUDA setting passed from the command line
echo "CUDA parameter received: $USE_CUDA"
# Verify CUDA availability if enabled
if [[ "$USE_CUDA" == "True" ]]; then
# Set CUDA environment variables to ensure PyTorch detects GPU
export CUDA_VISIBLE_DEVICES=0
echo "CUDA_VISIBLE_DEVICES set to 0"
# Set CUDA_LAUNCH_BLOCKING to 0 for async operations (better performance)
export CUDA_LAUNCH_BLOCKING=0
echo "CUDA_LAUNCH_BLOCKING set to 0 for better performance"
else
# Explicitly disable CUDA
export CUDA_VISIBLE_DEVICES=""
echo "CUDA_VISIBLE_DEVICES explicitly disabled"
fi
# Log the parameters being used
echo "Using training parameters:"
echo " Learning rate: $LEARNING_RATE"
echo " Number of epochs: $NUM_TRAIN_EPOCHS"
echo " Concurrency threads: $CONCURRENCY_THREADS"
echo " Data synthesis mode: $DATA_SYNTHESIS_MODE"
echo " Use CUDA: $USE_CUDA"
echo " Is chain of thought: $IS_COT"
# If concurrency threads are set, configure related environment variables
if [ "$CONCURRENCY_THREADS" != "1" ]; then
# Limit the number of parallel threads to avoid memory issues
export OMP_NUM_THREADS=$CONCURRENCY_THREADS
export MKL_NUM_THREADS=$CONCURRENCY_THREADS
export NUMEXPR_NUM_THREADS=$CONCURRENCY_THREADS
# Add torch-specific threading controls
export PYTORCH_CUDA_ALLOC_CONF=max_split_size_mb:128
echo "Set thread environment variables to $CONCURRENCY_THREADS"
fi
# Add BF16 option based on the platform
if [ "$PLATFORM" != "apple" ]; then
# Add BF16 option based on the platform and CUDA availability
if [ "$PLATFORM" != "apple" ] && [ "$USE_CUDA" == "True" ]; then
HALF=True
echo "Enabling BF16 half precision for non-Apple platform with CUDA"
else
echo "Using standard precision (not using BF16)"
fi
# Print environment for debugging
echo "Environment configuration:"
echo " CUDA_VISIBLE_DEVICES: ${CUDA_VISIBLE_DEVICES}"
echo " PYTORCH_CUDA_ALLOC_CONF: ${PYTORCH_CUDA_ALLOC_CONF}"
echo " Using half precision: ${HALF}"
# Execute training script with parameters from environment variables
python lpm_kernel/L2/train.py \
--seed 42 \
@ -70,7 +113,7 @@ python lpm_kernel/L2/train.py \
--per_device_train_batch_size 2 \
--gradient_accumulation_steps $CONCURRENCY_THREADS \
--gradient_checkpointing True \
--use_reentrant True \
--use_reentrant False \
--use_peft_lora True \
--lora_r 8 \
--lora_alpha 16 \
@ -78,6 +121,7 @@ python lpm_kernel/L2/train.py \
--lora_target_modules "all-linear" \
--use_4bit_quantization False \
--use_nested_quant False \
--bnb_4bit_compute_dtype "bfloat16"\
--is_cot $IS_COT
--bnb_4bit_compute_dtype "bfloat16" \
--is_cot $IS_COT \
--use_cuda $USE_CUDA
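For reference, a hedged sketch of invoking this script with the flags handled above; the flag names come from the case statement, while the values and the direct subprocess call are illustrative (the actual route goes through ScriptRunner, shown later in this diff):
import subprocess

cmd = [
    "bash", "lpm_kernel/L2/train_for_user.sh",
    "--lr", "2e-4",
    "--epochs", "3",
    "--threads", "2",
    "--mode", "low",
    "--cuda", "True",     # parsed case-insensitively above ("true"/"1"/"yes" also enable CUDA)
    "--is_cot", "False",
]
subprocess.run(cmd, check=True)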


@ -33,12 +33,171 @@ from lpm_kernel.L2.training_prompt import (
MEMORY_COT_PROMPT,
)
# Add import for memory manager
from .memory_manager import get_memory_manager
import gc
import requests
# Initialize the logger
logger = logging.getLogger(__name__)
# Default chat templates for different model formats
DEFAULT_CHATML_CHAT_TEMPLATE = "{% for message in messages %}\n{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% if loop.last and add_generation_prompt %}{{'<|im_start|>assistant\n' }}{% endif %}{% endfor %}"
DEFAULT_ZEPHYR_CHAT_TEMPLATE = "{% for message in messages %}\n{% if message['role'] == 'user' %}\n{{ '<|user|>\n' + message['content'] + eos_token }}\n{% elif message['role'] == 'system' %}\n{{ '<|system|>\n' + message['content'] + eos_token }}\n{% elif message['role'] == 'assistant' %}\n{{ '<|assistant|>\n' + message['content'] + eos_token }}\n{% endif %}\n{% if loop.last and add_generation_prompt %}\n{{ '<|assistant|>' }}\n{% endif %}\n{% endfor %}"
def release_ollama_models_early():
"""Release Ollama models from memory as early as possible before model loading.
This function calls the Ollama API with the keep_alive=0 parameter to properly unload models
and free up VRAM before the training model is loaded.
"""
try:
from lpm_kernel.api.services.user_llm_config_service import UserLLMConfigService
import json
logger.info("Early release of Ollama models to free up VRAM for training")
# Get current user LLM config to identify models to release
user_llm_config_service = UserLLMConfigService()
user_llm_config = user_llm_config_service.get_available_llm()
if not user_llm_config:
logger.warning("No user LLM configuration found. Skipping Ollama model release.")
return
# Track which models have been released
released_models = set()
def get_generate_url(base_endpoint):
"""Helper function to get the API endpoint for unloading models"""
if not base_endpoint:
return None
base_url = base_endpoint.rstrip("/")
# Convert to API base URL if needed (may be v1 format or direct ollama format)
if "/v1/" in base_url:
api_base = base_url.split("/v1/")[0]
return f"{api_base}/api/generate"
else:
# Check if this is a non-localhost Ollama instance
if "ollama" in base_url.lower():
if "localhost" in base_url or "127.0.0.1" in base_url:
return "http://localhost:11434/api/generate"
else:
# Extract the base URL and use it
parts = base_url.split("//")
if len(parts) > 1:
host = parts[1].split("/")[0]
return f"{parts[0]}//{host}/api/generate"
# Default ollama endpoint as fallback
return "http://localhost:11434/api/generate"
# Release chat model if using Ollama
if "ollama" in user_llm_config.chat_endpoint.lower() and user_llm_config.chat_model_name:
chat_model = user_llm_config.chat_model_name
generate_url = get_generate_url(user_llm_config.chat_endpoint)
if not generate_url:
logger.warning(f"Could not determine API endpoint for chat model: {chat_model}")
else:
logger.info(f"Releasing Ollama chat model: {chat_model} via {generate_url}")
try:
# Set up headers with API key if provided
headers = {
"Content-Type": "application/json"
}
if user_llm_config.chat_api_key:
headers["Authorization"] = f"Bearer {user_llm_config.chat_api_key}"
# Use the proper generate endpoint with keep_alive=0 to unload
payload = {
"model": chat_model,
"keep_alive": 0,
"prompt": " " # Minimal prompt needed for request
}
unload_response = requests.post(
generate_url,
headers=headers,
data=json.dumps(payload),
timeout=30 # Add timeout to prevent hanging
)
if unload_response.status_code < 300:
logger.info(f"✅ Successfully unloaded chat model: {chat_model}")
released_models.add(chat_model)
else:
logger.warning(f"Failed to unload model via API: {unload_response.status_code} - {unload_response.text}")
except Exception as e:
logger.warning(f"Failed to release chat model {chat_model}: {str(e)}")
# Release embedding model if different from chat model and using Ollama
if (user_llm_config.embedding_model_name and
"ollama" in user_llm_config.embedding_endpoint.lower() and
user_llm_config.embedding_model_name != user_llm_config.chat_model_name and
user_llm_config.embedding_model_name not in released_models):
embedding_model = user_llm_config.embedding_model_name
generate_url = get_generate_url(user_llm_config.embedding_endpoint)
if not generate_url:
logger.warning(f"Could not determine API endpoint for embedding model: {embedding_model}")
else:
logger.info(f"Releasing Ollama embedding model: {embedding_model} via {generate_url}")
try:
# Set up headers with API key if provided
headers = {
"Content-Type": "application/json"
}
if user_llm_config.embedding_api_key:
headers["Authorization"] = f"Bearer {user_llm_config.embedding_api_key}"
# Use the proper generate endpoint with keep_alive=0 to unload
payload = {
"model": embedding_model,
"keep_alive": 0,
"prompt": " " # Minimal prompt needed for request
}
unload_response = requests.post(
generate_url,
headers=headers,
data=json.dumps(payload),
timeout=30 # Add timeout to prevent hanging
)
if unload_response.status_code < 300:
logger.info(f"✅ Successfully unloaded embedding model: {embedding_model}")
released_models.add(embedding_model)
else:
logger.warning(f"Failed to unload model via API: {unload_response.status_code} - {unload_response.text}")
except Exception as e:
logger.warning(f"Failed to release embedding model {embedding_model}: {str(e)}")
# Final cleanup and verification
if torch.cuda.is_available():
torch.cuda.empty_cache()
memory_info = get_memory_manager().get_memory_info()
vram_used = memory_info.get('vram_used_gb', 0)
vram_total = memory_info.get('vram_total_gb', 1)
logger.info(f"VRAM after early model release: {vram_used:.2f}GB / {vram_total:.2f}GB ({vram_used/vram_total*100:.1f}%)")
if released_models:
logger.info(f"Early release completed for {len(released_models)} Ollama models: {', '.join(released_models)}")
else:
logger.info("No Ollama models were released early")
except Exception as e:
import traceback
logger.error(f"Error in early Ollama model release: {str(e)}")
logger.error(traceback.format_exc())
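Stripped to its essence, the unload call above is a single generate request with keep_alive=0; a hedged standalone sketch (the endpoint is the default local Ollama address and the model name is a placeholder):
import requests

resp = requests.post(
    "http://localhost:11434/api/generate",  # default local Ollama endpoint
    json={"model": "qwen2.5:0.5b", "keep_alive": 0, "prompt": " "},  # placeholder model name
    timeout=30,
)
# Add an Authorization header above if the endpoint requires an API key.
resp.raise_for_status()  # the code above treats any 2xx response as a successful unload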
def count_tokens_from_string(string: str, encoding_name: str = "cl100k_base") -> int:
"""Returns the number of tokens in a text string using a specified encoding.
@ -105,21 +264,31 @@ class ZephyrSpecialTokens(str, Enum):
"""Returns a list of all special tokens."""
return [token.value for token in cls]
def create_and_prepare_model(args, data_args, training_args):
def create_and_prepare_model(args, data_args, training_args, model_kwargs=None):
"""Creates and prepares a model for training.
Args:
args: Model arguments containing model configuration.
data_args: Data arguments for training.
training_args: Training configuration arguments.
model_kwargs: Additional kwargs to pass to the model loading function.
Returns:
Tuple of (model, tokenizer) ready for training.
Tuple of (model, peft_config, tokenizer) ready for training.
"""
# Get the memory manager for adaptive loading
memory_manager = get_memory_manager()
model_kwargs = model_kwargs or {}
# Release Ollama models early before we load any models
if torch.cuda.is_available() and args.use_cuda:
release_ollama_models_early()
# Force cleanup memory after releasing Ollama models
memory_manager.cleanup_memory(force=True)
if args.use_unsloth:
from unsloth import FastLanguageModel
bnb_config = None
quant_storage_dtype = None
if (
torch.distributed.is_available()
@ -129,55 +298,142 @@ def create_and_prepare_model(args, data_args, training_args):
):
raise NotImplementedError("Unsloth is not supported in distributed training")
if args.use_4bit_quantization:
compute_dtype = getattr(torch, args.bnb_4bit_compute_dtype)
quant_storage_dtype = getattr(torch, args.bnb_4bit_quant_storage_dtype)
# Clean up memory before loading model
memory_manager.cleanup_memory()
# Check for CUDA availability and use it if enabled
cuda_available = torch.cuda.is_available()
use_cuda_requested = args.use_cuda
device = "cpu"
bnb_config = BitsAndBytesConfig(
load_in_4bit=args.use_4bit_quantization,
bnb_4bit_quant_type=args.bnb_4bit_quant_type,
bnb_4bit_compute_dtype=compute_dtype,
bnb_4bit_use_double_quant=args.use_nested_quant,
bnb_4bit_quant_storage=quant_storage_dtype,
)
# Always enable memory-adaptive loading by default (device_map="auto"), unless CUDA is off
if cuda_available and use_cuda_requested:
device = "cuda"
model_kwargs["device_map"] = "auto"
else:
if use_cuda_requested and not cuda_available:
logger.warning("⚠️ CUDA was requested but is not available on this system. Falling back to CPU.")
elif cuda_available and not use_cuda_requested:
logger.info(" CUDA is available but not requested. Using CPU as specified.")
else:
logger.info(" CUDA is not available. Using CPU for training.")
# Explicitly remove device_map to force CPU-only
if "device_map" in model_kwargs:
model_kwargs.pop("device_map")
logger.info("Using CPU for model training and inference.")
if compute_dtype == torch.float16 and args.use_4bit_quantization:
major, _ = torch.cuda.get_device_capability()
if major >= 8:
logging.info("=" * 80)
logging.info(
"Your GPU supports bfloat16, you can accelerate training with the argument --bf16"
)
logging.info("=" * 80)
# Configure quantization based on available memory
# Use model_kwargs quantization_config if provided, otherwise build it
if "quantization_config" not in model_kwargs:
if args.use_4bit_quantization:
compute_dtype = getattr(torch, args.bnb_4bit_compute_dtype)
quant_storage_dtype = getattr(torch, args.bnb_4bit_quant_storage_dtype)
bnb_config = BitsAndBytesConfig(
load_in_4bit=args.use_4bit_quantization,
bnb_4bit_quant_type=args.bnb_4bit_quant_type,
bnb_4bit_compute_dtype=compute_dtype,
bnb_4bit_use_double_quant=args.use_nested_quant,
bnb_4bit_quant_storage=quant_storage_dtype,
)
model_kwargs["quantization_config"] = bnb_config
if compute_dtype == torch.float16 and args.use_4bit_quantization:
major, _ = torch.cuda.get_device_capability() if torch.cuda.is_available() else (0, 0)
if major >= 8:
logger.info("Your GPU supports bfloat16, you can accelerate training with the argument --bf16")
elif args.use_8bit_quantization:
bnb_config = BitsAndBytesConfig(load_in_8bit=args.use_8bit_quantization)
model_kwargs["quantization_config"] = bnb_config
if args.use_unsloth:
# Load model
model, _ = FastLanguageModel.from_pretrained(
model_name=args.model_name_or_path,
max_seq_length=data_args.max_seq_length,
dtype=None,
load_in_4bit=args.use_4bit_quantization,
)
else:
if os.getenv("PLATFORM") != "apple":
model = AutoModelForCausalLM.from_pretrained(
args.model_name_or_path,
quantization_config=bnb_config,
trust_remote_code=True,
attn_implementation="flash_attention_2" if args.use_flash_attn else "eager",
torch_dtype=torch.bfloat16
)
# Load model with memory-adaptive approach
model = None
tokenizer = None
peft_config = None
try:
# First try loading the model with the requested configuration
if args.use_unsloth:
# Load model with Unsloth using memory manager
unsloth_kwargs = {
"model_name": args.model_name_or_path,
"max_seq_length": data_args.max_seq_length,
"dtype": None,
"load_in_4bit": args.use_4bit_quantization,
"load_in_8bit": args.use_8bit_quantization,
"trust_remote_code": True,
"device_map": model_kwargs.get("device_map", "auto") if args.use_cuda and torch.cuda.is_available() else None,
}
logger.info(f"Loading model with Unsloth with parameters: {unsloth_kwargs}")
model, _ = FastLanguageModel.from_pretrained(**unsloth_kwargs)
else:
# Load model with standard approach
load_kwargs = {
"trust_remote_code": True,
}
# Use any provided model_kwargs
load_kwargs.update(model_kwargs)
if "attn_implementation" not in load_kwargs and args.use_flash_attn:
load_kwargs["attn_implementation"] = "flash_attention_2"
# Set default device_map if not specified
if "device_map" not in load_kwargs and args.use_cuda and torch.cuda.is_available():
load_kwargs["device_map"] = "auto"
logger.info(f"Loading model with parameters: {load_kwargs}")
model = AutoModelForCausalLM.from_pretrained(args.model_name_or_path, **load_kwargs)
except (RuntimeError, torch.cuda.OutOfMemoryError, MemoryError) as e:
# If standard approaches fail, try progressive fallbacks
logger.warning(f"Failed to load model with standard settings: {str(e)}")
logger.info("Falling back to adaptive model loading...")
# First cleanup to ensure maximum memory available
memory_manager.cleanup_memory(force=True)
try:
# Try with simpler configuration - float16 instead of bfloat16
logger.info("Attempting to load with float16 precision...")
model = AutoModelForCausalLM.from_pretrained(
args.model_name_or_path,
quantization_config=bnb_config,
device_map="auto" if torch.cuda.is_available() and args.use_cuda else None,
torch_dtype=torch.float16 if torch.cuda.is_available() and args.use_cuda else None,
trust_remote_code=True
)
except (RuntimeError, torch.cuda.OutOfMemoryError, MemoryError) as e:
# If that fails too, try even more conservative loading
logger.warning(f"Float16 loading failed: {str(e)}")
memory_manager.cleanup_memory(force=True)
try:
# Try with CPU offloading and gradual loading
logger.info("Attempting most conservative loading with CPU offloading...")
model = AutoModelForCausalLM.from_pretrained(
args.model_name_or_path,
device_map="auto",
offload_folder="offload_folder",
offload_state_dict=True,
torch_dtype=torch.float16 if torch.cuda.is_available() else None,
trust_remote_code=True,
low_cpu_mem_usage=True
)
except Exception as e:
# If all fallbacks fail, it's a fatal error
logger.error(f"All adaptive loading approaches failed: {str(e)}")
raise RuntimeError(f"Failed to load model with any memory adaptation technique: {str(e)}")
peft_config = None
chat_template = None
# If still not loaded, it's a fatal error
if model is None:
raise RuntimeError("Failed to load model with any memory adaptation technique")
# Apply memory optimization to model
model = memory_manager.optimize_model_for_training(model)
# Configure LoRA if requested
if args.use_peft_lora and not args.use_unsloth:
peft_config = LoraConfig(
lora_alpha=args.lora_alpha,
@ -190,6 +446,7 @@ def create_and_prepare_model(args, data_args, training_args):
else args.lora_target_modules,
)
# Load tokenizer - tokenizers are usually small and don't need memory management
special_tokens = None
chat_template = None
if args.chat_template_format == "chatml":
@ -214,22 +471,43 @@ def create_and_prepare_model(args, data_args, training_args):
tokenizer = AutoTokenizer.from_pretrained(
args.model_name_or_path, trust_remote_code=True
)
tokenizer.pad_token = tokenizer.eos_token
# Make sure pad_token is set
if tokenizer.pad_token is None:
tokenizer.pad_token = tokenizer.eos_token
# Apply Unsloth LoRA if requested and check memory status
if args.use_unsloth:
# Do model patching and add fast LoRA weights
model = FastLanguageModel.get_peft_model(
model,
lora_alpha=args.lora_alpha,
lora_dropout=args.lora_dropout,
r=args.lora_r,
target_modules=args.lora_target_modules.split(",")
if args.lora_target_modules != "all-linear"
else args.lora_target_modules,
use_gradient_checkpointing=training_args.gradient_checkpointing,
random_state=training_args.seed,
max_seq_length=data_args.max_seq_length,
)
try:
# Clean up first
memory_manager.cleanup_memory()
# Apply LoRA with memory monitoring
model = FastLanguageModel.get_peft_model(
model,
lora_alpha=args.lora_alpha,
lora_dropout=args.lora_dropout,
r=args.lora_r,
target_modules=args.lora_target_modules.split(",")
if args.lora_target_modules != "all-linear"
else args.lora_target_modules,
use_gradient_checkpointing=training_args.gradient_checkpointing,
random_state=training_args.seed,
max_seq_length=data_args.max_seq_length,
)
except Exception as e:
logger.error(f"Failed to apply Unsloth LoRA: {str(e)}")
# If Unsloth fails, we might need to fall back to standard training
if args.use_cuda and torch.cuda.is_available():
logger.warning("Low VRAM detected, moving model to CPU")
model = model.cpu()
torch.cuda.empty_cache()
# Final memory status check
memory_info = memory_manager.get_memory_info()
logger.info(f"Memory after model preparation: RAM: {memory_info['ram_used_gb']:.2f}GB / {memory_info['ram_total_gb']:.2f}GB")
if torch.cuda.is_available():
logger.info(f"VRAM: {memory_info.get('vram_used_gb', 0):.2f}GB / {memory_info.get('vram_total_gb', 0):.2f}GB")
return model, peft_config, tokenizer
@ -343,11 +621,11 @@ def setup_logger(log_path, logger_name="download_logger"):
return logger
def save_hf_model(model_name="Qwen2.5-0.5B-Instruct", log_file_path=None) -> str:
def save_hf_model(model_name=None, log_file_path=None) -> str:
"""Saves a Hugging Face model locally.
Args:
model_name: Name of the model to save. Defaults to "Qwen2.5-0.5B-Instruct".
model_name: Name of the model to save. If None, will attempt to get from config.
log_file_path: Path to save download logs. If None, uses default path.
Returns:
@ -360,25 +638,39 @@ def save_hf_model(model_name="Qwen2.5-0.5B-Instruct", log_file_path=None) -> str
# Setup logging
logger = setup_logger(log_file_path)
# If no model name provided, attempt to get from training configuration
if not model_name:
try:
from lpm_kernel.configs.config import Config
config = Config()
model_name = config.get("training", {}).get("model_name")
if not model_name:
logger.warning("No model name provided and none found in config. Using Qwen2.5-0.5B-Instruct as fallback.")
model_name = "Qwen2.5-0.5B-Instruct"
except Exception as e:
logger.warning(f"Failed to get model name from config: {str(e)}. Using Qwen2.5-0.5B-Instruct as fallback.")
model_name = "Qwen2.5-0.5B-Instruct"
base_dir = os.path.join(os.getcwd(), "resources/L2/base_models")
# Normalize model name and check for path traversal attempts
normalized_model_name = os.path.normpath(model_name)
if ".." in normalized_model_name or normalized_model_name.startswith("/"):
raise ValueError("Invalid model name")
raise ValueError("Invalid model name - potential path traversal attempt")
# Prepare save path
save_path = os.path.join(base_dir, normalized_model_name)
os.makedirs(save_path, exist_ok=True)
from huggingface_hub import list_repo_files, configure_http_backend
import requests
from huggingface_hub import list_repo_files, configure_http_backend, hf_hub_download
from tqdm import tqdm
from tqdm.contrib.concurrent import thread_map
import shutil
from concurrent.futures import ThreadPoolExecutor
import traceback
# Set a higher timeout, but remove the unsupported pool_size parameter
# Configure HTTP backend more simply
try:
# Try using the timeout parameter to configure
configure_http_backend(timeout=100.0)
except TypeError:
# If the timeout parameter is also not supported, do not use any parameters
except Exception as e:
logger.warning(f"Failed to configure HTTP backend with timeout: {e}")
try:
configure_http_backend()
except Exception as e:
@ -391,31 +683,31 @@ def save_hf_model(model_name="Qwen2.5-0.5B-Instruct", log_file_path=None) -> str
hf_model_name = f"Qwen/{model_name}"
try:
# First get the list of all files in the repository
# Get list of files to download
files = list_repo_files(hf_model_name)
logger.info(f"Found {len(files)} files to download from {hf_model_name}")
# Define a function for downloading a single file and recording progress
def download_file_with_progress(file_info):
filename, file_path = file_info
def download_file_with_progress(filename):
"""Download a single file from the model repository"""
local_file_path = os.path.join(save_path, filename)
# Create directories if they don't exist
os.makedirs(os.path.dirname(local_file_path), exist_ok=True)
# Check if file already exists and is not empty
if os.path.exists(local_file_path) and os.path.getsize(local_file_path) > 0:
logger.info(f"File already exists: {filename}")
return True
try:
# Build the download URL
url = f"https://huggingface.co/{hf_model_name}/resolve/main/{filename}"
# Target file path
local_file_path = os.path.join(save_path, filename)
os.makedirs(os.path.dirname(local_file_path), exist_ok=True)
# Check if the file already exists
if os.path.exists(local_file_path):
logger.info(f"File already exists: {filename}")
return filename, True
# Get file size
response = requests.head(url)
total_size = int(response.headers.get('content-length', 0))
# If the size cannot be obtained, set a default value or do not display the percentage
# If the size cannot be obtained, set a default value
if total_size == 0:
logger.info(f"Starting download of file: {filename} (Size unknown)")
else:
@ -423,7 +715,7 @@ def save_hf_model(model_name="Qwen2.5-0.5B-Instruct", log_file_path=None) -> str
# Create the file to write to
with open(local_file_path, 'wb') as f:
# Create a progress bar, if the total size is unknown, set it to None
# Create a progress bar
progress_bar = tqdm(
total=total_size if total_size > 0 else None,
unit='iB',
@ -432,84 +724,87 @@ def save_hf_model(model_name="Qwen2.5-0.5B-Instruct", log_file_path=None) -> str
disable=False
)
# Define the progress callback function
# Define progress callback
def progress_callback(current, total):
# Update the progress bar
progress_bar.update(current - progress_bar.n)
# Record the log every 1MB (or a value close to 1MB)
if current % (1024 * 1024) < 8192: # Record every 1MB
# Ensure total is greater than 0 before calculating the percentage
if total and total > 0: # Use and to ensure total is not None and greater than 0
# Log progress every ~1MB
if current % (1024 * 1024) < 8192:
if total and total > 0:
percent = current / total * 100
logger.info(f"File {filename}: Downloaded {current/1024/1024:.2f} MB / {total/1024/1024:.2f} MB ({percent:.2f}%)")
else:
# If the total size is unknown or 0, only show the downloaded size
logger.info(f"File {filename}: Downloaded {current/1024/1024:.2f} MB (total size unknown)")
# Use the request library to download the file and update the progress
# Download file with progress tracking
response = requests.get(url, stream=True)
if response.status_code == 200:
downloaded = 0
# Check if the response contains the Content-Length header information
# Update total size if needed
actual_total = int(response.headers.get('content-length', 0))
if actual_total > 0 and (total_size == 0 or total_size != actual_total):
# If the HEAD request did not return the correct size, but the GET request did, then update the total size
total_size = actual_total
logger.info(f"Updated file size for {filename}: {total_size / 1024 / 1024:.2f} MB")
progress_bar.total = total_size
progress_bar.refresh()
for chunk in response.iter_content(chunk_size=8192):
if chunk: # Filter out empty chunks that keep the connection alive
if chunk:
f.write(chunk)
downloaded += len(chunk)
progress_callback(downloaded, total_size)
progress_bar.close()
logger.info(f"Completed download of file: {filename}")
return filename, True
return True
else:
logger.error(f"Failed to download {filename}: HTTP status {response.status_code}")
return filename, False
failed_files.append(filename)
return False
except Exception as e:
logger.error(f"Error downloading {filename}: {str(e)}")
return filename, False
logger.error(f"Failed to download {filename}: {str(e)}")
failed_files.append(filename)
return False
# Keep track of failed files for potential retry
failed_files = []
# Create a list of file path information
file_infos = [(filename, os.path.join(save_path, filename)) for filename in files]
# Use a thread pool to download all files in parallel
logger.info(f"Starting parallel download of {len(file_infos)} files")
# Use a thread pool to download all files in parallel
from concurrent.futures import ThreadPoolExecutor
# Limit the number of concurrent requests to avoid too many requests
max_workers = min(10, len(file_infos))
results = []
# Use ThreadPoolExecutor for parallel downloads with controlled concurrency
max_workers = min(8, len(files)) # Limit concurrent downloads to avoid overloading
successful_downloads = 0
with ThreadPoolExecutor(max_workers=max_workers) as executor:
# Submit all download tasks
future_to_file = {executor.submit(download_file_with_progress, file_info): file_info[0]
for file_info in file_infos}
# Wait for all tasks to complete and collect results
for future in tqdm(future_to_file, desc="Overall Progress", unit="file"):
filename = future_to_file[future]
try:
# Create a progress bar for overall download progress
with tqdm(total=len(files), desc="Downloading model files") as progress:
futures = [executor.submit(download_file_with_progress, file) for file in files]
for future in futures:
result = future.result()
results.append(result)
logger.info(f"Finished processing {filename}")
except Exception as exc:
logger.error(f'{filename} generated an exception: {exc}')
results.append((filename, False))
if result:
successful_downloads += 1
progress.update(1)
# Report progress periodically
if progress.n % 5 == 0 or progress.n == len(files):
logger.info(f"Downloaded {progress.n}/{len(files)} files ({successful_downloads} successful)")
# Check the download results
success_count = sum(1 for _, success in results if success)
logger.info(f"Download completed. Successfully downloaded {success_count}/{len(files)} files.")
# Handle any failed downloads
if failed_files:
logger.warning(f"Failed to download {len(failed_files)} files. First few: {failed_files[:5]}")
# If most files failed, there might be an issue with the model repository
if len(failed_files) > len(files) * 0.5:
logger.error(f"More than 50% of files failed to download. There might be an issue with the model repository.")
raise RuntimeError("Too many files failed to download")
# If critical files failed (like model weights or config), warn specifically
critical_patterns = ['model.safetensors', 'config.json', 'tokenizer.json']
critical_failed = [f for f in failed_files if any(pattern in f for pattern in critical_patterns)]
if critical_failed:
logger.error(f"Failed to download critical files: {critical_failed}")
raise RuntimeError(f"Failed to download critical model files: {', '.join(critical_failed)}")
# Record the download completion information
try:
@ -527,15 +822,14 @@ def save_hf_model(model_name="Qwen2.5-0.5B-Instruct", log_file_path=None) -> str
raise
except KeyboardInterrupt:
logger.warning(f"Download interrupted by user for model: {model_name}")
# Clean up partial downloads
raise
except Exception as e:
# Log any errors that occur
logger.error(f"Error downloading model: {str(e)}")
logger.error(traceback.format_exc())
raise
return save_path
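A hedged usage sketch for the updated downloader; the model name matches the fallback used above, and the import path assumes this file is lpm_kernel/L2/utils.py, which the train.py imports earlier in this diff suggest:
from lpm_kernel.L2.utils import save_hf_model

# Downloads Qwen/Qwen2.5-0.5B-Instruct into resources/L2/base_models/ and returns that path.
local_dir = save_hf_model("Qwen2.5-0.5B-Instruct")
print(local_dir)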
def format_timestr(utc_time_str):
"""Formats a UTC time string to a more readable format.


@ -349,3 +349,56 @@ def get_document_embedding(document_id: int):
return jsonify(
APIResponse.error(message=f"Error getting document embedding: {str(e)}")
)
@document_bp.route("/documents/verify-embeddings", methods=["GET"])
def verify_document_embeddings():
"""Verify all document embeddings and return statistics"""
try:
verbose = request.args.get("verbose", "").lower() == "true"
results = document_service.verify_document_embeddings(verbose=verbose)
return jsonify(APIResponse.success(data=results))
except Exception as e:
logger.error(f"Error verifying document embeddings: {str(e)}", exc_info=True)
return jsonify(APIResponse.error(message=f"Error verifying document embeddings: {str(e)}"))
@document_bp.route("/documents/repair", methods=["POST"])
def repair_documents():
"""Repair documents with missing analysis and embeddings"""
try:
# First, fix missing document analysis (summaries and insights)
fixed_analysis_count = document_service.fix_missing_document_analysis()
# Get verification results after fixing analysis
verification_results = document_service.verify_document_embeddings(verbose=False)
# Process documents with missing embeddings
documents_fixed = 0
for doc in document_service._repository.list():
embedding = document_service.get_document_embedding(doc.id)
if doc.raw_content and embedding is None:
try:
document_service.process_document_embedding(doc.id)
# Also process chunk embeddings
document_service.generate_document_chunk_embeddings(doc.id)
documents_fixed += 1
except Exception as e:
logger.error(f"Error processing document {doc.id} embedding: {str(e)}")
# Get final verification results
final_results = document_service.verify_document_embeddings(verbose=False)
return jsonify(APIResponse.success(
data={
"analysis_fixed": fixed_analysis_count,
"embeddings_fixed": documents_fixed,
"initial_state": verification_results,
"final_state": final_results
}
))
except Exception as e:
logger.error(f"Error repairing documents: {str(e)}", exc_info=True)
return jsonify(APIResponse.error(message=f"Error repairing documents: {str(e)}"))
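A hedged usage sketch for the two routes added above; BASE stands in for whatever host and prefix document_bp is mounted under, which this diff does not show:
import requests

BASE = "http://localhost:8002"  # placeholder host/port and blueprint prefix

# Verify embeddings (add verbose=true for per-document detail)
stats = requests.get(f"{BASE}/documents/verify-embeddings", params={"verbose": "true"}).json()

# Repair missing analysis and embeddings
report = requests.post(f"{BASE}/documents/repair").json()
print(stats, report)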


@ -3,6 +3,8 @@ import logging
import os
import time
import sys
import torch # Add torch import for CUDA detection
import traceback
from dataclasses import asdict
from flask import Blueprint, jsonify, Response, request
@ -352,6 +354,19 @@ def train2():
model_name = data["model_name"]
paths = get_model_paths(model_name)
# Get optional parameters with defaults
learning_rate = data.get("learning_rate", 2e-4)
num_train_epochs = data.get("number_of_epochs", 3)
concurrency_threads = data.get("concurrency_threads", 2)
data_synthesis_mode = data.get("data_synthesis_mode", "low")
use_cuda = data.get("use_cuda", False)
# Convert use_cuda to string "True" or "False" for the shell script
use_cuda_str = "True" if use_cuda else "False"
logger.info(f"Training configuration: learning_rate={learning_rate}, epochs={num_train_epochs}, "
f"threads={concurrency_threads}, mode={data_synthesis_mode}, use_cuda={use_cuda} ({use_cuda_str})")
# Check if model exists
if not os.path.exists(paths["base_path"]):
return jsonify(APIResponse.error(
@ -385,29 +400,61 @@ def train2():
script_path = os.path.join(os.getcwd(), "lpm_kernel/L2/train_for_user.sh")
# Build command arguments
cmd_args = [
"--lr", str(learning_rate),
"--epochs", str(num_train_epochs),
"--threads", str(concurrency_threads),
"--mode", str(data_synthesis_mode),
"--cuda", use_cuda_str # Use the properly formatted string
]
# Start training
import threading
_training_thread = threading.Thread(
target=start_training,
args=(script_path, log_path),
target=start_training_with_args,
args=(script_path, log_path, cmd_args),
daemon=True
)
_training_thread.start()
return jsonify(APIResponse.success(
data={
"status": "training_started",
"model_name": model_name,
"log_path": log_path,
"personal_dir": paths["personal_dir"],
"merged_dir": paths["merged_dir"]
},
message="Training task started"
message="Training task started successfully"
))
except Exception as e:
error_msg = f"Failed to start training: {str(e)}"
logger.error(error_msg)
return jsonify(APIResponse.error(message=error_msg, code=500))
logger.error(f"Error starting training task: {str(e)}")
traceback.print_exc()
return jsonify(APIResponse.error(message=f"Failed to start training: {str(e)}"))
def start_training_with_args(script_path: str, log_path: str, args: list) -> None:
"""Start training with additional arguments"""
global _training_process
try:
# Convert script path and args to a command
cmd = [script_path] + args
# Use ScriptRunner to execute the script
runner = ScriptRunner(log_path=log_path)
_training_process = runner.execute_script(
script_path=script_path,
script_type="training",
is_python=False, # This is a bash script
args=args
)
logger.info(f"Training process started with args: {args}, process: {_training_process}")
except Exception as e:
logger.error(f"Failed to start training process: {str(e)}")
_training_process = None
raise
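For reference, a hedged sketch of the JSON body the train2 handler above reads; the keys and defaults mirror its data.get(...) calls, and the model name is just an example:
train2_payload = {
    "model_name": "Qwen2.5-0.5B-Instruct",  # required; its base_path must already exist
    "learning_rate": 2e-4,
    "number_of_epochs": 3,
    "concurrency_threads": 2,
    "data_synthesis_mode": "low",
    "use_cuda": True,  # forwarded to train_for_user.sh as --cuda True
}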
@kernel2_bp.route("/merge_weights", methods=["POST"])
@ -550,6 +597,9 @@ def start_llama_server():
return jsonify(APIResponse.error(message="Missing required parameter: model_name", code=400))
model_name = data["model_name"]
# Get optional use_gpu parameter with default value of True
use_gpu = data.get("use_gpu", True)
paths = get_model_paths(model_name)
gguf_path = os.path.join(paths["gguf_dir"], "model.gguf")
@ -564,53 +614,34 @@ def start_llama_server():
server_executable = "llama-server"
server_path = os.path.join(server_path, server_executable)
# Check if service and model file exist
if not os.path.exists(server_path):
return jsonify(APIResponse.error(message="llama-server executable file does not exist", code=400))
# Check if model file exists
if not os.path.exists(gguf_path):
return jsonify(APIResponse.error(
message=f"Model '{model_name}' GGUF file does not exist, please convert model first",
code=400
))
# Check if service is already running
# Start the server using the LocalLLMService with GPU acceleration if requested
success = local_llm_service.start_server(gguf_path, use_gpu=use_gpu)
if not success:
return jsonify(APIResponse.error(message="Failed to start llama-server", code=500))
# Get updated service status
status = local_llm_service.get_server_status()
if status.is_running:
return jsonify(
APIResponse.error(
message=f"llama-server is already running, PID: {status.process_info.pid}",
code=400
)
)
# Build parameters
args = [server_path, "-m", gguf_path, "--port", "8080"]
# Use thread to start service asynchronously
def start_server():
script_executor.execute(
script_path=server_path,
script_type="llama_server",
args=args[1:], # Remove first parameter (executable file path)
shell=False,
)
# Start new thread to run service
from threading import Thread
thread = Thread(target=start_server)
thread.daemon = True
thread.start()
# Return start status immediately
# Return success response with GPU info
gpu_info = "with GPU acceleration" if use_gpu and torch.cuda.is_available() else "with CPU only"
return jsonify(
APIResponse.success(
data={
"model_name": model_name,
"gguf_path": gguf_path,
"status": "starting"
"status": "running" if status.is_running else "starting",
"use_gpu": use_gpu and torch.cuda.is_available(),
"gpu_info": gpu_info
},
message="llama-server service is starting"
message=f"llama-server service started {gpu_info}"
)
)
@ -805,3 +836,31 @@ def chat(body: ChatRequest):
if not getattr(body, 'stream', True): # Default to stream if attribute missing
return jsonify(error_response), 500
return local_llm_service.handle_stream_response(iter([error_response]))
@kernel2_bp.route("/cuda/available", methods=["GET"])
def check_cuda_available():
"""Check if CUDA is available for model training/inference"""
try:
import torch
cuda_available = torch.cuda.is_available()
cuda_info = {}
if cuda_available:
cuda_info = {
"device_count": torch.cuda.device_count(),
"current_device": torch.cuda.current_device(),
"device_name": torch.cuda.get_device_name(0)
}
return jsonify(APIResponse.success(
data={
"cuda_available": cuda_available,
"cuda_info": cuda_info
},
message="CUDA availability check completed"
))
except Exception as e:
error_msg = f"Error checking CUDA availability: {str(e)}"
logger.error(error_msg)
return jsonify(APIResponse.error(message=error_msg, code=500))
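A hedged usage sketch for the CUDA check route above; BASE is a placeholder for the kernel2 blueprint's mount point, and the response shape assumes APIResponse.success wraps the payload under "data":
import requests

BASE = "http://localhost:8002"  # placeholder host/port and blueprint prefix
resp = requests.get(f"{BASE}/cuda/available").json()
print(resp["data"]["cuda_available"], resp["data"].get("cuda_info"))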


@ -24,6 +24,7 @@ def start_process():
number_of_epochs: Number of training epochs (optional)
concurrency_threads: Number of threads for concurrent processing (optional)
data_synthesis_mode: Mode for data synthesis (optional)
use_cuda: Whether to use CUDA for training (optional)
Includes the following steps:
1. Health check
@ -63,6 +64,7 @@ def start_process():
number_of_epochs = data.get("number_of_epochs", None)
concurrency_threads = data.get("concurrency_threads", None)
data_synthesis_mode = data.get("data_synthesis_mode", None)
use_cuda = data.get("use_cuda", False) # Default to False if not provided
is_cot = data.get("is_cot", None)
# Log the received parameters
@ -90,6 +92,7 @@ def start_process():
"number_of_epochs": number_of_epochs,
"concurrency_threads": concurrency_threads,
"data_synthesis_mode": data_synthesis_mode,
"use_cuda": use_cuda, # Make sure to include use_cuda parameter
"is_cot": is_cot
}
@ -113,6 +116,7 @@ def start_process():
"number_of_epochs": number_of_epochs,
"concurrency_threads": concurrency_threads,
"data_synthesis_mode": data_synthesis_mode,
"use_cuda": use_cuda, # Include in response
"is_cot": is_cot
}
)


@ -4,6 +4,7 @@ import logging
import psutil
import time
import subprocess
import torch # Add torch import for CUDA detection
import threading
import queue
from typing import Iterator, Any, Optional, Generator, Dict
@ -38,9 +39,16 @@ class LocalLLMService:
)
return self._client
def start_server(self, model_path: str) -> bool:
def start_server(self, model_path: str, use_gpu: bool = True) -> bool:
"""
Start the llama-server service
Start the llama-server service with GPU acceleration when available
Args:
model_path: Path to the GGUF model file
use_gpu: Whether to use GPU acceleration if available
Returns:
bool: True if server started successfully, False otherwise
"""
try:
# Check if server is already running
@ -49,27 +57,163 @@ class LocalLLMService:
logger.info("LLama server is already running")
return True
# Start server
# Check for CUDA availability if GPU was requested
cuda_available = torch.cuda.is_available() if use_gpu else False
gpu_info = ""
if use_gpu and cuda_available:
gpu_device = torch.cuda.current_device()
gpu_info = f" using GPU: {torch.cuda.get_device_name(gpu_device)}"
gpu_memory = torch.cuda.get_device_properties(gpu_device).total_memory / (1024**3)
logger.info(f"CUDA is available. Using GPU acceleration{gpu_info}")
logger.info(f"CUDA device capabilities: {torch.cuda.get_device_capability(gpu_device)}")
logger.info(f"CUDA memory: {gpu_memory:.2f} GB")
# Pre-initialize CUDA to speed up first inference
logger.info("Pre-initializing CUDA context to speed up first inference")
torch.cuda.init()
torch.cuda.empty_cache()
elif use_gpu and not cuda_available:
logger.warning("CUDA was requested but is not available. Using CPU instead.")
else:
logger.info("Using CPU for inference (GPU not requested)")
# Check for GPU optimization marker
gpu_optimized = False
model_dir = os.path.dirname(model_path)
gpu_marker_path = os.path.join(model_dir, "gpu_optimized.json")
if os.path.exists(gpu_marker_path):
try:
with open(gpu_marker_path, 'r') as f:
gpu_data = json.load(f)
if gpu_data.get("gpu_optimized", False):
gpu_optimized = True
logger.info(f"Found GPU optimization marker created on {gpu_data.get('optimized_on', 'unknown date')}")
except Exception as e:
logger.warning(f"Error reading GPU marker file: {e}")
# Get the correct path to the llama-server executable
base_dir = os.getcwd()
server_path = os.path.join(base_dir, "llama.cpp", "build", "bin", "llama-server")
# For Windows, add .exe extension if needed
if os.name == 'nt' and not server_path.endswith('.exe'):
server_path += '.exe'
# Verify executable exists
if not os.path.exists(server_path):
logger.error(f"llama-server executable not found at: {server_path}")
return False
# Start server with optimal parameters for faster startup
cmd = [
"llama-server",
server_path,
"-m", model_path,
"--host", "0.0.0.0",
"--port", "8000"
"--port", "8080",
"--ctx-size", "2048", # Default context size (adjust based on needs)
"--parallel", "2", # Enable request parallelism
"--cont-batching" # Enable continuous batching
]
# Set up environment with CUDA variables to ensure GPU detection
env = os.environ.copy()
# Add GPU-related parameters if CUDA is available
if cuda_available and use_gpu:
# Force GPU usage with optimal parameters for faster loads
cmd.extend([
"--n-gpu-layers", "999", # Use all layers on GPU
"--tensor-split", "0", # Use the first GPU for all operations
"--main-gpu", "0", # Use GPU 0 as the primary device
"--mlock" # Lock memory to prevent swapping during inference
])
# Set CUDA environment variables to help with GPU detection
env["CUDA_VISIBLE_DEVICES"] = "0" # Force using first GPU
# Ensure comprehensive library paths for CUDA
cuda_lib_paths = [
"/usr/local/cuda/lib64",
"/usr/lib/cuda/lib64",
"/usr/local/lib",
"/usr/lib/x86_64-linux-gnu",
"/usr/lib/wsl/lib" # For Windows WSL environments
]
# Build a comprehensive LD_LIBRARY_PATH
current_ld_path = env.get("LD_LIBRARY_PATH", "")
for path in cuda_lib_paths:
if os.path.exists(path) and path not in current_ld_path:
current_ld_path = f"{path}:{current_ld_path}" if current_ld_path else path
env["LD_LIBRARY_PATH"] = current_ld_path
logger.info(f"Setting LD_LIBRARY_PATH to: {current_ld_path}")
# On Windows, use a different approach for CUDA libraries
if os.name == 'nt':
# Windows typically has CUDA in PATH already if installed
logger.info("Windows system detected, using system CUDA libraries")
else:
# On Linux, try to find CUDA libraries in common locations
for cuda_path in [
# Common CUDA paths
"/usr/local/cuda/lib64",
"/usr/lib/cuda/lib64",
"/usr/local/lib/python3.12/site-packages/nvidia/cuda_runtime/lib",
"/usr/local/lib/python3.10/site-packages/nvidia/cuda_runtime/lib",
]:
if os.path.exists(cuda_path):
# Add CUDA path to library path
env["LD_LIBRARY_PATH"] = f"{cuda_path}:{env.get('LD_LIBRARY_PATH', '')}"
env["CUDA_HOME"] = os.path.dirname(cuda_path)
logger.info(f"Found CUDA at {cuda_path}, setting environment variables")
break
# NOTE: CUDA support and rebuild should be handled at build/setup time (e.g., Docker build or setup script).
# The runtime check and rebuild logic has been removed for efficiency and reliability.
# Ensure llama.cpp is built with CUDA support before running the server if GPU is required.
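Since the rebuild logic now lives at build time, a minimal sketch of one way a setup step could confirm the prebuilt llama-server binary was linked against CUDA, by inspecting its shared-library dependencies with ldd (Linux only; this helper is illustrative, not part of this commit):
import subprocess
def binary_linked_against_cuda(server_path: str) -> bool:
    """Best-effort check: look for CUDA runtime libraries in the binary's ldd output."""
    try:
        result = subprocess.run(
            ["ldd", server_path], capture_output=True, text=True, check=True
        )
    except (OSError, subprocess.CalledProcessError):
        return False  # ldd unavailable (e.g. Windows) or binary could not be inspected
    return any(lib in result.stdout for lib in ("libcudart", "libcublas", "libggml-cuda"))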
# Pre-heat GPU to ensure faster initial response
if cuda_available and use_gpu:
logger.info("Pre-warming GPU to reduce initial latency...")
dummy_tensor = torch.zeros(1, 1).cuda()
del dummy_tensor
torch.cuda.synchronize()
torch.cuda.empty_cache()
logger.info("GPU warm-up complete")
logger.info("Using GPU acceleration for inference with optimized settings")
else:
# If GPU isn't available or supported, optimize for CPU
cmd.extend([
"--threads", str(max(1, os.cpu_count() - 1)), # Use all CPU cores except one
])
logger.info(f"Using CPU-only mode with {max(1, os.cpu_count() - 1)} threads")
logger.info(f"Starting llama-server with command: {' '.join(cmd)}")
process = subprocess.Popen(
cmd,
stdout=subprocess.PIPE,
stderr=subprocess.PIPE,
universal_newlines=True
universal_newlines=True,
env=env
)
# Wait for server to start
time.sleep(2)
# Wait for server to start (longer wait for GPU initialization)
wait_time = 5 if cuda_available and use_gpu else 3
logger.info(f"Waiting {wait_time} seconds for server to start...")
time.sleep(wait_time)
# Check if process started successfully
# Check if process is still running
if process.poll() is None:
logger.info("LLama server started successfully")
# Log initialization success
if cuda_available and use_gpu:
logger.info(f"✅ LLama server started successfully with GPU acceleration{gpu_info}")
else:
logger.info("✅ LLama server started successfully in CPU-only mode")
return True
else:
stdout, stderr = process.communicate()
@ -151,10 +295,15 @@ class LocalLLMService:
Returns: ServerStatus object
"""
try:
base_dir = os.getcwd()
server_path = os.path.join(base_dir, "llama.cpp", "build", "bin", "llama-server")
server_exec_name = os.path.basename(server_path)
for proc in psutil.process_iter(["pid", "name", "cmdline"]):
try:
cmdline = proc.cmdline()
if any("llama-server" in cmd for cmd in cmdline):
# Match either the resolved executable name or a generic "llama-server" reference
if any(server_exec_name in cmd for cmd in cmdline) or any("llama-server" in cmd for cmd in cmdline):
with proc.oneshot():
process_info = ProcessInfo(
pid=proc.pid,

View File

@ -6,6 +6,7 @@ from .api.file_server.handler import FileServerHandler
from .database.migration_manager import MigrationManager
import os
import atexit
import subprocess
def create_app():

View File

@ -7,6 +7,13 @@ from lpm_kernel.configs.logging import get_train_process_logger
logger = get_train_process_logger()
import lpm_kernel.common.strategy.classification as classification
from sentence_transformers import SentenceTransformer
import json
class EmbeddingError(Exception):
"""Custom exception class for embedding-related errors"""
def __init__(self, message, original_error=None):
super().__init__(message)
self.original_error = original_error
class LLMClient:
"""LLM client utility class"""
@ -55,10 +62,8 @@ class LLMClient:
user_llm_config = self.user_llm_config_service.get_available_llm()
if not user_llm_config:
raise Exception("No LLM configuration found")
# Prepare request data
raise EmbeddingError("No LLM configuration found")
try:
# Send request to embedding endpoint
embeddings_array = classification.strategy_classification(user_llm_config, chunked_texts)
@ -81,7 +86,25 @@ class LLMClient:
return embeddings_array
except requests.exceptions.RequestException as e:
raise Exception(f"Failed to get embeddings: {str(e)}")
# Handle request errors
error_msg = f"Request error getting embeddings: {str(e)}"
logger.error(error_msg)
raise EmbeddingError(error_msg, e)
except json.JSONDecodeError as e:
# Handle JSON parsing errors
error_msg = f"Invalid JSON response from embedding API: {str(e)}"
logger.error(error_msg)
raise EmbeddingError(error_msg, e)
except (KeyError, IndexError, ValueError) as e:
# Handle response structure errors
error_msg = f"Invalid response structure from embedding API: {str(e)}"
logger.error(error_msg)
raise EmbeddingError(error_msg, e)
except Exception as e:
# Fallback for any other errors
error_msg = f"Unexpected error getting embeddings: {str(e)}"
logger.error(error_msg, exc_info=True)
raise EmbeddingError(error_msg, e)
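A short sketch of how callers can handle the new exception type and inspect the preserved original error; the failing call below is simulated so the example stands alone:
import logging
logging.basicConfig(level=logging.INFO)
log = logging.getLogger("embedding-example")
def fake_embedding_call():
    """Stand-in for an embedding request that fails downstream."""
    try:
        raise ValueError("malformed response payload")
    except ValueError as e:
        raise EmbeddingError(f"Unexpected error getting embeddings: {e}", e)
try:
    fake_embedding_call()
except EmbeddingError as e:
    cause = type(e.original_error).__name__ if e.original_error else "unknown"
    log.error(f"Embedding failed ({cause}): {e}")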
@property
def chat_credentials(self):

View File

@ -148,6 +148,37 @@ class DocumentService:
self._update_analyze_status_failed(doc.id)
raise
def analyze_document(self, document_id: int) -> DocumentDTO:
"""
Analyze a single document by ID
Args:
document_id (int): ID of document to analyze
Returns:
DocumentDTO: The analyzed document
Raises:
ValueError: If document not found
Exception: If analysis fails
"""
try:
# Get document
document = self._repository.find_one(document_id)
if not document:
raise ValueError(f"Document not found with id: {document_id}")
# Perform analysis
return self._analyze_document(document)
except ValueError as e:
logger.error(f"Document {document_id} not found: {str(e)}")
raise
except Exception as e:
logger.error(f"Error analyzing document {document_id}: {str(e)}", exc_info=True)
self._update_analyze_status_failed(document_id)
raise
def _update_analyze_status_failed(self, doc_id: int) -> None:
"""update status as failed"""
try:
@ -593,6 +624,120 @@ class DocumentService:
logger.error(f"Error deleting file: {str(e)}", exc_info=True)
raise
def fix_missing_document_analysis(self) -> int:
"""Fix documents with missing insights or summaries
Returns:
int: Number of documents fixed
"""
try:
# Find all documents that have analysis issues
docs = self._repository.list()
fixed_count = 0
for doc in docs:
needs_fixing = False
# Check if document needs analysis
if not doc.analyze_status or doc.analyze_status != ProcessStatus.SUCCESS:
needs_fixing = True
logger.info(f"Document {doc.id} needs analysis (status: {doc.analyze_status})")
# Check if document has missing insights or summaries
elif not doc.insight or not doc.summary:
needs_fixing = True
logger.info(f"Document {doc.id} has missing insight or summary")
# Process documents that need fixing
if needs_fixing:
try:
# Process document analysis
self.analyze_document(doc.id)
fixed_count += 1
logger.info(f"Fixed document {doc.id} analysis")
except Exception as e:
logger.error(f"Error fixing document {doc.id} analysis: {str(e)}")
logger.info(f"Fixed {fixed_count} documents with missing analysis")
return fixed_count
except Exception as e:
logger.error(f"Error in fix_missing_document_analysis: {str(e)}")
raise FileProcessingError(f"Failed to fix document analysis: {str(e)}")
def verify_document_embeddings(self, verbose=True) -> Dict:
"""
Verify all document embeddings and return statistics
Args:
verbose (bool): Whether to log detailed information
Returns:
Dict: Statistics about document embeddings
"""
try:
docs = self._repository.list()
results = {
"total_documents": len(docs),
"documents_with_embedding": 0,
"documents_without_embedding": 0,
"documents_with_content": 0,
"documents_without_content": 0,
"documents_with_summary": 0,
"documents_without_summary": 0,
"documents_with_insight": 0,
"documents_without_insight": 0,
"documents_needing_repair": 0,
}
documents_needing_repair = []
for doc in docs:
# Check if document has content
if doc.raw_content:
results["documents_with_content"] += 1
else:
results["documents_without_content"] += 1
# Check if document has summary
if doc.summary:
results["documents_with_summary"] += 1
else:
results["documents_without_summary"] += 1
# Check if document has insight
if doc.insight:
results["documents_with_insight"] += 1
else:
results["documents_without_insight"] += 1
# Check if embeddings exist in ChromaDB
embedding = self.get_document_embedding(doc.id)
if embedding is not None:
results["documents_with_embedding"] += 1
if verbose:
logger.info(f"Document {doc.id}: '{doc.name}' has embedding of dimension {len(embedding)}")
else:
results["documents_without_embedding"] += 1
if verbose:
logger.warning(f"Document {doc.id}: '{doc.name}' missing embedding")
# Check if document needs repair (has content but missing embedding or analysis)
if doc.raw_content and (embedding is None or not doc.summary or not doc.insight):
documents_needing_repair.append(doc.id)
results["documents_needing_repair"] += 1
# Log statistics
logger.info(f"Document embedding verification results: {results}")
if documents_needing_repair and verbose:
logger.info(f"Documents needing repair: {documents_needing_repair}")
return results
except Exception as e:
logger.error(f"Error verifying document embeddings: {str(e)}", exc_info=True)
raise
# create service
document_service = DocumentService()
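A brief usage sketch for the two new maintenance helpers, combining the statistics from verify_document_embeddings with the repair pass (illustrative only, not part of this commit):
stats = document_service.verify_document_embeddings(verbose=False)
if stats["documents_needing_repair"] > 0:
    fixed = document_service.fix_missing_document_analysis()
    print(f"Repaired {fixed} of {stats['documents_needing_repair']} documents flagged for repair")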

View File

@ -418,4 +418,4 @@ class EmbeddingService:
raise
except Exception as e:
logger.error(f"Error searching similar chunks: {str(e)}")
raise
raise

View File

@ -23,6 +23,7 @@ class TrainingParamsManager:
"number_of_epochs": 3,
"concurrency_threads": 2,
"data_synthesis_mode": "low",
"use_cuda": False, # Default to using CUDA when available
"is_cot": False
}

View File

@ -42,7 +42,7 @@ class TrainProcessService:
_instance = None
_initialized = False
def __new__(cls, *args, **kwargs):
if cls._instance is None:
cls._instance = super().__new__(cls)
@ -633,6 +633,7 @@ class TrainProcessService:
num_train_epochs = training_params.get("number_of_epochs")
concurrency_threads = training_params.get("concurrency_threads")
data_synthesis_mode = training_params.get("data_synthesis_mode")
use_cuda = training_params.get("use_cuda", False)
is_cot = training_params.get("is_cot", False)
# Log training parameters
@ -641,6 +642,8 @@ class TrainProcessService:
logger.info(f" Number of epochs: {num_train_epochs}")
logger.info(f" Concurrency threads: {concurrency_threads}")
logger.info(f" Data synthesis mode: {data_synthesis_mode}")
logger.info(f" Use CUDA: {use_cuda}")
logger.info(f" Is CoT: {is_cot}")
# Prepare arguments for the script
# Build command line arguments, need to include script path as the first parameter
@ -650,6 +653,7 @@ class TrainProcessService:
"--epochs", str(num_train_epochs),
"--threads", str(concurrency_threads),
"--mode", str(data_synthesis_mode),
"--cuda", str(use_cuda),
"--is_cot", str(is_cot)
]
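Because use_cuda is forwarded as the string "True" or "False", the receiving training script must convert it back to a boolean explicitly (argparse's type=bool would treat any non-empty string, including "False", as truthy). A minimal sketch of that conversion, with argument names mirroring the command above; the training script's actual parser is not shown in this commit:
import argparse
def str2bool(value: str) -> bool:
    """Interpret forwarded 'True'/'False' strings as booleans."""
    return value.strip().lower() in ("1", "true", "yes", "y")
parser = argparse.ArgumentParser()
parser.add_argument("--cuda", type=str2bool, default=False)
parser.add_argument("--is_cot", type=str2bool, default=False)
args = parser.parse_args(["--cuda", "True", "--is_cot", "False"])
assert args.cuda is True and args.is_cot is False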

69
scripts/prompt_cuda.bat Normal file
View File

@ -0,0 +1,69 @@
@echo off
REM Script to prompt user for CUDA support preference
echo === CUDA Support Selection ===
echo.
echo Do you want to build with NVIDIA GPU (CUDA) support?
echo This requires an NVIDIA GPU and proper NVIDIA Docker runtime configuration.
echo.
set /p choice="Build with CUDA support? (y/n): "
if /i "%choice%"=="y" goto cuda
if /i "%choice%"=="yes" goto cuda
goto nocuda
:cuda
echo Selected: Build WITH CUDA support
REM Create or update .env file with the Dockerfile selection
if exist .env (
REM Check if variable already exists in file
findstr /c:"DOCKER_BACKEND_DOCKERFILE" .env >nul
if not errorlevel 1 (
REM Update existing variable
powershell -Command "(Get-Content .env) -replace '^DOCKER_BACKEND_DOCKERFILE=.*', 'DOCKER_BACKEND_DOCKERFILE=Dockerfile.backend.cuda' | Set-Content .env"
) else (
REM Append to file with newline before
echo.>> .env
echo DOCKER_BACKEND_DOCKERFILE=Dockerfile.backend.cuda>> .env
)
) else (
REM Create new file
echo DOCKER_BACKEND_DOCKERFILE=Dockerfile.backend.cuda> .env
)
REM Create a flag file to indicate GPU use
echo GPU > .gpu_selected
echo Environment set to build with CUDA support
goto end
:nocuda
echo Selected: Build WITHOUT CUDA support (CPU only)
REM Create or update .env file with the Dockerfile selection
if exist .env (
REM Check if variable already exists in file
findstr /c:"DOCKER_BACKEND_DOCKERFILE" .env >nul
if not errorlevel 1 (
REM Update existing variable
powershell -Command "(Get-Content .env) -replace '^DOCKER_BACKEND_DOCKERFILE=.*', 'DOCKER_BACKEND_DOCKERFILE=Dockerfile.backend' | Set-Content .env"
) else (
REM Append to file with newline before
echo.>> .env
echo DOCKER_BACKEND_DOCKERFILE=Dockerfile.backend>> .env
)
) else (
REM Create new file
echo DOCKER_BACKEND_DOCKERFILE=Dockerfile.backend> .env
)
REM Remove any GPU flag file if it exists
if exist .gpu_selected (
del .gpu_selected
)
echo Environment set to build without CUDA support
:end
echo === CUDA Selection Complete ===

62
scripts/prompt_cuda.sh Normal file
View File

@ -0,0 +1,62 @@
#!/bin/bash
# Script to prompt user for CUDA support preference and record the matching Dockerfile selection in .env
echo "=== CUDA Support Selection ==="
echo ""
echo "Do you want to build with NVIDIA GPU (CUDA) support?"
echo "This requires an NVIDIA GPU and proper NVIDIA Docker runtime configuration."
echo ""
read -p "Build with CUDA support? (y/n): " choice
case "$choice" in
y|Y|yes|YES|Yes )
echo "Selected: Build WITH CUDA support"
# Create or update .env file with the Dockerfile selection
if [ -f .env ]; then
# Update existing file
if grep -q "DOCKER_BACKEND_DOCKERFILE" .env; then
sed -i 's/^DOCKER_BACKEND_DOCKERFILE=.*/DOCKER_BACKEND_DOCKERFILE=Dockerfile.backend.cuda/' .env
else
# Add a newline before appending new content
echo "" >> .env
echo "DOCKER_BACKEND_DOCKERFILE=Dockerfile.backend.cuda" >> .env
fi
else
# Create new file
echo "DOCKER_BACKEND_DOCKERFILE=Dockerfile.backend.cuda" > .env
fi
# Create a flag file to indicate GPU use
echo "GPU" > .gpu_selected
echo "Environment set to build with CUDA support"
;;
* )
echo "Selected: Build WITHOUT CUDA support (CPU only)"
# Create or update .env file with the Dockerfile selection
if [ -f .env ]; then
# Update existing file
if grep -q "DOCKER_BACKEND_DOCKERFILE" .env; then
sed -i 's/^DOCKER_BACKEND_DOCKERFILE=.*/DOCKER_BACKEND_DOCKERFILE=Dockerfile.backend/' .env
else
# Add a newline before appending new content
echo "" >> .env
echo "DOCKER_BACKEND_DOCKERFILE=Dockerfile.backend" >> .env
fi
else
# Create new file
echo "DOCKER_BACKEND_DOCKERFILE=Dockerfile.backend" > .env
fi
# Remove any GPU flag file if it exists
if [ -f .gpu_selected ]; then
rm .gpu_selected
fi
echo "Environment set to build without CUDA support"
;;
esac
echo "=== CUDA Selection Complete ==="