Added CUDA support (#228)

* Add CUDA support

- CUDA detection (see the sketch after this list)
- Memory handling
- Ollama model release after training
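A minimal sketch of the CUDA detection described above, assuming PyTorch is installed; detect_cuda is a hypothetical helper, and the info fields mirror the cuda_info shape the frontend expects later in this diff.

import torch

def detect_cuda():
    """Return (available, info) describing the CUDA devices PyTorch can see."""
    if not torch.cuda.is_available():
        return False, {}
    return True, {
        "device_count": torch.cuda.device_count(),
        "current_device": torch.cuda.current_device(),
        "device_name": torch.cuda.get_device_name(0),
    }

available, info = detect_cuda()
print("CUDA available:", available, info)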

* Fix logging issue

Added a CUDA support flag so the log accurately reflects the CUDA toggle.

* Update llama.cpp rebuild

Changed the llama.cpp handling to check whether CUDA support is enabled and, if so, rebuild only during the first build rather than on every run.

* Improved VRAM management

Enabled memory pinning and optimizer-state offload.
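A rough illustration of the memory-pinning part in plain PyTorch (the optimizer-state offload itself is configured through the Accelerate settings visible further down in the training diff); make_loader is a hypothetical helper, not code from this commit.

import torch
from torch.utils.data import DataLoader, TensorDataset

def make_loader(dataset: TensorDataset, batch_size: int = 8) -> DataLoader:
    # pin_memory=True keeps host batches in page-locked RAM so that
    # .to(device, non_blocking=True) can overlap the copy with compute.
    return DataLoader(dataset, batch_size=batch_size, shuffle=True,
                      pin_memory=torch.cuda.is_available())

device = "cuda" if torch.cuda.is_available() else "cpu"
loader = make_loader(TensorDataset(torch.randn(32, 4), torch.zeros(32)))
for xb, yb in loader:
    xb = xb.to(device, non_blocking=True)  # asynchronous host-to-device copy when pinned
    yb = yb.to(device, non_blocking=True)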

* Fix CUDA check

Rewrote the llama.cpp rebuild logic and added a manual y/n prompt so the user can choose whether to enable CUDA support.

* Added fast restart and fixed CUDA check command

Added make docker-restart-backend-fast to restart the backend and pick up code changes without triggering a full llama.cpp rebuild.

Fixed the make docker-check-cuda command to correctly report CUDA support.

* Added docker-compose.gpu.yml

Added docker-compose.gpu.yml to fix an error on machines without an NVIDIA GPU, and made sure "\n" is added before the .env modification.

* Fixed CUDA toggle

The previous push accidentally broke the CUDA toggle.

* Code review fixes

Fixed errors resulting from removed code:
- Added return save_path to the end of the save_hf_model function
- Rolled back the download_file_with_progress function

* Update Makefile

Use CUDA by default when running docker-restart-backend-fast.

* Minor cleanup

Removed an unnecessary Makefile command and fixed GPU logging.

* Delete .gpu_selected

* Simplified CUDA training code

- Removed the dtype setting to let torch handle it automatically
- Removed VRAM logging
- Removed unnecessary/old comments

* Fixed GPU/CPU selection

Made the "make docker-use-gpu/cpu" commands work with the .gpu_selected flag, and changed "make docker-restart-backend-fast" to respect the flag instead of always using the GPU.

* Fix Ollama embedding error

Added a custom exception class for Ollama embeddings, which appeared to pass keyword arguments while the base Python exception class only accepts positional ones.
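A minimal sketch of the kind of wrapper this describes; the class name and fields are hypothetical and may differ from the actual fix.

class OllamaEmbeddingError(Exception):
    """Exception that tolerates the keyword arguments Ollama passes back."""

    def __init__(self, message: str = "Ollama embedding request failed", **kwargs):
        # The base Exception only accepts positional args, so stash the extras.
        super().__init__(message)
        self.details = kwargs

try:
    raise OllamaEmbeddingError("embedding failed", status_code=500, model="nomic-embed-text")
except OllamaEmbeddingError as e:
    print(e, e.details)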

* Fixed model selection & memory error

Fixed training defaulting to the 0.5B model regardless of selection, and fixed the "free(): double free detected in tcache 2" error caused by the CUDA flag being passed incorrectly.
Author: Zachary Pitroda, 2025-04-24 22:20:36 -04:00 (committed by GitHub)
parent 71d54a5b0b
commit 053090937d
36 changed files with 2375 additions and 507 deletions

.env

@ -65,3 +65,5 @@ DOCUMENT_CHUNK_OVERLAP=200
# Embedding configurations
EMBEDDING_MAX_TEXT_LENGTH=3072
DOCKER_BACKEND_DOCKERFILE=Dockerfile.backend.cuda

.gitignore

@ -75,3 +75,8 @@ run/*
.backend.pid
.frontend.pid
logs/train/
llama_cpp_backup/llama.cpp.zip
scripts/check_cuda_status.ps1
scripts/test_cuda_detection.bat
.env
.gpu_selected

Dockerfile.backend

@ -1,3 +1,6 @@
# Base image selection is handled by the Makefile or build script
# For CUDA support: FROM nvidia/cuda:12.8.1-devel-ubuntu22.04
# For CPU-only: FROM python:3.12
FROM python:3.12
# Set working directory
@ -19,28 +22,31 @@ RUN mkdir -p /app/dependencies /app/data/sqlite /app/data/chroma_db /app/logs /a
COPY dependencies/graphrag-1.2.1.dev27.tar.gz /app/dependencies/
COPY dependencies/llama.cpp.zip /app/dependencies/
# Build llama.cpp
# Copy GPU checker script (only used for status reporting, not rebuilding)
COPY docker/app/check_gpu_support.sh /app/
COPY docker/app/check_torch_cuda.py /app/
RUN chmod +x /app/check_gpu_support.sh
# Unpack prebuilt llama.cpp CPU binary (no build or GPU detection here)
RUN LLAMA_LOCAL_ZIP="dependencies/llama.cpp.zip" \
&& echo "Using local llama.cpp archive..." \
&& unzip -q "$LLAMA_LOCAL_ZIP" \
&& cd llama.cpp \
&& mkdir -p build && cd build \
&& cmake .. \
&& cmake --build . --config Release \
&& if [ ! -f "bin/llama-server" ]; then \
echo "Build failed: llama-server executable not found" && exit 1; \
else \
echo "Successfully built llama-server"; \
fi
&& mkdir -p build \
&& cp -r bin build/ 2>/dev/null || true \
&& chmod +x /app/llama.cpp/build/bin/llama-server /app/llama.cpp/build/bin/llama-cli 2>/dev/null || true
# Mark as CPU-only build for runtime reference
RUN mkdir -p /app/data && \
echo "{ \"gpu_optimized\": false, \"optimized_on\": \"$(date -u +\"%Y-%m-%dT%H:%M:%SZ\")\" }" > /app/data/gpu_optimized.json && \
echo "Created CPU-only marker file"
#
# Copy project configuration - Files that occasionally change
COPY pyproject.toml README.md /app/
RUN poetry install --no-interaction --no-root
RUN pip install --force-reinstall dependencies/graphrag-1.2.1.dev27.tar.gz
# Copy source code - Files that frequently change
COPY docker/ /app/docker/
COPY lpm_kernel/ /app/lpm_kernel/
@ -61,5 +67,5 @@ ENV PYTHONUNBUFFERED=1 \
# Expose ports
EXPOSE 8002 8080
# Set the startup command
CMD ["bash", "-c", "echo \"Checking SQLite database...\" && if [ ! -s /app/data/sqlite/lpm.db ]; then echo \"SQLite database not found or empty, initializing...\" && mkdir -p /app/data/sqlite && sqlite3 /app/data/sqlite/lpm.db \".read /app/docker/sqlite/init.sql\" && echo \"SQLite database initialized successfully\" && echo \"Tables created:\" && sqlite3 /app/data/sqlite/lpm.db \".tables\"; else echo \"SQLite database already exists, skipping initialization\"; fi && echo \"Checking ChromaDB...\" && if [ ! -d /app/data/chroma_db/documents ] || [ ! -d /app/data/chroma_db/document_chunks ]; then echo \"ChromaDB collections not found, initializing...\" && python /app/docker/app/init_chroma.py && echo \"ChromaDB initialized successfully\"; else echo \"ChromaDB already exists, skipping initialization\"; fi && echo \"Starting application at $(date)\" >> /app/logs/backend.log && cd /app && python -m flask run --host=0.0.0.0 --port=${LOCAL_APP_PORT:-8002} >> /app/logs/backend.log 2>&1"]
# Set the startup command - CUDA check/rebuild removed since it's now handled at build time
CMD ["bash", "-c", "echo 'Checking SQLite database...' && if [ ! -s /app/data/sqlite/lpm.db ]; then echo 'SQLite database not found or empty, initializing...' && mkdir -p /app/data/sqlite && sqlite3 /app/data/sqlite/lpm.db '.read /app/docker/sqlite/init.sql' && echo 'SQLite database initialized successfully' && echo 'Tables created:' && sqlite3 /app/data/sqlite/lpm.db '.tables'; else echo 'SQLite database already exists, skipping initialization'; fi && echo 'Checking ChromaDB...' && if [ ! -d /app/data/chroma_db/documents ] || [ ! -d /app/data/chroma_db/document_chunks ]; then echo 'ChromaDB collections not found, initializing...' && python /app/docker/app/init_chroma.py && echo 'ChromaDB initialized successfully'; else echo 'ChromaDB already exists, skipping initialization'; fi && echo 'Starting application at ' $(date) >> /app/logs/backend.log && cd /app && python -m flask run --host=0.0.0.0 --port=${LOCAL_APP_PORT:-8002} >> /app/logs/backend.log 2>&1"]

Dockerfile.backend.cuda (new file)

@ -0,0 +1,111 @@
FROM nvidia/cuda:12.8.1-devel-ubuntu24.04
# Set working directory
WORKDIR /app
# Add build argument to conditionally skip llama.cpp build
ARG SKIP_LLAMA_BUILD=false
# Install system dependencies with noninteractive mode to avoid prompts
RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y \
build-essential cmake git curl wget lsof vim unzip sqlite3 \
python3-pip python3-venv python3-full python3-poetry pipx \
&& apt-get clean \
&& rm -rf /var/lib/apt/lists/* \
&& ln -sf /usr/bin/python3 /usr/bin/python
# Create a virtual environment to avoid PEP 668 restrictions
RUN python -m venv /app/venv
ENV PATH="/app/venv/bin:$PATH"
ENV VIRTUAL_ENV="/app/venv"
# Use the virtual environment's pip to install packages
RUN pip install --upgrade pip \
&& pip install poetry \
&& poetry config virtualenvs.create false
# Create directories
RUN mkdir -p /app/dependencies /app/data/sqlite /app/data/chroma_db /app/logs /app/run /app/resources
# Copy dependency files - Files that rarely change
COPY dependencies/graphrag-1.2.1.dev27.tar.gz /app/dependencies/
COPY dependencies/llama.cpp.zip /app/dependencies/
# Copy GPU checker script
COPY docker/app/check_gpu_support.sh /app/
COPY docker/app/check_torch_cuda.py /app/
RUN chmod +x /app/check_gpu_support.sh
# Unpack llama.cpp and build with CUDA support (conditionally, based on SKIP_LLAMA_BUILD)
RUN if [ "$SKIP_LLAMA_BUILD" = "false" ]; then \
echo "=====================================================================" && \
echo "STARTING LLAMA.CPP BUILD WITH CUDA SUPPORT - THIS WILL TAKE SOME TIME" && \
echo "=====================================================================" && \
LLAMA_LOCAL_ZIP="dependencies/llama.cpp.zip" && \
echo "Using local llama.cpp archive..." && \
unzip -q "$LLAMA_LOCAL_ZIP" && \
cd llama.cpp && \
mkdir -p build && \
cd build && \
echo "Starting CMake configuration with CUDA support..." && \
cmake -DGGML_CUDA=ON \
-DCMAKE_BUILD_TYPE=Release \
-DBUILD_SHARED_LIBS=OFF \
-DLLAMA_NATIVE=OFF \
-DCMAKE_CUDA_FLAGS="-Wno-deprecated-gpu-targets" \
.. && \
echo "Starting build process (this will take several minutes)..." && \
cmake --build . --config Release -j --verbose && \
echo "Build completed successfully" && \
chmod +x /app/llama.cpp/build/bin/llama-server /app/llama.cpp/build/bin/llama-cli && \
echo "====================================================================" && \
echo "CUDA BUILD COMPLETED SUCCESSFULLY! GPU ACCELERATION IS NOW AVAILABLE" && \
echo "===================================================================="; \
else \
echo "=====================================================================" && \
echo "SKIPPING LLAMA.CPP BUILD (SKIP_LLAMA_BUILD=$SKIP_LLAMA_BUILD)" && \
echo "Using existing llama.cpp build from Docker volume" && \
echo "=====================================================================" && \
LLAMA_LOCAL_ZIP="dependencies/llama.cpp.zip" && \
echo "Just unpacking llama.cpp archive (no build)..." && \
unzip -q "$LLAMA_LOCAL_ZIP" && \
cd llama.cpp && \
mkdir -p build; \
fi
# Mark as GPU-optimized build for runtime reference
RUN mkdir -p /app/data && \
echo "{ \"gpu_optimized\": true, \"optimized_on\": \"$(date -u +\"%Y-%m-%dT%H:%M:%SZ\")\" }" > /app/data/gpu_optimized.json && \
echo "Created GPU-optimized marker file"
# Copy project configuration - Files that occasionally change
COPY pyproject.toml README.md /app/
# Fix for potential package installation issues with Poetry
RUN pip install --upgrade setuptools wheel
RUN poetry install --no-interaction --no-root || poetry install --no-interaction --no-root --without dev
RUN pip install --force-reinstall dependencies/graphrag-1.2.1.dev27.tar.gz
# Copy source code - Files that frequently change
COPY docker/ /app/docker/
COPY lpm_kernel/ /app/lpm_kernel/
# Check module import
RUN python -c "import lpm_kernel; print('Module import check passed')"
# Set environment variables
ENV PYTHONUNBUFFERED=1 \
PYTHONPATH=/app \
BASE_DIR=/app/data \
LOCAL_LOG_DIR=/app/logs \
RUN_DIR=/app/run \
RESOURCES_DIR=/app/resources \
APP_ROOT=/app \
FLASK_APP=lpm_kernel.app \
LD_LIBRARY_PATH=/usr/local/cuda/lib64:$LD_LIBRARY_PATH
# Expose ports
EXPOSE 8002 8080
# Set the startup command
CMD ["bash", "-c", "echo 'Checking SQLite database...' && if [ ! -s /app/data/sqlite/lpm.db ]; then echo 'SQLite database not found or empty, initializing...' && mkdir -p /app/data/sqlite && sqlite3 /app/data/sqlite/lpm.db '.read /app/docker/sqlite/init.sql' && echo 'SQLite database initialized successfully' && echo 'Tables created:' && sqlite3 /app/data/sqlite/lpm.db '.tables'; else echo 'SQLite database already exists, skipping initialization'; fi && echo 'Checking ChromaDB...' && if [ ! -d /app/data/chroma_db/documents ] || [ ! -d /app/data/chroma_db/document_chunks ]; then echo 'ChromaDB collections not found, initializing...' && python /app/docker/app/init_chroma.py && echo 'ChromaDB initialized successfully'; else echo 'ChromaDB already exists, skipping initialization'; fi && echo 'Starting application at ' $(date) >> /app/logs/backend.log && cd /app && python -m flask run --host=0.0.0.0 --port=${LOCAL_APP_PORT:-8002} >> /app/logs/backend.log 2>&1"]

Makefile

@ -1,4 +1,13 @@
.PHONY: install test format lint all setup start stop restart restart-backend restart-force help docker-build docker-up docker-down docker-build-backend docker-build-frontend docker-restart-backend docker-restart-frontend docker-restart-all
.PHONY: install test format lint all setup start stop restart restart-backend restart-force help docker-build docker-up docker-down docker-build-backend docker-build-frontend docker-restart-backend docker-restart-backend-fast docker-restart-backend-smart docker-restart-frontend docker-restart-all docker-check-cuda docker-use-gpu docker-use-cpu
# Check for GPU flag file and set Docker Compose file accordingly
ifeq ($(wildcard .gpu_selected),)
# No GPU flag file found, use CPU configuration
DOCKER_COMPOSE_FILE := docker-compose.yml
else
# GPU flag file found, use GPU configuration
DOCKER_COMPOSE_FILE := docker-compose-gpu.yml
endif
# Detect operating system and set environment
ifeq ($(OS),Windows_NT)
@ -39,6 +48,9 @@ else
COLOR_RED := \033[1;31m
endif
# Default Docker Compose configuration (non-GPU)
DOCKER_COMPOSE_FILE := docker-compose.yml
# Show help message
help:
ifeq ($(WINDOWS),1)
@ -69,8 +81,12 @@ ifeq ($(WINDOWS),1)
@echo make docker-build-backend - Build only backend Docker image
@echo make docker-build-frontend - Build only frontend Docker image
@echo make docker-restart-backend - Restart only backend container
@echo make docker-restart-backend-fast - Restart backend+cuda without rebuilding llama.cpp
@echo make docker-restart-frontend - Restart only frontend container
@echo make docker-restart-all - Restart all Docker containers
@echo make docker-check-cuda - Check CUDA support in containers
@echo make docker-use-gpu - Switch to GPU configuration
@echo make docker-use-cpu - Switch to CPU-only configuration
@echo.
@echo All Available Commands:
@echo make help - Show this help message
@ -106,9 +122,13 @@ else
@echo " make docker-down - Stop all Docker containers"
@echo " make docker-build-backend - Build only backend Docker image"
@echo " make docker-build-frontend - Build only frontend Docker image"
@echo " make docker-restart-backend - Restart only backend container"
@echo " make docker-restart-backend - Restart only backend container (with rebuild)"
@echo " make docker-restart-backend-fast - Restart backend+cuda without rebuilding llama.cpp"
@echo " make docker-restart-frontend - Restart only frontend container"
@echo " make docker-restart-all - Restart all Docker containers"
@echo " make docker-check-cuda - Check CUDA support in containers"
@echo " make docker-use-gpu - Switch to GPU configuration"
@echo " make docker-use-cpu - Switch to CPU-only configuration"
@echo ""
@echo "$(COLOR_BOLD)All Available Commands:$(COLOR_RESET)"
@echo " make help - Show this help message"
@ -124,6 +144,27 @@ else
fi
endif
# Configuration switchers for Docker
docker-use-gpu:
@echo "Switching to GPU configuration..."
ifeq ($(WINDOWS),1)
@echo GPU mode enabled. Docker commands will use docker-compose-gpu.yml
@echo gpu > .gpu_selected
else
@echo "$(COLOR_GREEN)GPU mode enabled. Docker commands will use docker-compose-gpu.yml$(COLOR_RESET)"
@echo "gpu" > .gpu_selected
endif
docker-use-cpu:
@echo "Switching to CPU-only configuration..."
ifeq ($(WINDOWS),1)
@echo CPU-only mode enabled. Docker commands will use docker-compose.yml
@rm -f .gpu_selected
else
@echo "$(COLOR_GREEN)CPU-only mode enabled. Docker commands will use docker-compose.yml$(COLOR_RESET)"
@rm -f .gpu_selected
endif
setup:
./scripts/setup.sh
@ -156,37 +197,99 @@ DOCKER_COMPOSE_CMD := $(shell if command -v docker-compose >/dev/null 2>&1; then
endif
docker-build:
$(DOCKER_COMPOSE_CMD) build
ifeq ($(WINDOWS),1)
@echo "Prompting for CUDA preference..."
@scripts\prompt_cuda.bat
else
@echo "Prompting for CUDA preference..."
@chmod +x ./scripts/prompt_cuda.sh
@./scripts/prompt_cuda.sh
endif
$(DOCKER_COMPOSE_CMD) -f $(DOCKER_COMPOSE_FILE) build
docker-up:
$(DOCKER_COMPOSE_CMD) up -d
@echo "Building and starting Docker containers..."
ifeq ($(WINDOWS),1)
@echo "Prompting for CUDA preference..."
@scripts\prompt_cuda.bat
@echo "Checking CUDA preference..."
@cmd /c "if exist .gpu_selected ( echo CUDA support detected, using GPU configuration... & docker compose -f docker-compose-gpu.yml build --no-cache & docker compose -f docker-compose-gpu.yml up -d ) else ( echo No CUDA support selected, using CPU-only configuration... & docker compose -f docker-compose.yml build --no-cache & docker compose -f docker-compose.yml up -d )"
else
@echo "Prompting for CUDA preference..."
@chmod +x ./scripts/prompt_cuda.sh
@./scripts/prompt_cuda.sh
@echo "Checking CUDA preference..."
@if [ -f .gpu_selected ]; then \
echo "CUDA support detected, using GPU configuration..."; \
$(DOCKER_COMPOSE_CMD) -f docker-compose-gpu.yml build; \
$(DOCKER_COMPOSE_CMD) -f docker-compose-gpu.yml up -d; \
else \
echo "No CUDA support selected, using CPU-only configuration..."; \
$(DOCKER_COMPOSE_CMD) -f docker-compose.yml build; \
$(DOCKER_COMPOSE_CMD) -f docker-compose.yml up -d; \
fi
endif
@echo "Container startup complete"
@echo "Check CUDA support with: make docker-check-cuda"
docker-down:
$(DOCKER_COMPOSE_CMD) down
$(DOCKER_COMPOSE_CMD) -f $(DOCKER_COMPOSE_FILE) down
docker-build-backend:
$(DOCKER_COMPOSE_CMD) build backend
$(DOCKER_COMPOSE_CMD) -f $(DOCKER_COMPOSE_FILE) build backend
docker-build-frontend:
$(DOCKER_COMPOSE_CMD) build frontend
$(DOCKER_COMPOSE_CMD) -f $(DOCKER_COMPOSE_FILE) build frontend
# Standard backend restart with complete rebuild
docker-restart-backend:
$(DOCKER_COMPOSE_CMD) stop backend
$(DOCKER_COMPOSE_CMD) rm -f backend
$(DOCKER_COMPOSE_CMD) build backend || { echo "$(COLOR_RED)❌ Backend build failed! Aborting operation...$(COLOR_RESET)"; exit 1; }
$(DOCKER_COMPOSE_CMD) up -d backend
$(DOCKER_COMPOSE_CMD) -f $(DOCKER_COMPOSE_FILE) stop backend
$(DOCKER_COMPOSE_CMD) -f $(DOCKER_COMPOSE_FILE) rm -f backend
$(DOCKER_COMPOSE_CMD) -f $(DOCKER_COMPOSE_FILE) build backend || { echo "$(COLOR_RED)❌ Backend build failed! Aborting operation...$(COLOR_RESET)"; exit 1; }
$(DOCKER_COMPOSE_CMD) -f $(DOCKER_COMPOSE_FILE) up -d backend
# Fast backend restart: preserves llama.cpp build
docker-restart-backend-fast:
@echo "Smart restarting backend container (preserving llama.cpp build)..."
@echo "Stopping backend container..."
$(DOCKER_COMPOSE_CMD) -f $(DOCKER_COMPOSE_FILE) stop backend
@echo "Removing backend container..."
$(DOCKER_COMPOSE_CMD) -f $(DOCKER_COMPOSE_FILE) rm -f backend
@echo "Building backend image with build-arg to skip llama.cpp build..."
ifeq ($(wildcard .gpu_selected),)
@echo "Using CPU configuration (docker-compose.yml)..."
else
@echo "Using GPU configuration (docker-compose-gpu.yml)..."
endif
$(DOCKER_COMPOSE_CMD) -f $(DOCKER_COMPOSE_FILE) build --build-arg SKIP_LLAMA_BUILD=true backend || { echo "$(COLOR_RED)❌ Backend build failed! Aborting operation...$(COLOR_RESET)"; exit 1; }
@echo "Starting backend container..."
$(DOCKER_COMPOSE_CMD) -f $(DOCKER_COMPOSE_FILE) up -d backend
@echo "Backend container smart-restarted successfully"
@echo "Check CUDA support with: make docker-check-cuda"
docker-restart-frontend:
$(DOCKER_COMPOSE_CMD) stop frontend
$(DOCKER_COMPOSE_CMD) rm -f frontend
$(DOCKER_COMPOSE_CMD) build frontend || { echo "$(COLOR_RED)❌ Frontend build failed! Aborting operation...$(COLOR_RESET)"; exit 1; }
$(DOCKER_COMPOSE_CMD) up -d frontend
$(DOCKER_COMPOSE_CMD) -f $(DOCKER_COMPOSE_FILE) stop frontend
$(DOCKER_COMPOSE_CMD) -f $(DOCKER_COMPOSE_FILE) rm -f frontend
$(DOCKER_COMPOSE_CMD) -f $(DOCKER_COMPOSE_FILE) build frontend || { echo "$(COLOR_RED)❌ Frontend build failed! Aborting operation...$(COLOR_RESET)"; exit 1; }
$(DOCKER_COMPOSE_CMD) -f $(DOCKER_COMPOSE_FILE) up -d frontend
docker-restart-all:
$(DOCKER_COMPOSE_CMD) stop
$(DOCKER_COMPOSE_CMD) rm -f
$(DOCKER_COMPOSE_CMD) build || { echo "$(COLOR_RED)❌ Build failed! Aborting operation...$(COLOR_RESET)"; exit 1; }
$(DOCKER_COMPOSE_CMD) up -d
$(DOCKER_COMPOSE_CMD) -f $(DOCKER_COMPOSE_FILE) stop
$(DOCKER_COMPOSE_CMD) -f $(DOCKER_COMPOSE_FILE) rm -f
$(DOCKER_COMPOSE_CMD) -f $(DOCKER_COMPOSE_FILE) build || { echo "$(COLOR_RED)❌ Build failed! Aborting operation...$(COLOR_RESET)"; exit 1; }
$(DOCKER_COMPOSE_CMD) -f $(DOCKER_COMPOSE_FILE) up -d
# New command to check CUDA support in containers
docker-check-cuda:
@echo "Checking CUDA support in Docker containers..."
ifeq ($(WINDOWS),1)
@echo Running CUDA support check in backend container
@docker exec second-me-backend /app/check_gpu_support.sh || echo No GPU support detected in backend container
else
@echo "$(COLOR_CYAN)Running CUDA support check in backend container:$(COLOR_RESET)"
@docker exec second-me-backend /app/check_gpu_support.sh || echo "$(COLOR_RED)No GPU support detected in backend container$(COLOR_RESET)"
endif
install:
poetry install

docker-compose-gpu.yml (new file)

@ -0,0 +1,74 @@
services:
backend:
build:
context: .
dockerfile: ${DOCKER_BACKEND_DOCKERFILE:-Dockerfile.backend.cuda}
container_name: second-me-backend
restart: unless-stopped
ports:
- "8002:8002"
- "8080:8080"
volumes:
- ./data:/app/data
- ./logs:/app/logs
- ./run:/app/run
- ./resources:/app/resources
- ./docker:/app/docker
- ./.env:/app/.env
- llama-cpp-build:/app/llama.cpp/build # Persist the llama.cpp build
environment:
# Environment variables
- LOCAL_APP_PORT=8002
- IN_DOCKER_ENV=1
- PLATFORM=${PLATFORM:-linux}
- USE_CUDA=1
extra_hosts:
- "host.docker.internal:host-gateway"
deploy:
resources:
limits:
# Set container memory limit to 64GB
memory: 64G
reservations:
# Memory reservation
memory: 6G
devices:
- driver: nvidia
count: all
capabilities: [gpu]
networks:
- second-me-network
frontend:
build:
context: .
dockerfile: Dockerfile.frontend
container_name: second-me-frontend
restart: unless-stopped
ports:
- "3000:3000"
volumes:
- ./logs:/app/logs
- ./resources:/app/resources
environment:
- VITE_API_BASE_URL=http://backend:8002
depends_on:
- backend
deploy:
resources:
limits:
# Set container memory limit to 2GB
memory: 2G
reservations:
# Memory reservation
memory: 1G
networks:
- second-me-network
networks:
second-me-network:
driver: bridge
volumes:
llama-cpp-build:
driver: local

docker-compose.yml

@ -15,6 +15,7 @@ services:
- ./resources:/app/resources
- ./docker:/app/docker
- ./.env:/app/.env
- llama-cpp-build:/app/llama.cpp/build # Persist the llama.cpp build
environment:
# Environment variables
- LOCAL_APP_PORT=8002
@ -62,3 +63,7 @@ services:
networks:
second-me-network:
driver: bridge
volumes:
llama-cpp-build:
driver: local

docker/app/check_gpu_support.sh (new file)

@ -0,0 +1,57 @@
#!/bin/bash
# Helper script to check if GPU support is available at runtime
echo "=== GPU Support Check ==="
# Check if llama-server binary exists and is linked to CUDA libraries
if [ -f "/app/llama.cpp/build/bin/llama-server" ]; then
echo "llama-server binary found, checking for CUDA linkage..."
CUDA_LIBS=$(ldd /app/llama.cpp/build/bin/llama-server | grep -i "cuda\|nvidia")
if [ -n "$CUDA_LIBS" ]; then
echo "✅ llama-server is built with CUDA support:"
echo "$CUDA_LIBS"
echo "GPU acceleration is available"
# Check for GPU optimization marker file (optional, not required)
GPU_MARKER_FILE="/app/data/gpu_optimized.json"
if [ -f "$GPU_MARKER_FILE" ]; then
GPU_OPTIMIZED=$(grep -o '"gpu_optimized": *true' "$GPU_MARKER_FILE" || echo "false")
OPTIMIZED_DATE=$(grep -o '"optimized_on": *"[^"]*"' "$GPU_MARKER_FILE" | cut -d'"' -f4)
if [[ "$GPU_OPTIMIZED" == *"true"* ]]; then
echo "📝 GPU-optimized build marker found (built on: $OPTIMIZED_DATE)"
else
echo "📝 GPU marker file found but not marked as optimized (built on: $OPTIMIZED_DATE)"
fi
else
echo "📝 No GPU optimization marker file found, but CUDA support is detected in binary"
fi
# Check if NVIDIA GPU is accessible at runtime
if nvidia-smi &>/dev/null; then
echo "🔍 NVIDIA GPU is available at runtime"
echo "=== GPU ACCELERATION IS READY TO USE ==="
exit 0
else
echo "⚠️ WARNING: llama-server has CUDA support, but NVIDIA GPU is not accessible"
echo "Check that Docker is running with GPU access (--gpus all)"
exit 1
fi
else
echo "❌ llama-server is not linked with CUDA libraries"
echo "Container was built without CUDA support"
fi
else
echo "❌ llama-server binary not found at /app/llama.cpp/build/bin/llama-server"
fi
# Final check for GPU hardware
if nvidia-smi &>/dev/null; then
echo "🔍 NVIDIA GPU is available at runtime, but llama-server doesn't support CUDA"
echo "To enable GPU support, rebuild using: make docker-up (and select CUDA support when prompted)"
exit 1
else
echo "❌ No NVIDIA GPU detected at runtime"
exit 1
fi

docker/app/check_torch_cuda.py (new file)

@ -0,0 +1,52 @@
#!/usr/bin/env python3
import torch
import subprocess
import sys
import os
print("=== PyTorch CUDA Version Information ===")
print(f"PyTorch version: {torch.__version__}")
if torch.cuda.is_available():
print(f"CUDA available: Yes")
print(f"CUDA version used by PyTorch: {torch.version.cuda}")
print(f"cuDNN version: {torch.backends.cudnn.version() if torch.backends.cudnn.is_available() else 'Not available'}")
print(f"GPU device name: {torch.cuda.get_device_name(0)}")
# Try to check system CUDA version
try:
nvcc_output = subprocess.check_output(["nvcc", "--version"]).decode("utf-8")
print("\nSystem NVCC version:")
print(nvcc_output)
except:
print("\nNVCC not found in PATH")
# Check CUDA libraries
try:
print("\nChecking required CUDA libraries:")
for lib in ["libcudart.so", "libcublas.so", "libcublasLt.so"]:
print(f"\nSearching for {lib}:")
find_result = subprocess.run(f"find /usr -name '{lib}*'", shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
if find_result.returncode == 0 and find_result.stdout:
print(find_result.stdout.decode("utf-8"))
else:
print(f"No {lib} found in /usr")
except Exception as e:
print(f"Error checking libraries: {e}")
# Check LD_LIBRARY_PATH
print("\nLD_LIBRARY_PATH:")
print(os.environ.get("LD_LIBRARY_PATH", "Not set"))
else:
print("CUDA not available")
# Check system CUDA installation
print("\n=== System CUDA Information ===")
try:
nvidia_smi = subprocess.check_output(["nvidia-smi"]).decode("utf-8")
print("NVIDIA-SMI output:")
print(nvidia_smi)
except:
print("nvidia-smi not found or not working")


@ -0,0 +1,132 @@
#!/bin/bash
# Script to rebuild llama.cpp with CUDA support at runtime
# This ensures the build happens with full knowledge of the GPU environment
set -e # Exit on error but don't print each command (for cleaner logs)
cd /app
echo "========== STARTING LLAMA.CPP CUDA REBUILD PROCESS =========="
echo "Current directory: $(pwd)"
# First check if CUDA is actually available in the container
echo "Verifying NVIDIA drivers and CUDA availability..."
if ! command -v nvidia-smi &> /dev/null; then
echo "WARNING: NVIDIA drivers not found. Cannot build with CUDA support!"
echo "Make sure the container has access to the GPU and NVIDIA Container Toolkit is installed."
echo "Consider running Docker with: --gpus all"
exit 0 # Exit without error as there's no point trying to build with CUDA when no GPU is detected
fi
# Run nvidia-smi to check GPU access
echo "Detected NVIDIA GPU:"
nvidia-smi || {
echo "ERROR: nvidia-smi command failed. GPU is not properly accessible from the container."
echo "Make sure you're running Docker with GPU access enabled (--gpus all)"
exit 0 # Exit without error since there's no GPU access
}
# Install build dependencies
echo "Installing build dependencies..."
apt-get update && apt-get install -y --no-install-recommends \
build-essential \
wget \
cmake \
git \
ca-certificates \
gnupg \
libopenblas-dev
# Clean up apt cache to free space
apt-get clean
rm -rf /var/lib/apt/lists/*
# Install CUDA using NVIDIA's official Debian 12 network installation method
echo "Installing CUDA using NVIDIA's official method for Debian 12..."
wget https://developer.download.nvidia.com/compute/cuda/repos/debian12/x86_64/cuda-keyring_1.1-1_all.deb
dpkg -i cuda-keyring_1.1-1_all.deb
rm cuda-keyring_1.1-1_all.deb
apt-get update
# Install CUDA packages needed for building llama.cpp with CUDA support
apt-get install -y --fix-missing --no-install-recommends cuda-compiler-12-8
apt-get clean
rm -rf /var/lib/apt/lists/*
apt-get update
apt-get install -y --fix-missing --no-install-recommends cuda-runtime-12-8
apt-get clean
rm -rf /var/lib/apt/lists/*
apt-get update
apt-get install -y --fix-missing --no-install-recommends cuda-libraries-dev-12-8
apt-get clean
rm -rf /var/lib/apt/lists/*
# Set up environment for build
export PATH=/usr/local/cuda-12.8/bin:${PATH}
export LD_LIBRARY_PATH=/usr/local/cuda-12.8/lib64:${LD_LIBRARY_PATH}
export CUDA_HOME=/usr/local/cuda-12.8
# Set CUDACXX environment variable explicitly to help CMake find the CUDA compiler
export CUDACXX=/usr/local/cuda-12.8/bin/nvcc
export CMAKE_CUDA_COMPILER=/usr/local/cuda-12.8/bin/nvcc
# Verify CUDA compiler is available
echo "Verifying CUDA compiler (nvcc) is available:"
which nvcc || echo "ERROR: nvcc not found in PATH!"
nvcc --version || echo "ERROR: nvcc not working properly!"
echo "CUDA environment:"
echo "- CUDA_HOME: $CUDA_HOME"
echo "- CUDACXX: $CUDACXX"
echo "- CMAKE_CUDA_COMPILER: $CMAKE_CUDA_COMPILER"
echo "- PATH includes CUDA: $PATH"
echo "- LD_LIBRARY_PATH: $LD_LIBRARY_PATH"
# Show available disk space
echo "Available disk space:"
df -h
# Use local build approach to avoid volume mount issues
echo "Building llama.cpp with CUDA in a local directory..."
cd /tmp
rm -rf llama_build
mkdir -p llama_build
cd llama_build
# Clone a fresh copy of llama.cpp - this avoids volume mount issues
echo "Cloning fresh copy of llama.cpp..."
git clone https://github.com/ggerganov/llama.cpp.git .
# Configure and build with CUDA support
mkdir -p build
cd build
echo "Configuring with CMake..."
cmake -DGGML_CUDA=ON \
-DCMAKE_CUDA_ARCHITECTURES=all \
-DCMAKE_BUILD_TYPE=Release \
-DBUILD_SHARED_LIBS=OFF \
-DLLAMA_NATIVE=OFF \
-DCMAKE_CUDA_FLAGS="-Wno-deprecated-gpu-targets" \
..
echo "Building llama.cpp with CUDA support..."
cmake --build . --config Release --target all -j $(nproc)
if [ -f "bin/llama-server" ]; then
echo "Build successful! Copying binaries to /app/llama.cpp/build/bin/"
mkdir -p /app/llama.cpp/build/bin
cp bin/llama-server /app/llama.cpp/build/bin/
cp bin/llama-cli /app/llama.cpp/build/bin/ 2>/dev/null || true
chmod +x /app/llama.cpp/build/bin/llama-server /app/llama.cpp/build/bin/llama-cli
# Create GPU optimized marker
echo "{ \"gpu_optimized\": true, \"optimized_on\": \"$(date -u +\"%Y-%m-%dT%H:%M:%SZ\")\" }" > /app/data/gpu_optimized.json
echo "Testing CUDA support in built binary..."
LD_LIBRARY_PATH=/usr/local/cuda/lib64:$LD_LIBRARY_PATH /app/llama.cpp/build/bin/llama-server --version
echo ""
echo "========== CUDA BUILD COMPLETED SUCCESSFULLY =========="
else
echo "ERROR: Build failed - llama-server executable not found!"
exit 1
fi

logs/logs.lnk (new, empty file)


@ -4,7 +4,7 @@ import { useState, useEffect, useRef } from 'react';
import { useRouter } from 'next/navigation';
import InfoModal from '@/components/InfoModal';
import type { TrainingConfig } from '@/service/train';
import { startTrain, stopTrain, retrain, getTrainingParams, resetProgress } from '@/service/train';
import { startTrain, stopTrain, retrain, getTrainingParams, checkCudaAvailability, resetProgress } from '@/service/train';
import { useTrainingStore } from '@/store/useTrainingStore';
import { getMemoryList } from '@/service/memory';
import { message, Modal } from 'antd';
@ -90,6 +90,7 @@ export default function TrainingPage() {
const firstLoadRef = useRef<boolean>(true);
const pollingStopRef = useRef<boolean>(false);
const [cudaAvailable, setCudaAvailable] = useState<boolean>(false);
const [isResume, setIsResume] = useState(
trainingProgress.status === 'suspended' || trainingProgress.status === 'failed'
);
@ -104,6 +105,29 @@ export default function TrainingPage() {
fetchModelConfig();
}, []);
useEffect(() => {
// Check CUDA availability once on load
checkCudaAvailability()
.then(res => {
if (res.data.code === 0) {
const { cuda_available, cuda_info } = res.data.data;
setCudaAvailable(cuda_available);
if (cuda_available) {
console.log('CUDA is available:', cuda_info);
} else {
console.log('CUDA is not available on this system');
}
} else {
message.error(res.data.message || 'Failed to check CUDA availability');
}
})
.catch(err => {
console.error('CUDA availability check failed', err);
message.error('CUDA availability check failed');
});
}, []);
// Start polling training progress
const startPolling = () => {
if (pollingStopRef.current) {
@ -280,39 +304,23 @@ export default function TrainingPage() {
const eventSource = new EventSource('/api/trainprocess/logs');
eventSource.onmessage = (event) => {
try {
const data = JSON.parse(event.data);
// Don't try to parse as JSON, just use the raw text data directly
const logMessage = event.data;
setTrainingDetails((prev) => {
const newLogs = [
...prev.slice(-500), // Keep more log entries (500 instead of 100)
{
message: logMessage,
timestamp: new Date().toLocaleTimeString([], { hour: '2-digit', minute: '2-digit', second: '2-digit' })
}
];
setTrainingDetails((prev) => {
const newLogs = [
...prev.slice(-100),
{
message: data.message,
timestamp: new Date().toISOString()
}
];
// Save logs to localStorage for persistence between page refreshes
localStorage.setItem('trainingLogs', JSON.stringify(newLogs));
// Save logs to localStorage
// localStorage.setItem('trainingLogs', JSON.stringify(newLogs));
return newLogs;
});
} catch {
setTrainingDetails((prev) => {
const newLogs = [
...prev.slice(-100),
{
message: event.data,
timestamp: new Date().toISOString()
}
];
// Save logs to localStorage
// localStorage.setItem('trainingLogs', JSON.stringify(newLogs));
return newLogs;
});
}
return newLogs;
});
};
eventSource.onerror = (error) => {
@ -535,6 +543,7 @@ export default function TrainingPage() {
trainActionLoading={trainActionLoading}
trainingParams={trainingParams}
updateTrainingParams={updateTrainingParams}
cudaAvailable={cudaAvailable}
/>
{/* Only show training progress after training starts */}


@ -26,6 +26,14 @@ interface ModelConfig {
[key: string]: any;
}
interface TrainingParams {
data_synthesis_mode: string;
learning_rate: number;
number_of_epochs: number;
concurrency_threads: number;
use_cuda: boolean;
}
interface TrainingConfigurationProps {
baseModelOptions: BaseModelOption[];
modelConfig: ModelConfig | null;
@ -40,6 +48,7 @@ interface TrainingConfigurationProps {
trainActionLoading: boolean;
setSelectedInfo: React.Dispatch<React.SetStateAction<boolean>>;
trainingParams: TrainingConfig;
cudaAvailable: boolean;
}
const synthesisModeOptions = [
@ -61,7 +70,8 @@ const TrainingConfiguration: React.FC<TrainingConfigurationProps> = ({
changeBaseModel,
trainActionLoading,
handleTrainingAction,
setSelectedInfo
setSelectedInfo,
cudaAvailable
}) => {
const [disabledChangeParams, setDisabledChangeParams] = useState<boolean>(false);
const [openThinkingModel, setOpenThinkingModel] = useState<boolean>(false);
@ -394,6 +404,47 @@ const TrainingConfiguration: React.FC<TrainingConfigurationProps> = ({
Enter an integer between 1 and 10 (recommended: 2)
</div>
</div>
<div className="flex flex-col gap-2 mt-4">
<div className="flex gap-3 items-center">
<div className="font-medium">Enable CUDA GPU Acceleration</div>
<Tooltip title="When enabled, training will use CUDA GPU acceleration if available on your system. This can significantly speed up training but requires compatible NVIDIA hardware and drivers.">
<QuestionCircleOutlined className="cursor-pointer" />
</Tooltip>
</div>
<div className="flex items-center">
<label className="inline-flex items-center cursor-pointer">
<input
type="checkbox"
checked={
disabledChangeParams && nowTrainingParams && !changeBaseModel
? nowTrainingParams.use_cuda
: trainingParams.use_cuda
}
className="sr-only peer"
disabled={disabledChangeParams || !cudaAvailable}
onChange={(e) => {
updateTrainingParams({ ...trainingParams, use_cuda: e.target.checked });
}}
/>
<div className={`relative w-11 h-6 ${!cudaAvailable ? 'bg-gray-300' : 'bg-gray-200'} peer-focus:outline-none peer-focus:ring-4 peer-focus:ring-blue-300 rounded-full peer peer-checked:after:translate-x-full rtl:peer-checked:after:-translate-x-full peer-checked:after:border-white after:content-[''] after:absolute after:top-[2px] after:start-[2px] after:bg-white after:border-gray-300 after:border after:rounded-full after:h-5 after:w-5 after:transition-all ${cudaAvailable ? 'peer-checked:bg-blue-600' : 'peer-checked:bg-gray-400'}`}></div>
<span className={`ms-3 text-sm font-medium ${!cudaAvailable ? 'text-gray-500' : 'text-gray-700'}`}>
{disabledChangeParams && nowTrainingParams && !changeBaseModel
? nowTrainingParams.use_cuda
? 'Enabled'
: 'Disabled'
: trainingParams.use_cuda
? 'Enabled'
: 'Disabled'}
</span>
</label>
</div>
<div className="text-xs text-gray-500">
{cudaAvailable
? 'Enable for faster training on NVIDIA GPUs.'
: 'CUDA acceleration is not available on this system.'}
</div>
</div>
</div>
</div>


@ -11,6 +11,7 @@ const TrainingLog: React.FC<TrainingLogProps> = ({ trainingDetails }: TrainingLo
const consoleEndRef = useRef<HTMLDivElement>(null);
const [isUserScrolling, setIsUserScrolling] = useState(false);
const userScrollTimeout = useRef<NodeJS.Timeout | null>(null);
const [isAutoScrollEnabled, setIsAutoScrollEnabled] = useState(true);
// Smooth scroll console to bottom
const smoothScrollConsole = () => {
@ -29,17 +30,37 @@ const TrainingLog: React.FC<TrainingLogProps> = ({ trainingDetails }: TrainingLo
useEffect(() => {
// Set up scroll event listener to detect user scrolling
const handleUserScroll = () => {
setIsUserScrolling(true);
if (!consoleEndRef.current) return;
const consoleContainer = consoleEndRef.current.closest('.overflow-y-auto');
if (!(consoleContainer instanceof HTMLElement)) return;
// Check if scrolled away from bottom
const isScrolledToBottom =
Math.abs((consoleContainer.scrollHeight - consoleContainer.scrollTop) - consoleContainer.clientHeight) < 50;
// If scrolled away from bottom, consider it manual scrolling
if (!isScrolledToBottom) {
setIsUserScrolling(true);
// Clear any existing timeout
if (userScrollTimeout.current) {
clearTimeout(userScrollTimeout.current);
}
// Clear any existing timeout
if (userScrollTimeout.current) {
clearTimeout(userScrollTimeout.current);
}
// Reset the flag after a short delay
userScrollTimeout.current = setTimeout(() => {
// Reset the flag after a delay
userScrollTimeout.current = setTimeout(() => {
setIsUserScrolling(false);
}, 5000); // 5 seconds delay before allowing auto-scroll again
} else {
// If at bottom, not considered manual scrolling
setIsUserScrolling(false);
}, 2000); // 2 seconds delay before allowing auto-scroll again
if (userScrollTimeout.current) {
clearTimeout(userScrollTimeout.current);
userScrollTimeout.current = null;
}
}
};
// Find the console container and attach the scroll listener
@ -65,7 +86,16 @@ const TrainingLog: React.FC<TrainingLogProps> = ({ trainingDetails }: TrainingLo
if (trainingDetails.length > 0) {
smoothScrollConsole();
}
}, [trainingDetails]);
}, [trainingDetails, isAutoScrollEnabled]);
const toggleAutoScroll = () => {
setIsAutoScrollEnabled(!isAutoScrollEnabled);
if (!isAutoScrollEnabled) {
// If we're re-enabling auto-scroll, scroll to bottom immediately
setIsUserScrolling(false);
setTimeout(smoothScrollConsole, 50);
}
};
return (
<div className="mt-4">


@ -138,3 +138,17 @@ export const getTrainingParams = () => {
url: `/api/trainprocess/training_params`
});
};
export const checkCudaAvailability = () => {
return Request<CommonResponse<{
cuda_available: boolean;
cuda_info: {
device_count?: number;
current_device?: number;
device_name?: string;
};
}>>({
method: 'get',
url: '/api/kernel2/cuda/available'
});
};
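For reference, the backend route this calls (GET /api/kernel2/cuda/available) could look roughly like the following Flask sketch; it is an assumption based on the response shape above, not the repository's actual handler.

import torch
from flask import Blueprint, jsonify

cuda_bp = Blueprint("cuda", __name__)

@cuda_bp.route("/api/kernel2/cuda/available", methods=["GET"])
def cuda_available():
    available = torch.cuda.is_available()
    info = {}
    if available:
        info = {
            "device_count": torch.cuda.device_count(),
            "current_device": torch.cuda.current_device(),
            "device_name": torch.cuda.get_device_name(0),
        }
    # Mirrors the CommonResponse shape the frontend expects: code 0 on success.
    return jsonify({"code": 0, "message": "success",
                    "data": {"cuda_available": available, "cuda_info": info}})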


@ -169,6 +169,18 @@ Domain Timelines:
content = response.choices[0].message.content
shift_pattern = r"\{.*\}"
shift_perspective_result = self.__parse_json_response(content, shift_pattern)
# Check if result is None and provide default values to avoid TypeError
if shift_perspective_result is None:
logger.warning(f"Failed to parse perspective shift result, using default values: {content}")
# Create a default mapping with expected parameters
shift_perspective_result = {
"domainDesc": f"You have knowledge and experience related to {shade_info.name}.",
"domainContent": shade_info.content_third_view,
"domainTimeline": []
}
# Now it's safe to pass shift_perspective_result as kwargs
shade_info.add_second_view(**shift_perspective_result)
return shade_info


@ -53,8 +53,18 @@ class PreferenceQAGenerator:
bio: Biography or context information to use in prompt generation.
preference_language: Language for prompts ("Chinese/中文" or otherwise English).
"""
# Ensure the filename is actually a string
if filename is None:
raise ValueError("Filename cannot be None")
self.filename = filename
self.is_cot = is_cot
# Convert is_cot to bool if it's a string
if isinstance(is_cot, str):
self.is_cot = is_cot.lower() == 'true'
else:
self.is_cot = bool(is_cot)
logger.info(f"PreferenceQAGenerator initialized with is_cot={self.is_cot}")
with open(self.filename, "r", encoding="utf-8") as f:
self.pre_msg = json.load(f)

download_model.sh

@ -1 +1,12 @@
python lpm_kernel/L2/utils.py Qwen2.5-0.5B-Instruct
#!/bin/bash
# Script to download model from Hugging Face
# Usage: ./download_model.sh [model_name]
# If no model name is provided, will attempt to get it from config
if [ "$1" != "" ]; then
# Use provided model name
python lpm_kernel/L2/utils.py "$1"
else
# No model name provided, let utils.py determine from config
python lpm_kernel/L2/utils.py
fi


@ -35,11 +35,19 @@ class L2Generator:
Args:
data_path: Path to the raw data directory. Defaults to "../raw_data".
preferred_lang: Preferred language for data processing. Defaults to "English".
is_cot: Whether to use Chain of Thought reasoning. Can be bool or string.
"""
self.data_path = data_path
self.data_processor = L2DataProcessor(data_path, preferred_lang)
self.preferred_lang = preferred_lang
self.is_cot = is_cot
# Convert is_cot to bool if it's a string
if isinstance(is_cot, str):
self.is_cot = is_cot.lower() == 'true'
else:
self.is_cot = bool(is_cot)
logging.info(f"L2Generator initialized with is_cot={self.is_cot}")
def data_preprocess(self, note_list: List[Note], basic_info: Dict):
"""Preprocess the input notes and basic information.
@ -60,39 +68,50 @@ class L2Generator:
graph_path: str,
config_path: str,
):
"""Generate subjective data based on input notes and user information.
"""Generate subjective data for personalization.
This method orchestrates the generation of subjective data including preferences,
diversity, self-Q&A data, and graph indexing.
Args:
note_list: List of Note objects.
basic_info: Dictionary containing basic user information.
note_list: List of Note objects to process.
basic_info: Dictionary containing user information.
data_output_base_dir: Base directory for output data.
topics_path: Path to topics data.
entities_path: Path to entities data.
entities_path: Path to entity data.
graph_path: Path to graph data.
config_path: Path to configuration file.
"""
global_bio = basic_info["globalBio"]
user_name = basic_info["username"]
user_intro = basic_info["aboutMe"]
if not os.path.exists(data_output_base_dir):
os.makedirs(data_output_base_dir)
preference_output_path = "preference.json"
diversity_output_path = "diversity.json"
selfqa_output_path = "selfqa.json"
# Check if the file exists
if not os.path.exists(topics_path):
# Create an empty file
with open(topics_path, "w") as f:
f.write(json.dumps([]))
# Generate subjective data
self.data_processor.gen_subjective_data(
note_list,
data_output_base_dir,
preference_output_path,
diversity_output_path,
selfqa_output_path,
global_bio,
topics_path,
entities_path,
graph_path,
user_name,
config_path,
user_intro,
note_list=note_list,
data_output_base_dir=data_output_base_dir,
preference_output_path="preference.json",
diversity_output_path="diversity.json",
selfqa_output_path="selfqa.json",
global_bio=basic_info["globalBio"],
topics_path=topics_path,
entitys_path=entities_path,
graph_path=graph_path,
user_name=basic_info["username"],
config_path=config_path,
user_intro=basic_info["aboutMe"],
)
# Merge JSON files for training
self.merge_json_files(data_output_base_dir)
# Release Ollama models from memory after data synthesis is complete
self._release_ollama_models()
def gen_preference_data(
self,
@ -187,6 +206,20 @@ class L2Generator:
with open(merged_output_path, 'w', encoding='utf-8') as f:
json.dump(merged_data, f, ensure_ascii=False, indent=2)
def _release_ollama_models(self):
"""Release Ollama models from memory to free up VRAM for training.
This method calls the release function defined in the train module.
It's important to release models after data synthesis and before training
to ensure VRAM is properly freed.
"""
try:
from lpm_kernel.L2.train import release_ollama_models
release_ollama_models()
except Exception as e:
import logging
logging = logging.getLogger(__name__)
logging.warning(f"Failed to release Ollama models: {str(e)}")
def clean_graphrag_keys(self):
GRAPH_CONFIG = os.path.join(

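The release_ollama_models helper imported in the hunk above lives in the train module and is not shown here. One assumed way to implement such a release against Ollama's HTTP API is sketched below (a generate request with keep_alive set to 0 asks Ollama to unload the model); this is an illustration, not the PR's actual code.

import logging
import requests

logger = logging.getLogger(__name__)

def release_ollama_models(base_url: str = "http://localhost:11434") -> None:
    """Ask Ollama to unload any loaded models so their VRAM is freed."""
    try:
        loaded = requests.get(f"{base_url}/api/ps", timeout=5).json().get("models", [])
        for model in loaded:
            # keep_alive=0 tells Ollama to drop the model right after this call.
            requests.post(f"{base_url}/api/generate",
                          json={"model": model["name"], "keep_alive": 0},
                          timeout=30)
            logger.info("Requested unload of Ollama model %s", model["name"])
    except requests.RequestException as e:
        logger.warning("Could not release Ollama models: %s", e)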
lpm_kernel/L2/memory_manager.py (new file)

@ -0,0 +1,149 @@
"""Memory management utilities for PyTorch training.
This module provides lightweight utilities to monitor memory usage
and configure PyTorch's built-in memory management features.
"""
import os
import gc
import logging
import psutil
import torch
from typing import Dict, Any
# Configure logging
logger = logging.getLogger(__name__)
class MemoryManager:
"""Simple memory manager that leverages PyTorch's built-in memory optimizations."""
def __init__(self):
"""Initialize the memory manager."""
self.cuda_available = torch.cuda.is_available()
self.process = psutil.Process(os.getpid())
# Remove redundant environment variable setting - now handled in train_for_user.sh
def get_memory_info(self) -> Dict[str, Any]:
"""Get current memory usage information."""
info = {
"ram_used_percent": psutil.virtual_memory().percent,
"ram_used_gb": psutil.virtual_memory().used / (1024**3),
"ram_available_gb": psutil.virtual_memory().available / (1024**3),
"ram_total_gb": psutil.virtual_memory().total / (1024**3),
}
if self.cuda_available:
try:
info.update({
"vram_used_gb": torch.cuda.memory_allocated() / (1024**3),
"vram_reserved_gb": torch.cuda.memory_reserved() / (1024**3),
"vram_total_gb": torch.cuda.get_device_properties(0).total_memory / (1024**3),
})
except RuntimeError as e:
logger.warning(f"Error getting CUDA memory info: {str(e)}")
self.cuda_available = False
return info
def cleanup_memory(self, force: bool = False) -> None:
"""Free up memory by garbage collection and emptying CUDA cache."""
# Run Python garbage collection
gc.collect()
# Empty CUDA cache if available
if self.cuda_available:
torch.cuda.empty_cache()
# Log memory status after cleanup
if force:
info = self.get_memory_info()
logger.info(
f"Memory after cleanup: RAM: {info['ram_used_gb']:.2f}GB / {info['ram_total_gb']:.2f}GB, "
f"VRAM: {info.get('vram_used_gb', 0):.2f}GB / {info.get('vram_total_gb', 0):.2f}GB"
)
def get_optimal_training_config(self) -> Dict[str, Any]:
"""Get recommended configurations for model training based on hardware capabilities."""
# Default configs that rely on PyTorch's automatic memory management
config = {
"device_map": "auto",
"fp16": False,
"bf16": False,
"gradient_checkpointing": True,
"gradient_accumulation_steps": 1,
}
# Enable mixed precision based on hardware support
if self.cuda_available:
capability = torch.cuda.get_device_capability()
if capability[0] >= 8: # Ampere or newer (supports BF16)
config["bf16"] = True
elif capability[0] >= 7: # Volta or newer (supports FP16)
config["fp16"] = True
# Adjust accumulation steps based on available memory
vram_gb = self.get_memory_info().get("vram_total_gb", 0)
if vram_gb < 8: # Small GPUs
config["gradient_accumulation_steps"] = 4
elif vram_gb < 16: # Medium GPUs
config["gradient_accumulation_steps"] = 2
return config
def optimize_model_for_training(self, model):
"""Apply PyTorch's built-in memory optimizations for training."""
# Enable gradient checkpointing if available
if hasattr(model, "gradient_checkpointing_enable"):
logger.info("Enabling gradient checkpointing for memory efficiency")
model.gradient_checkpointing_enable()
# Enable memory-efficient attention for PyTorch 2.0+
if hasattr(model, "config"):
try:
model.config.use_memory_efficient_attention = True
except:
pass
# Enable flash attention for compatible GPUs
if self.cuda_available and torch.cuda.get_device_capability()[0] >= 8:
try:
model.config.attn_implementation = "flash_attention_2"
except:
pass
return model
def optimize_training_args(self, training_args):
"""Configure training arguments for efficient memory usage."""
if not training_args:
return None
# Get optimal configuration based on hardware
config = self.get_optimal_training_config()
# Apply configurations to training arguments
if not getattr(training_args, "fp16", False) and not getattr(training_args, "bf16", False):
training_args.fp16 = config["fp16"]
training_args.bf16 = config["bf16"]
if not getattr(training_args, "gradient_checkpointing", False):
training_args.gradient_checkpointing = config["gradient_checkpointing"]
if training_args.gradient_accumulation_steps == 1:
training_args.gradient_accumulation_steps = config["gradient_accumulation_steps"]
logger.info("Training configuration optimized for memory efficiency:")
logger.info(f" Mixed precision: FP16={training_args.fp16}, BF16={training_args.bf16}")
logger.info(f" Gradient checkpointing: {training_args.gradient_checkpointing}")
logger.info(f" Gradient accumulation steps: {training_args.gradient_accumulation_steps}")
return training_args
# Global memory manager instance
memory_manager = MemoryManager()
def get_memory_manager() -> MemoryManager:
"""Get the global memory manager instance."""
return memory_manager
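A short usage sketch of the manager defined above, assuming lpm_kernel is importable; illustrative only.

from lpm_kernel.L2.memory_manager import get_memory_manager

mm = get_memory_manager()
print(mm.get_memory_info())              # RAM and, if CUDA is available, VRAM figures
print(mm.get_optimal_training_config())  # e.g. bf16/fp16 flags, gradient accumulation steps
mm.cleanup_memory(force=True)            # gc.collect() plus torch.cuda.empty_cache()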


@ -6,36 +6,123 @@ LoRA architecture during inference.
"""
import argparse
import os
import gc
import sys
import logging
import traceback
import torch
import datetime
from typing import Optional, Dict, Any
from peft import PeftModel
from transformers import AutoModelForCausalLM, AutoTokenizer
import logging
from lpm_kernel.L2.memory_manager import get_memory_manager
# Configure logging
logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO)
def merge_lora_weights(base_model_path, lora_adapter_path, output_model_path):
"""Merge LoRA weights into a base model and save the result.
This function loads a base model and a LoRA adapter, merges them together,
and saves the resulting model to the specified output path.
and saves the resulting model to the specified output path. It leverages
PyTorch's built-in memory management features.
Args:
base_model_path: Path to the base model directory.
lora_adapter_path: Path to the LoRA adapter directory.
output_model_path: Path where the merged model will be saved.
"""
# Load the base model
logging.info(f"Loading base model from {base_model_path}")
base_model = AutoModelForCausalLM.from_pretrained(base_model_path)
tokenizer = AutoTokenizer.from_pretrained(base_model_path)
# Load the LoRA adapter and apply it to the base model
lora_model = PeftModel.from_pretrained(base_model, lora_adapter_path)
# Merge LoRA weights into the base model
merged_model = lora_model.merge_and_unload()
# Save the merged model and tokenizer
merged_model.save_pretrained(output_model_path)
tokenizer.save_pretrained(output_model_path)
# Get memory manager
memory_manager = get_memory_manager()
try:
# Log initial memory state
memory_info = memory_manager.get_memory_info()
logger.info(f"Initial memory state: RAM used: {memory_info['ram_used_gb']:.2f}GB, "
f"available: {memory_info['ram_available_gb']:.2f}GB")
# Determine if CUDA is available and should be used
use_cuda = memory_manager.cuda_available
device = "cuda" if use_cuda else "cpu"
if use_cuda:
logger.info(f"CUDA is available. VRAM used: {memory_info.get('vram_used_gb', 0):.2f}GB")
else:
logger.warning("CUDA not available or not enabled. Using CPU for model operations.")
# Clean up memory before starting
memory_manager.cleanup_memory(force=True)
# Explicitly set device configuration based on available hardware
device_map = "auto" if use_cuda else None
dtype = torch.float16 if use_cuda else torch.float32
logger.info(f"Loading base model from {base_model_path} with device_map={device_map}, dtype={dtype}")
# Use explicit configuration for GPU utilization
base_model = AutoModelForCausalLM.from_pretrained(
base_model_path,
torch_dtype=dtype,
device_map=device_map
)
# Load tokenizer - this doesn't consume much memory
tokenizer = AutoTokenizer.from_pretrained(base_model_path)
# Load the LoRA adapter and apply it to the base model
logger.info(f"Loading LoRA adapter from {lora_adapter_path}")
lora_model = PeftModel.from_pretrained(base_model, lora_adapter_path)
# Merge weights - this is done automatically by PyTorch on appropriate devices
logger.info(f"Merging LoRA weights into base model on {device}")
merged_model = lora_model.merge_and_unload()
# Clean up before saving
memory_manager.cleanup_memory()
# Add inference optimization config to the merged model for faster startup
if use_cuda:
# Set inference-specific configuration in model config
if hasattr(merged_model.config, "torch_dtype"):
merged_model.config.torch_dtype = "float16" # Prefer float16 for inference
if not hasattr(merged_model.config, "pretraining_tp"):
merged_model.config.pretraining_tp = 1 # For tensor parallelism during inference
# Set default inference device
if not hasattr(merged_model.config, "_default_inference_device"):
merged_model.config._default_inference_device = "cuda:0"
logger.info("Added GPU optimization settings to model configuration")
# Save merged model with shard size to prevent OOM errors during save
logger.info(f"Saving merged model to {output_model_path}")
merged_model.save_pretrained(
output_model_path,
safe_serialization=True,
max_shard_size="2GB" # Sharded saving to avoid memory spikes
)
tokenizer.save_pretrained(output_model_path)
# Save a special marker file to indicate this model should use GPU for inference
if use_cuda:
with open(os.path.join(output_model_path, "gpu_optimized.json"), "w") as f:
import json
json.dump({"gpu_optimized": True, "optimized_on": datetime.datetime.now().isoformat()}, f)
logger.info("Added GPU optimization marker file for faster service startup")
logger.info("Model successfully merged and saved!")
except Exception as e:
logger.error(f"Error during model merge: {str(e)}")
logger.error(traceback.format_exc())
# Force cleanup
gc.collect()
if torch.cuda.is_available():
torch.cuda.empty_cache()
raise
def merge_model_weights(

lpm_kernel/L2/train.py

@ -28,10 +28,13 @@ from lpm_kernel.L2.utils import (
create_and_prepare_model,
formatting_prompts_func,
create_chat_data,
release_ollama_models_early,
)
from lpm_kernel.configs.logging import LOGGING_CONFIG
import logging.config
from lpm_kernel.configs.logging import get_train_process_logger
from lpm_kernel.L2.memory_manager import get_memory_manager
logger = get_train_process_logger()
@ -45,6 +48,25 @@ class LogTqdm(tqdm):
# Replace the default tqdm
sys.modules["tqdm"].tqdm = LogTqdm
# Debug callback for logging training progress
class DebugCallback(transformers.TrainerCallback):
def __init__(self):
self.total_time = 0
self.last_time = time.time()
def on_step_end(self, args, state, control, **kwargs):
if state.global_step % 10 == 0:
current_time = time.time()
step_time = current_time - self.last_time
self.total_time += step_time
self.last_time = current_time
# Log step time and training progress
logger.info(f"Step {state.global_step}: {step_time:.2f}s - Total training time: {self.total_time:.2f}s")
def on_epoch_end(self, args, state, control, **kwargs):
logger.info(f"Epoch {state.epoch} completed")
@dataclass
class ModelArguments:
@ -112,6 +134,10 @@ class ModelArguments:
default=False,
metadata={"help": "Enables UnSloth for training."},
)
use_cuda: Optional[bool] = field(
default=False,
metadata={"help": "Enables CUDA GPU acceleration for training and inference when available."},
)
@dataclass
@ -162,80 +188,117 @@ def main(model_args, data_args, training_args):
for handler in logging.getLogger().handlers:
handler.flush()
logger.info("start 1")
set_seed(training_args.seed)
logger.info("start 2")
# model
model, peft_config, tokenizer = create_and_prepare_model(
model_args, data_args, training_args
)
logger.info("start 3")
# gradient ckpt
model.config.use_cache = not training_args.gradient_checkpointing
training_args.gradient_checkpointing = (
training_args.gradient_checkpointing and not model_args.use_unsloth
)
logger.info("start 4")
if training_args.gradient_checkpointing:
training_args.gradient_checkpointing_kwargs = {
"use_reentrant": model_args.use_reentrant
}
# Configure system resources for optimal performance
def configure_system_resources(num_cores=None):
"""
Configure system resources to optimize training performance
Args:
num_cores: Number of CPU cores to use, if None, automatically detect
"""
# Automatically detect available cores if not specified
if num_cores is None:
num_cores = min(os.cpu_count(), 6)  # Limit to 6 cores to match the Docker configuration
logger.info(f"Configuring system to use {num_cores} CPU cores")
# Set environment variables
os.environ["OMP_NUM_THREADS"] = str(num_cores)
os.environ["MKL_NUM_THREADS"] = str(num_cores)
os.environ["NUMEXPR_NUM_THREADS"] = str(num_cores)
# Set PyTorch thread count
torch.set_num_threads(num_cores)
# If supported, set PyTorch multi-thread optimization
if hasattr(torch, "set_num_interop_threads"):
torch.set_num_interop_threads(num_cores)
# Enable memory-optimized garbage collection
# import gc
# gc.enable()
# # Monitor memory usage and clean up periodically
# def schedule_gc():
# gc.collect()
# torch.cuda.empty_cache() if torch.cuda.is_available() else None
# return schedule_gc
# If CUDA is available, set CUDA device
if torch.cuda.is_available():
torch.cuda.set_device(0)
logger.info(f"CUDA is available. Using device: {torch.cuda.get_device_name(0)}")
# Display CUDA memory information
logger.info(f"CUDA memory allocated: {torch.cuda.memory_allocated(0) / 1024**2:.2f} MB")
logger.info(f"CUDA memory reserved: {torch.cuda.memory_reserved(0) / 1024**2:.2f} MB")
# Get memory manager for optimization
memory_manager = get_memory_manager()
memory_manager.cleanup_memory(force=True)
# Call function to configure system resources
configure_system_resources()
# Release Ollama models if they exist to free up VRAM
if torch.cuda.is_available() and model_args.use_cuda:
release_ollama_models_early()
logger.info("Initializing training with memory optimizations")
set_seed(training_args.seed)
# Apply PyTorch memory optimizations to training arguments
logger.info("Applying memory optimizations to training configuration")
training_args = memory_manager.optimize_training_args(training_args)
# --- Accelerate optimizer state offloading logic ---
# Enable optimizer state offload to CPU if VRAM is low and not using DeepSpeed
vram_total = memory_manager.get_memory_info().get("vram_total_gb", 0)
use_accelerate_offload = False
if torch.cuda.is_available() and model_args.use_cuda and vram_total > 0 and vram_total < 16:
# Only set if not already using DeepSpeed
if not hasattr(training_args, "deepspeed") or training_args.deepspeed is None:
logger.info("Enabling Hugging Face Accelerate optimizer state offload to CPU for low VRAM GPUs")
accelerate_config = {
"compute_environment": "LOCAL_MACHINE",
"deepspeed_config": None,
"distributed_type": "NO",
"downcast_bf16": False,
"fsdp_config": {},
"main_training_function": "main",
"mixed_precision": "no",
"num_machines": 1,
"num_processes": 1,
"use_cpu": False,
"zero3_init_flag": False,
"offload_optimizer_device": "cpu",
"offload_param_device": "none"
}
training_args.accelerate_config = accelerate_config
use_accelerate_offload = True
# Model loading with device_map="auto" for automatic offloading
logger.info(f"Loading model with automatic memory management from {model_args.model_name_or_path}")
# Create model arguments dict with automatic offloading
model_kwargs = {
# Don't use "auto" device_map initially to avoid meta tensor issues
"device_map": None,
"trust_remote_code": True
}
# Configure quantization if requested
if model_args.use_4bit_quantization:
from transformers import BitsAndBytesConfig
compute_dtype = getattr(torch, model_args.bnb_4bit_compute_dtype)
quant_storage_dtype = getattr(torch, model_args.bnb_4bit_quant_storage_dtype)
model_kwargs["quantization_config"] = BitsAndBytesConfig(
load_in_4bit=model_args.use_4bit_quantization,
bnb_4bit_quant_type=model_args.bnb_4bit_quant_type,
bnb_4bit_compute_dtype=compute_dtype,
bnb_4bit_use_double_quant=model_args.use_nested_quant,
bnb_4bit_quant_storage=quant_storage_dtype,
)
# For 4-bit models, we can use device_map="auto"
model_kwargs["device_map"] = "auto"
logger.info("Using 4-bit quantization for memory efficiency")
elif model_args.use_8bit_quantization:
from transformers import BitsAndBytesConfig
model_kwargs["quantization_config"] = BitsAndBytesConfig(
load_in_8bit=model_args.use_8bit_quantization
)
# For 8-bit models, we can use device_map="auto"
model_kwargs["device_map"] = "auto"
logger.info("Using 8-bit quantization for memory efficiency")
# Flash attention for memory efficiency when supported
if model_args.use_flash_attn and torch.cuda.is_available() and model_args.use_cuda:
model_kwargs["attn_implementation"] = "flash_attention_2"
logger.info("Using Flash Attention 2 for memory efficiency")
# Load model with built-in memory management features
model, peft_config, tokenizer = create_and_prepare_model(
model_args, data_args, training_args, model_kwargs=model_kwargs
)
# If model has meta tensors, handle them properly
if hasattr(model, "is_meta") and model.is_meta:
logger.info("Model has meta tensors, using to_empty() to properly initialize")
device = "cuda" if torch.cuda.is_available() and model_args.use_cuda else "cpu"
model = model.to_empty(device=device)
# Apply gradient checkpointing for memory efficiency
if training_args.gradient_checkpointing and hasattr(model, "gradient_checkpointing_enable"):
logger.info("Enabling gradient checkpointing for memory efficiency")
model.gradient_checkpointing_enable()
model.config.use_cache = False
# Allow only one full forward/backward pass at a time (if needed for memory)
if torch.cuda.is_available() and memory_manager.get_memory_info().get("vram_total_gb", 0) < 8:
torch.cuda.set_per_process_memory_fraction(0.9)
logger.info("Setting memory fraction limit to avoid OOM errors")
# datasets
train_dataset = create_chat_data(
data_args,
tokenizer,
)
response_template = "\n<|im_start|>assistant\n"
collator = DataCollatorForCompletionOnlyLM(response_template, tokenizer=tokenizer)
training_args.dataset_kwargs = {
@ -243,6 +306,45 @@ def main(model_args, data_args, training_args):
"add_special_tokens": data_args.add_special_tokens,
}
# Use DeepSpeed to handle meta tensors if available
try:
# Only configure DeepSpeed if meta tensors are present and DeepSpeed is available
if hasattr(model, "is_meta") and model.is_meta:
logger.info("Model has meta tensors, checking DeepSpeed availability")
# First verify DeepSpeed is properly installed and importable
try:
import deepspeed
logger.info("DeepSpeed is available, configuring for meta tensor handling")
# Configure with appropriate settings for meta tensors
training_args.deepspeed = {
"zero_stage": 3,
"offload_optimizer": {
"device": "cpu"
},
"offload_param": {
"device": "cpu"
},
"zero3_init_flag": True,
"zero_force_ds_cpu_optimizer": False
}
logger.info("DeepSpeed configured for meta tensor handling")
except ImportError:
logger.warning("DeepSpeed is not available, meta tensors will be handled differently")
# If DeepSpeed isn't available, use alternative approach to handle meta tensors
if torch.cuda.is_available() and model_args.use_cuda:
logger.info("Initializing meta tensors on GPU")
# Use device_map instead of DeepSpeed for meta tensor initialization
from accelerate import init_empty_weights
with init_empty_weights():
model.to_empty(device="cuda")
else:
logger.info("Initializing meta tensors on CPU")
model.to_empty(device="cpu")
except Exception as e:
logger.warning(f"Could not configure meta tensor handling: {e}")
logger.warning(traceback.format_exc())
trainer = SFTTrainer(
model=model,
tokenizer=tokenizer,
@ -252,145 +354,46 @@ def main(model_args, data_args, training_args):
formatting_func=formatting_prompts_func,
data_collator=collator,
)
# Print model details
trainer.accelerator.print(f"{trainer.model}")
trainer.model.print_trainable_parameters()
logger.info("start 6")
# train
checkpoint = None
if training_args.resume_from_checkpoint is not None:
logger.info("start 6.1")
checkpoint = training_args.resume_from_checkpoint
logger.info("start 6.2")
class DebugCallback(transformers.TrainerCallback):
"""
Debug callback to monitor training process
"""
if hasattr(trainer.model, "print_trainable_parameters"):
trainer.model.print_trainable_parameters()
# Memory usage tracking callback
class MemoryMonitorCallback(transformers.TrainerCallback):
def __init__(self):
self.step_times = {}
self.current_step_start = None
def on_train_begin(self, args, state, control, **kwargs):
logger.info("=== Training Begin ===")
logger.info("Checking initial conditions:")
trainer = kwargs.get("trainer")
if trainer:
# Check model status
logger.info(f"Model device: {trainer.model.device}")
logger.info(f"Model dtype: {next(trainer.model.parameters()).dtype}")
# Check data loader
if hasattr(trainer, "train_dataset"):
logger.info(f"Training dataset size: {len(trainer.train_dataset)}")
# Check optimizer
if hasattr(trainer, "optimizer"):
logger.info("Optimizer configuration:")
for i, group in enumerate(trainer.optimizer.param_groups):
logger.info(
f"Group {i}: lr={group['lr']}, weight_decay={group['weight_decay']}"
)
def on_step_begin(self, args, state, control, **kwargs):
self.current_step_start = time.time()
logger.info(f"\n=== Starting Step {state.global_step + 1} ===")
# Check system status every 10 steps
if state.global_step % 10 == 0:
process = psutil.Process()
with process.oneshot():
logger.info(f"CPU Usage: {process.cpu_percent()}%")
logger.info(
f"Memory Usage: {process.memory_info().rss / 1024**2:.2f}MB"
)
logger.info(f"Thread Count: {process.num_threads()}")
self.memory_manager = get_memory_manager()
def on_step_end(self, args, state, control, **kwargs):
if self.current_step_start:
step_time = time.time() - self.current_step_start
self.step_times[state.global_step] = step_time
avg_time = sum(self.step_times.values()) / len(self.step_times)
logger.info(
f"Step {state.global_step + 1} completed in {step_time:.2f}s (avg: {avg_time:.2f}s)"
)
# Check if step time is much longer than average
if step_time > avg_time * 2 and len(self.step_times) > 1:
logger.warning(
f"Step {state.global_step + 1} took {step_time:.2f}s, which is much longer than average!"
)
trainer = kwargs.get("trainer")
if trainer and hasattr(trainer, "optimizer"):
# Check gradient status
grad_norms = []
for name, param in trainer.model.named_parameters():
if param.grad is not None:
grad_norms.append(param.grad.norm().item())
if grad_norms:
avg_grad_norm = sum(grad_norms) / len(grad_norms)
logger.info(f"Average gradient norm: {avg_grad_norm:.5f}")
else:
logger.warning("No gradients found in this step!")
def on_log(self, args, state, control, logs=None, **kwargs):
if logs:
logger.info(f"=== Logs for Step {state.global_step} ===")
for key, value in logs.items():
logger.info(f"{key}: {value}")
def on_train_end(self, args, state, control, **kwargs):
logger.info("=== Training Ended ===")
logger.info(f"Total steps completed: {state.global_step}")
if self.step_times:
avg_time = sum(self.step_times.values()) / len(self.step_times)
logger.info(f"Average step time: {avg_time:.2f}s")
# Check memory every 5 steps
if state.global_step % 5 == 0 and torch.cuda.is_available():
info = self.memory_manager.get_memory_info()
vram_usage_pct = info.get("vram_used_gb", 0) / info.get("vram_total_gb", 1) * 100
if vram_usage_pct > 90:
logger.info(f"VRAM usage high ({vram_usage_pct:.1f}%), cleaning cache")
self.memory_manager.cleanup_memory()
def on_save(self, args, state, control, **kwargs):
# Free up memory before saving
self.memory_manager.cleanup_memory(force=True)
# Add memory monitoring
trainer.add_callback(MemoryMonitorCallback())
# Add existing debug callback
trainer.add_callback(DebugCallback())
# Add more detailed logs
logger.info("Starting training preparation...")
# Resume from checkpoint if specified
checkpoint = None
if training_args.resume_from_checkpoint is not None:
checkpoint = training_args.resume_from_checkpoint
# Training with automatic memory management
try:
logger.info("Initializing training process...")
# Check model loading and structure
logger.info("Analyzing model structure...")
model = trainer.model
def print_model_structure(model, prefix=""):
logger.info(f"{prefix}Model class: {model.__class__.__name__}")
for name, child in model.named_children():
logger.info(f"{prefix}Child: {name} ({child.__class__.__name__})")
if len(list(child.named_children())) > 0:
print_model_structure(child, prefix + " ")
# print_model_structure(model)
# Check model size
total_params = sum(p.numel() for p in trainer.model.parameters())
trainable_params = sum(
p.numel() for p in trainer.model.parameters() if p.requires_grad
)
logger.info(f"Total parameters: {total_params:,}")
logger.info(f"Trainable parameters: {trainable_params:,}")
# Check optimizer settings
logger.info("Checking optimizer settings...")
# Check data loader
train_dataloader = trainer.get_train_dataloader()
logger.info(f"Train dataloader created with {len(train_dataloader)} batches")
process = psutil.Process()
memory_info = process.memory_info()
logger.info(f"Memory usage details:")
logger.info(f"RSS (Resident Set Size): {memory_info.rss / 1024**2:.2f}MB")
logger.info(f"VMS (Virtual Memory Size): {memory_info.vms / 1024**2:.2f}MB")
# Start training
logger.info("Starting actual training process...")
logger.info("Starting training with memory-optimized configuration")
trainer.train(resume_from_checkpoint=checkpoint)
except Exception as e:
logger.error(f"Error during training: {str(e)}")
@ -398,9 +401,13 @@ def main(model_args, data_args, training_args):
logger.error(f"Traceback: {traceback.format_exc()}")
raise
logger.info("start 7")
# Save the model
if trainer.is_fsdp_enabled:
trainer.accelerator.state.fsdp_plugin.set_state_dict_type("FULL_STATE_DICT")
# Clean up before saving
memory_manager.cleanup_memory(force=True)
trainer.save_model()
logger.info("Training completed successfully")

54
lpm_kernel/L2/train_for_user.sh Executable file → Normal file

@ -6,6 +6,7 @@ NUM_TRAIN_EPOCHS="3"
CONCURRENCY_THREADS="2"
DATA_SYNTHESIS_MODE="low"
HALF=False
USE_CUDA=False # Default to False, will be overridden by parameter
IS_COT=False
# Process parameters
@ -15,33 +16,75 @@ while [[ "$#" -gt 0 ]]; do
--epochs) NUM_TRAIN_EPOCHS="$2"; shift ;;
--threads) CONCURRENCY_THREADS="$2"; shift ;;
--mode) DATA_SYNTHESIS_MODE="$2"; shift ;;
--cuda)
# Convert string to lowercase for consistent comparison
cuda_value=$(echo "$2" | tr '[:upper:]' '[:lower:]')
if [[ "$cuda_value" == "true" || "$cuda_value" == "1" || "$cuda_value" == "yes" ]]; then
USE_CUDA=True
echo "CUDA enabled by user configuration."
else
USE_CUDA=False
echo "CUDA disabled by user configuration."
fi
shift ;;
--is_cot) IS_COT="$2"; shift ;;
*) echo "Unknown parameter: $1"; exit 1 ;;
esac
shift
done
# Explicitly log the CUDA setting passed from the command line
echo "CUDA parameter received: $USE_CUDA"
# Verify CUDA availability if enabled
if [[ "$USE_CUDA" == "True" ]]; then
# Set CUDA environment variables to ensure PyTorch detects GPU
export CUDA_VISIBLE_DEVICES=0
echo "CUDA_VISIBLE_DEVICES set to 0"
# Set CUDA_LAUNCH_BLOCKING to 0 for async operations (better performance)
export CUDA_LAUNCH_BLOCKING=0
echo "CUDA_LAUNCH_BLOCKING set to 0 for better performance"
else
# Explicitly disable CUDA
export CUDA_VISIBLE_DEVICES=""
echo "CUDA_VISIBLE_DEVICES explicitly disabled"
fi
# Log the parameters being used
echo "Using training parameters:"
echo " Learning rate: $LEARNING_RATE"
echo " Number of epochs: $NUM_TRAIN_EPOCHS"
echo " Concurrency threads: $CONCURRENCY_THREADS"
echo " Data synthesis mode: $DATA_SYNTHESIS_MODE"
echo " Use CUDA: $USE_CUDA"
echo " Is chain of thought: $IS_COT"
# If concurrency threads are set, configure related environment variables
if [ "$CONCURRENCY_THREADS" != "1" ]; then
# Limit the number of parallel threads to avoid memory issues
export OMP_NUM_THREADS=$CONCURRENCY_THREADS
export MKL_NUM_THREADS=$CONCURRENCY_THREADS
export NUMEXPR_NUM_THREADS=$CONCURRENCY_THREADS
# Add torch-specific threading controls
export PYTORCH_CUDA_ALLOC_CONF=max_split_size_mb:128
echo "Set thread environment variables to $CONCURRENCY_THREADS"
fi
# Add BF16 option based on the platform
if [ "$PLATFORM" != "apple" ]; then
# Add BF16 option based on the platform and CUDA availability
if [ "$PLATFORM" != "apple" ] && [ "$USE_CUDA" == "True" ]; then
HALF=True
echo "Enabling BF16 half precision for non-Apple platform with CUDA"
else
echo "Using standard precision (not using BF16)"
fi
# Print environment for debugging
echo "Environment configuration:"
echo " CUDA_VISIBLE_DEVICES: ${CUDA_VISIBLE_DEVICES}"
echo " PYTORCH_CUDA_ALLOC_CONF: ${PYTORCH_CUDA_ALLOC_CONF}"
echo " Using half precision: ${HALF}"
# Execute training script with parameters from environment variables
python lpm_kernel/L2/train.py \
--seed 42 \
@ -70,7 +113,7 @@ python lpm_kernel/L2/train.py \
--per_device_train_batch_size 2 \
--gradient_accumulation_steps $CONCURRENCY_THREADS \
--gradient_checkpointing True \
--use_reentrant True \
--use_reentrant False \
--use_peft_lora True \
--lora_r 8 \
--lora_alpha 16 \
@ -78,6 +121,7 @@ python lpm_kernel/L2/train.py \
--lora_target_modules "all-linear" \
--use_4bit_quantization False \
--use_nested_quant False \
--bnb_4bit_compute_dtype "bfloat16"\
--is_cot $IS_COT
--bnb_4bit_compute_dtype "bfloat16" \
--is_cot $IS_COT \
--use_cuda $USE_CUDA
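For reference, a hedged sketch of invoking this script with the flags handled above; the flag names come from the case statement, while the values and the direct subprocess call are illustrative (the actual route goes through ScriptRunner, shown later in this diff):
import subprocess

cmd = [
    "bash", "lpm_kernel/L2/train_for_user.sh",
    "--lr", "2e-4",
    "--epochs", "3",
    "--threads", "2",
    "--mode", "low",
    "--cuda", "True",     # parsed case-insensitively above ("true"/"1"/"yes" also enable CUDA)
    "--is_cot", "False",
]
subprocess.run(cmd, check=True)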


@ -33,12 +33,171 @@ from lpm_kernel.L2.training_prompt import (
MEMORY_COT_PROMPT,
)
# Add import for memory manager
from .memory_manager import get_memory_manager
import gc
import requests
# Initialize the logger
logger = logging.getLogger(__name__)
# Default chat templates for different model formats
DEFAULT_CHATML_CHAT_TEMPLATE = "{% for message in messages %}\n{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% if loop.last and add_generation_prompt %}{{'<|im_start|>assistant\n' }}{% endif %}{% endfor %}"
DEFAULT_ZEPHYR_CHAT_TEMPLATE = "{% for message in messages %}\n{% if message['role'] == 'user' %}\n{{ '<|user|>\n' + message['content'] + eos_token }}\n{% elif message['role'] == 'system' %}\n{{ '<|system|>\n' + message['content'] + eos_token }}\n{% elif message['role'] == 'assistant' %}\n{{ '<|assistant|>\n' + message['content'] + eos_token }}\n{% endif %}\n{% if loop.last and add_generation_prompt %}\n{{ '<|assistant|>' }}\n{% endif %}\n{% endfor %}"
def release_ollama_models_early():
"""Release Ollama models from memory as early as possible before model loading.
This function calls the Ollama API with the keep_alive=0 parameter to properly unload models
and free up VRAM before the training model is loaded.
"""
try:
from lpm_kernel.api.services.user_llm_config_service import UserLLMConfigService
import json
logger.info("Early release of Ollama models to free up VRAM for training")
# Get current user LLM config to identify models to release
user_llm_config_service = UserLLMConfigService()
user_llm_config = user_llm_config_service.get_available_llm()
if not user_llm_config:
logger.warning("No user LLM configuration found. Skipping Ollama model release.")
return
# Track which models have been released
released_models = set()
def get_generate_url(base_endpoint):
"""Helper function to get the API endpoint for unloading models"""
if not base_endpoint:
return None
base_url = base_endpoint.rstrip("/")
# Convert to API base URL if needed (may be v1 format or direct ollama format)
if "/v1/" in base_url:
api_base = base_url.split("/v1/")[0]
return f"{api_base}/api/generate"
else:
# Check if this is a non-localhost Ollama instance
if "ollama" in base_url.lower():
if "localhost" in base_url or "127.0.0.1" in base_url:
return "http://localhost:11434/api/generate"
else:
# Extract the base URL and use it
parts = base_url.split("//")
if len(parts) > 1:
host = parts[1].split("/")[0]
return f"{parts[0]}//{host}/api/generate"
# Default ollama endpoint as fallback
return "http://localhost:11434/api/generate"
# Release chat model if using Ollama
if "ollama" in user_llm_config.chat_endpoint.lower() and user_llm_config.chat_model_name:
chat_model = user_llm_config.chat_model_name
generate_url = get_generate_url(user_llm_config.chat_endpoint)
if not generate_url:
logger.warning(f"Could not determine API endpoint for chat model: {chat_model}")
else:
logger.info(f"Releasing Ollama chat model: {chat_model} via {generate_url}")
try:
# Set up headers with API key if provided
headers = {
"Content-Type": "application/json"
}
if user_llm_config.chat_api_key:
headers["Authorization"] = f"Bearer {user_llm_config.chat_api_key}"
# Use the proper generate endpoint with keep_alive=0 to unload
payload = {
"model": chat_model,
"keep_alive": 0,
"prompt": " " # Minimal prompt needed for request
}
unload_response = requests.post(
generate_url,
headers=headers,
data=json.dumps(payload),
timeout=30 # Add timeout to prevent hanging
)
if unload_response.status_code < 300:
logger.info(f"✅ Successfully unloaded chat model: {chat_model}")
released_models.add(chat_model)
else:
logger.warning(f"Failed to unload model via API: {unload_response.status_code} - {unload_response.text}")
except Exception as e:
logger.warning(f"Failed to release chat model {chat_model}: {str(e)}")
# Release embedding model if different from chat model and using Ollama
if (user_llm_config.embedding_model_name and
"ollama" in user_llm_config.embedding_endpoint.lower() and
user_llm_config.embedding_model_name != user_llm_config.chat_model_name and
user_llm_config.embedding_model_name not in released_models):
embedding_model = user_llm_config.embedding_model_name
generate_url = get_generate_url(user_llm_config.embedding_endpoint)
if not generate_url:
logger.warning(f"Could not determine API endpoint for embedding model: {embedding_model}")
else:
logger.info(f"Releasing Ollama embedding model: {embedding_model} via {generate_url}")
try:
# Set up headers with API key if provided
headers = {
"Content-Type": "application/json"
}
if user_llm_config.embedding_api_key:
headers["Authorization"] = f"Bearer {user_llm_config.embedding_api_key}"
# Use the proper generate endpoint with keep_alive=0 to unload
payload = {
"model": embedding_model,
"keep_alive": 0,
"prompt": " " # Minimal prompt needed for request
}
unload_response = requests.post(
generate_url,
headers=headers,
data=json.dumps(payload),
timeout=30 # Add timeout to prevent hanging
)
if unload_response.status_code < 300:
logger.info(f"✅ Successfully unloaded embedding model: {embedding_model}")
released_models.add(embedding_model)
else:
logger.warning(f"Failed to unload model via API: {unload_response.status_code} - {unload_response.text}")
except Exception as e:
logger.warning(f"Failed to release embedding model {embedding_model}: {str(e)}")
# Final cleanup and verification
if torch.cuda.is_available():
torch.cuda.empty_cache()
memory_info = get_memory_manager().get_memory_info()
vram_used = memory_info.get('vram_used_gb', 0)
vram_total = memory_info.get('vram_total_gb', 1)
logger.info(f"VRAM after early model release: {vram_used:.2f}GB / {vram_total:.2f}GB ({vram_used/vram_total*100:.1f}%)")
if released_models:
logger.info(f"Early release completed for {len(released_models)} Ollama models: {', '.join(released_models)}")
else:
logger.info("No Ollama models were released early")
except Exception as e:
import traceback
logger.error(f"Error in early Ollama model release: {str(e)}")
logger.error(traceback.format_exc())
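Stripped to its essence, the unload call above is a single generate request with keep_alive=0; a hedged standalone sketch (the endpoint is the default local Ollama address and the model name is a placeholder):
import requests

resp = requests.post(
    "http://localhost:11434/api/generate",  # default local Ollama endpoint
    json={"model": "qwen2.5:0.5b", "keep_alive": 0, "prompt": " "},  # placeholder model name
    timeout=30,
)
# Add an Authorization header above if the endpoint requires an API key.
resp.raise_for_status()  # the code above treats any 2xx response as a successful unload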
def count_tokens_from_string(string: str, encoding_name: str = "cl100k_base") -> int:
"""Returns the number of tokens in a text string using a specified encoding.
@ -105,21 +264,31 @@ class ZephyrSpecialTokens(str, Enum):
"""Returns a list of all special tokens."""
return [token.value for token in cls]
def create_and_prepare_model(args, data_args, training_args):
def create_and_prepare_model(args, data_args, training_args, model_kwargs=None):
"""Creates and prepares a model for training.
Args:
args: Model arguments containing model configuration.
data_args: Data arguments for training.
training_args: Training configuration arguments.
model_kwargs: Additional kwargs to pass to the model loading function.
Returns:
Tuple of (model, tokenizer) ready for training.
Tuple of (model, peft_config, tokenizer) ready for training.
"""
# Get the memory manager for adaptive loading
memory_manager = get_memory_manager()
model_kwargs = model_kwargs or {}
# Release Ollama models early before we load any models
if torch.cuda.is_available() and args.use_cuda:
release_ollama_models_early()
# Force cleanup memory after releasing Ollama models
memory_manager.cleanup_memory(force=True)
if args.use_unsloth:
from unsloth import FastLanguageModel
bnb_config = None
quant_storage_dtype = None
if (
torch.distributed.is_available()
@ -129,55 +298,142 @@ def create_and_prepare_model(args, data_args, training_args):
):
raise NotImplementedError("Unsloth is not supported in distributed training")
if args.use_4bit_quantization:
compute_dtype = getattr(torch, args.bnb_4bit_compute_dtype)
quant_storage_dtype = getattr(torch, args.bnb_4bit_quant_storage_dtype)
# Clean up memory before loading model
memory_manager.cleanup_memory()
# Check for CUDA availability and use it if enabled
cuda_available = torch.cuda.is_available()
use_cuda_requested = args.use_cuda
device = "cpu"
bnb_config = BitsAndBytesConfig(
load_in_4bit=args.use_4bit_quantization,
bnb_4bit_quant_type=args.bnb_4bit_quant_type,
bnb_4bit_compute_dtype=compute_dtype,
bnb_4bit_use_double_quant=args.use_nested_quant,
bnb_4bit_quant_storage=quant_storage_dtype,
)
# Always enable memory-adaptive loading by default (device_map="auto"), unless CUDA is off
if cuda_available and use_cuda_requested:
device = "cuda"
model_kwargs["device_map"] = "auto"
else:
if use_cuda_requested and not cuda_available:
logger.warning("⚠️ CUDA was requested but is not available on this system. Falling back to CPU.")
elif cuda_available and not use_cuda_requested:
logger.info(" CUDA is available but not requested. Using CPU as specified.")
else:
logger.info(" CUDA is not available. Using CPU for training.")
# Explicitly remove device_map to force CPU-only
if "device_map" in model_kwargs:
model_kwargs.pop("device_map")
logger.info("Using CPU for model training and inference.")
if compute_dtype == torch.float16 and args.use_4bit_quantization:
major, _ = torch.cuda.get_device_capability()
if major >= 8:
logging.info("=" * 80)
logging.info(
"Your GPU supports bfloat16, you can accelerate training with the argument --bf16"
)
logging.info("=" * 80)
# Configure quantization based on available memory
# Use model_kwargs quantization_config if provided, otherwise build it
if "quantization_config" not in model_kwargs:
if args.use_4bit_quantization:
compute_dtype = getattr(torch, args.bnb_4bit_compute_dtype)
quant_storage_dtype = getattr(torch, args.bnb_4bit_quant_storage_dtype)
bnb_config = BitsAndBytesConfig(
load_in_4bit=args.use_4bit_quantization,
bnb_4bit_quant_type=args.bnb_4bit_quant_type,
bnb_4bit_compute_dtype=compute_dtype,
bnb_4bit_use_double_quant=args.use_nested_quant,
bnb_4bit_quant_storage=quant_storage_dtype,
)
model_kwargs["quantization_config"] = bnb_config
if compute_dtype == torch.float16 and args.use_4bit_quantization:
major, _ = torch.cuda.get_device_capability() if torch.cuda.is_available() else (0, 0)
if major >= 8:
logger.info("Your GPU supports bfloat16, you can accelerate training with the argument --bf16")
elif args.use_8bit_quantization:
bnb_config = BitsAndBytesConfig(load_in_8bit=args.use_8bit_quantization)
model_kwargs["quantization_config"] = bnb_config
if args.use_unsloth:
# Load model
model, _ = FastLanguageModel.from_pretrained(
model_name=args.model_name_or_path,
max_seq_length=data_args.max_seq_length,
dtype=None,
load_in_4bit=args.use_4bit_quantization,
)
else:
if os.getenv("PLATFORM") != "apple":
model = AutoModelForCausalLM.from_pretrained(
args.model_name_or_path,
quantization_config=bnb_config,
trust_remote_code=True,
attn_implementation="flash_attention_2" if args.use_flash_attn else "eager",
torch_dtype=torch.bfloat16
)
# Load model with memory-adaptive approach
model = None
tokenizer = None
peft_config = None
try:
# First try loading the model with the requested configuration
if args.use_unsloth:
# Load model with Unsloth using memory manager
unsloth_kwargs = {
"model_name": args.model_name_or_path,
"max_seq_length": data_args.max_seq_length,
"dtype": None,
"load_in_4bit": args.use_4bit_quantization,
"load_in_8bit": args.use_8bit_quantization,
"trust_remote_code": True,
"device_map": model_kwargs.get("device_map", "auto") if args.use_cuda and torch.cuda.is_available() else None,
}
logger.info(f"Loading model with Unsloth with parameters: {unsloth_kwargs}")
model, _ = FastLanguageModel.from_pretrained(**unsloth_kwargs)
else:
# Load model with standard approach
load_kwargs = {
"trust_remote_code": True,
}
# Use any provided model_kwargs
load_kwargs.update(model_kwargs)
if "attn_implementation" not in load_kwargs and args.use_flash_attn:
load_kwargs["attn_implementation"] = "flash_attention_2"
# Set default device_map if not specified
if "device_map" not in load_kwargs and args.use_cuda and torch.cuda.is_available():
load_kwargs["device_map"] = "auto"
logger.info(f"Loading model with parameters: {load_kwargs}")
model = AutoModelForCausalLM.from_pretrained(args.model_name_or_path, **load_kwargs)
except (RuntimeError, torch.cuda.OutOfMemoryError, MemoryError) as e:
# If standard approaches fail, try progressive fallbacks
logger.warning(f"Failed to load model with standard settings: {str(e)}")
logger.info("Falling back to adaptive model loading...")
# First cleanup to ensure maximum memory available
memory_manager.cleanup_memory(force=True)
try:
# Try with simpler configuration - float16 instead of bfloat16
logger.info("Attempting to load with float16 precision...")
model = AutoModelForCausalLM.from_pretrained(
args.model_name_or_path,
quantization_config=bnb_config,
device_map="auto" if torch.cuda.is_available() and args.use_cuda else None,
torch_dtype=torch.float16 if torch.cuda.is_available() and args.use_cuda else None,
trust_remote_code=True
)
except (RuntimeError, torch.cuda.OutOfMemoryError, MemoryError) as e:
# If that fails too, try even more conservative loading
logger.warning(f"Float16 loading failed: {str(e)}")
memory_manager.cleanup_memory(force=True)
try:
# Try with CPU offloading and gradual loading
logger.info("Attempting most conservative loading with CPU offloading...")
model = AutoModelForCausalLM.from_pretrained(
args.model_name_or_path,
device_map="auto",
offload_folder="offload_folder",
offload_state_dict=True,
torch_dtype=torch.float16 if torch.cuda.is_available() else None,
trust_remote_code=True,
low_cpu_mem_usage=True
)
except Exception as e:
# If all fallbacks fail, it's a fatal error
logger.error(f"All adaptive loading approaches failed: {str(e)}")
raise RuntimeError(f"Failed to load model with any memory adaptation technique: {str(e)}")
peft_config = None
chat_template = None
# If still not loaded, it's a fatal error
if model is None:
raise RuntimeError("Failed to load model with any memory adaptation technique")
# Apply memory optimization to model
model = memory_manager.optimize_model_for_training(model)
# Configure LoRA if requested
if args.use_peft_lora and not args.use_unsloth:
peft_config = LoraConfig(
lora_alpha=args.lora_alpha,
@ -190,6 +446,7 @@ def create_and_prepare_model(args, data_args, training_args):
else args.lora_target_modules,
)
# Load tokenizer - tokenizers are usually small and don't need memory management
special_tokens = None
chat_template = None
if args.chat_template_format == "chatml":
@ -214,22 +471,43 @@ def create_and_prepare_model(args, data_args, training_args):
tokenizer = AutoTokenizer.from_pretrained(
args.model_name_or_path, trust_remote_code=True
)
tokenizer.pad_token = tokenizer.eos_token
# Make sure pad_token is set
if tokenizer.pad_token is None:
tokenizer.pad_token = tokenizer.eos_token
# Apply Unsloth LoRA if requested and check memory status
if args.use_unsloth:
# Do model patching and add fast LoRA weights
model = FastLanguageModel.get_peft_model(
model,
lora_alpha=args.lora_alpha,
lora_dropout=args.lora_dropout,
r=args.lora_r,
target_modules=args.lora_target_modules.split(",")
if args.lora_target_modules != "all-linear"
else args.lora_target_modules,
use_gradient_checkpointing=training_args.gradient_checkpointing,
random_state=training_args.seed,
max_seq_length=data_args.max_seq_length,
)
try:
# Clean up first
memory_manager.cleanup_memory()
# Apply LoRA with memory monitoring
model = FastLanguageModel.get_peft_model(
model,
lora_alpha=args.lora_alpha,
lora_dropout=args.lora_dropout,
r=args.lora_r,
target_modules=args.lora_target_modules.split(",")
if args.lora_target_modules != "all-linear"
else args.lora_target_modules,
use_gradient_checkpointing=training_args.gradient_checkpointing,
random_state=training_args.seed,
max_seq_length=data_args.max_seq_length,
)
except Exception as e:
logger.error(f"Failed to apply Unsloth LoRA: {str(e)}")
# If Unsloth fails, we might need to fall back to standard training
if args.use_cuda and torch.cuda.is_available():
logger.warning("Low VRAM detected, moving model to CPU")
model = model.cpu()
torch.cuda.empty_cache()
# Final memory status check
memory_info = memory_manager.get_memory_info()
logger.info(f"Memory after model preparation: RAM: {memory_info['ram_used_gb']:.2f}GB / {memory_info['ram_total_gb']:.2f}GB")
if torch.cuda.is_available():
logger.info(f"VRAM: {memory_info.get('vram_used_gb', 0):.2f}GB / {memory_info.get('vram_total_gb', 0):.2f}GB")
return model, peft_config, tokenizer
@ -343,11 +621,11 @@ def setup_logger(log_path, logger_name="download_logger"):
return logger
def save_hf_model(model_name="Qwen2.5-0.5B-Instruct", log_file_path=None) -> str:
def save_hf_model(model_name=None, log_file_path=None) -> str:
"""Saves a Hugging Face model locally.
Args:
model_name: Name of the model to save. Defaults to "Qwen2.5-0.5B-Instruct".
model_name: Name of the model to save. If None, will attempt to get from config.
log_file_path: Path to save download logs. If None, uses default path.
Returns:
@ -360,25 +638,39 @@ def save_hf_model(model_name="Qwen2.5-0.5B-Instruct", log_file_path=None) -> str
# Setup logging
logger = setup_logger(log_file_path)
# If no model name provided, attempt to get from training configuration
if not model_name:
try:
from lpm_kernel.configs.config import Config
config = Config()
model_name = config.get("training", {}).get("model_name")
if not model_name:
logger.warning("No model name provided and none found in config. Using Qwen2.5-0.5B-Instruct as fallback.")
model_name = "Qwen2.5-0.5B-Instruct"
except Exception as e:
logger.warning(f"Failed to get model name from config: {str(e)}. Using Qwen2.5-0.5B-Instruct as fallback.")
model_name = "Qwen2.5-0.5B-Instruct"
base_dir = os.path.join(os.getcwd(), "resources/L2/base_models")
# Normalize model name and check for path traversal attempts
normalized_model_name = os.path.normpath(model_name)
if ".." in normalized_model_name or normalized_model_name.startswith("/"):
raise ValueError("Invalid model name")
raise ValueError("Invalid model name - potential path traversal attempt")
# Prepare save path
save_path = os.path.join(base_dir, normalized_model_name)
os.makedirs(save_path, exist_ok=True)
from huggingface_hub import list_repo_files, configure_http_backend
import requests
from huggingface_hub import list_repo_files, configure_http_backend, hf_hub_download
from tqdm import tqdm
from tqdm.contrib.concurrent import thread_map
import shutil
from concurrent.futures import ThreadPoolExecutor
import traceback
# Set a higher timeout, but remove the unsupported pool_size parameter
# Configure HTTP backend more simply
try:
# Try using the timeout parameter to configure
configure_http_backend(timeout=100.0)
except TypeError:
# If the timeout parameter is also not supported, do not use any parameters
except Exception as e:
logger.warning(f"Failed to configure HTTP backend with timeout: {e}")
try:
configure_http_backend()
except Exception as e:
@ -391,31 +683,31 @@ def save_hf_model(model_name="Qwen2.5-0.5B-Instruct", log_file_path=None) -> str
hf_model_name = f"Qwen/{model_name}"
try:
# First get the list of all files in the repository
# Get list of files to download
files = list_repo_files(hf_model_name)
logger.info(f"Found {len(files)} files to download from {hf_model_name}")
# Define a function for downloading a single file and recording progress
def download_file_with_progress(file_info):
filename, file_path = file_info
def download_file_with_progress(filename):
"""Download a single file from the model repository"""
local_file_path = os.path.join(save_path, filename)
# Create directories if they don't exist
os.makedirs(os.path.dirname(local_file_path), exist_ok=True)
# Check if file already exists and is not empty
if os.path.exists(local_file_path) and os.path.getsize(local_file_path) > 0:
logger.info(f"File already exists: {filename}")
return True
try:
# Build the download URL
url = f"https://huggingface.co/{hf_model_name}/resolve/main/{filename}"
# Target file path
local_file_path = os.path.join(save_path, filename)
os.makedirs(os.path.dirname(local_file_path), exist_ok=True)
# Check if the file already exists
if os.path.exists(local_file_path):
logger.info(f"File already exists: {filename}")
return filename, True
# Get file size
response = requests.head(url)
total_size = int(response.headers.get('content-length', 0))
# If the size cannot be obtained, set a default value or do not display the percentage
# If the size cannot be obtained, set a default value
if total_size == 0:
logger.info(f"Starting download of file: {filename} (Size unknown)")
else:
@ -423,7 +715,7 @@ def save_hf_model(model_name="Qwen2.5-0.5B-Instruct", log_file_path=None) -> str
# Create the file to write to
with open(local_file_path, 'wb') as f:
# Create a progress bar, if the total size is unknown, set it to None
# Create a progress bar
progress_bar = tqdm(
total=total_size if total_size > 0 else None,
unit='iB',
@ -432,84 +724,87 @@ def save_hf_model(model_name="Qwen2.5-0.5B-Instruct", log_file_path=None) -> str
disable=False
)
# Define the progress callback function
# Define progress callback
def progress_callback(current, total):
# Update the progress bar
progress_bar.update(current - progress_bar.n)
# Record the log every 1MB (or a value close to 1MB)
if current % (1024 * 1024) < 8192: # Record every 1MB
# Ensure total is greater than 0 before calculating the percentage
if total and total > 0: # Use and to ensure total is not None and greater than 0
# Log progress every ~1MB
if current % (1024 * 1024) < 8192:
if total and total > 0:
percent = current / total * 100
logger.info(f"File {filename}: Downloaded {current/1024/1024:.2f} MB / {total/1024/1024:.2f} MB ({percent:.2f}%)")
else:
# If the total size is unknown or 0, only show the downloaded size
logger.info(f"File {filename}: Downloaded {current/1024/1024:.2f} MB (total size unknown)")
# Use the request library to download the file and update the progress
# Download file with progress tracking
response = requests.get(url, stream=True)
if response.status_code == 200:
downloaded = 0
# Check if the response contains the Content-Length header information
# Update total size if needed
actual_total = int(response.headers.get('content-length', 0))
if actual_total > 0 and (total_size == 0 or total_size != actual_total):
# If the HEAD request did not return the correct size, but the GET request did, then update the total size
total_size = actual_total
logger.info(f"Updated file size for {filename}: {total_size / 1024 / 1024:.2f} MB")
progress_bar.total = total_size
progress_bar.refresh()
for chunk in response.iter_content(chunk_size=8192):
if chunk: # Filter out empty chunks that keep the connection alive
if chunk:
f.write(chunk)
downloaded += len(chunk)
progress_callback(downloaded, total_size)
progress_bar.close()
logger.info(f"Completed download of file: {filename}")
return filename, True
return True
else:
logger.error(f"Failed to download {filename}: HTTP status {response.status_code}")
return filename, False
failed_files.append(filename)
return False
except Exception as e:
logger.error(f"Error downloading {filename}: {str(e)}")
return filename, False
logger.error(f"Failed to download {filename}: {str(e)}")
failed_files.append(filename)
return False
# Keep track of failed files for potential retry
failed_files = []
# Create a list of file path information
file_infos = [(filename, os.path.join(save_path, filename)) for filename in files]
# Use a thread pool to download all files in parallel
logger.info(f"Starting parallel download of {len(file_infos)} files")
# Use a thread pool to download all files in parallel
from concurrent.futures import ThreadPoolExecutor
# Limit the number of concurrent requests to avoid too many requests
max_workers = min(10, len(file_infos))
results = []
# Use ThreadPoolExecutor for parallel downloads with controlled concurrency
max_workers = min(8, len(files)) # Limit concurrent downloads to avoid overloading
successful_downloads = 0
with ThreadPoolExecutor(max_workers=max_workers) as executor:
# Submit all download tasks
future_to_file = {executor.submit(download_file_with_progress, file_info): file_info[0]
for file_info in file_infos}
# Wait for all tasks to complete and collect results
for future in tqdm(future_to_file, desc="Overall Progress", unit="file"):
filename = future_to_file[future]
try:
# Create a progress bar for overall download progress
with tqdm(total=len(files), desc="Downloading model files") as progress:
futures = [executor.submit(download_file_with_progress, file) for file in files]
for future in futures:
result = future.result()
results.append(result)
logger.info(f"Finished processing {filename}")
except Exception as exc:
logger.error(f'{filename} generated an exception: {exc}')
results.append((filename, False))
if result:
successful_downloads += 1
progress.update(1)
# Report progress periodically
if progress.n % 5 == 0 or progress.n == len(files):
logger.info(f"Downloaded {progress.n}/{len(files)} files ({successful_downloads} successful)")
# Check the download results
success_count = sum(1 for _, success in results if success)
logger.info(f"Download completed. Successfully downloaded {success_count}/{len(files)} files.")
# Handle any failed downloads
if failed_files:
logger.warning(f"Failed to download {len(failed_files)} files. First few: {failed_files[:5]}")
# If most files failed, there might be an issue with the model repository
if len(failed_files) > len(files) * 0.5:
logger.error(f"More than 50% of files failed to download. There might be an issue with the model repository.")
raise RuntimeError("Too many files failed to download")
# If critical files failed (like model weights or config), warn specifically
critical_patterns = ['model.safetensors', 'config.json', 'tokenizer.json']
critical_failed = [f for f in failed_files if any(pattern in f for pattern in critical_patterns)]
if critical_failed:
logger.error(f"Failed to download critical files: {critical_failed}")
raise RuntimeError(f"Failed to download critical model files: {', '.join(critical_failed)}")
# Record the download completion information
try:
@ -527,15 +822,14 @@ def save_hf_model(model_name="Qwen2.5-0.5B-Instruct", log_file_path=None) -> str
raise
except KeyboardInterrupt:
logger.warning(f"Download interrupted by user for model: {model_name}")
# Clean up partial downloads
raise
except Exception as e:
# Log any errors that occur
logger.error(f"Error downloading model: {str(e)}")
logger.error(traceback.format_exc())
raise
return save_path
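A hedged usage sketch for the updated downloader; the model name matches the fallback used above, and the import path assumes this file is lpm_kernel/L2/utils.py, which the train.py imports earlier in this diff suggest:
from lpm_kernel.L2.utils import save_hf_model

# Downloads Qwen/Qwen2.5-0.5B-Instruct into resources/L2/base_models/ and returns that path.
local_dir = save_hf_model("Qwen2.5-0.5B-Instruct")
print(local_dir)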
def format_timestr(utc_time_str):
"""Formats a UTC time string to a more readable format.


@ -349,3 +349,56 @@ def get_document_embedding(document_id: int):
return jsonify(
APIResponse.error(message=f"Error getting document embedding: {str(e)}")
)
@document_bp.route("/documents/verify-embeddings", methods=["GET"])
def verify_document_embeddings():
"""Verify all document embeddings and return statistics"""
try:
verbose = request.args.get("verbose", "").lower() == "true"
results = document_service.verify_document_embeddings(verbose=verbose)
return jsonify(APIResponse.success(data=results))
except Exception as e:
logger.error(f"Error verifying document embeddings: {str(e)}", exc_info=True)
return jsonify(APIResponse.error(message=f"Error verifying document embeddings: {str(e)}"))
@document_bp.route("/documents/repair", methods=["POST"])
def repair_documents():
"""Repair documents with missing analysis and embeddings"""
try:
# First, fix missing document analysis (summaries and insights)
fixed_analysis_count = document_service.fix_missing_document_analysis()
# Get verification results after fixing analysis
verification_results = document_service.verify_document_embeddings(verbose=False)
# Process documents with missing embeddings
documents_fixed = 0
for doc in document_service._repository.list():
embedding = document_service.get_document_embedding(doc.id)
if doc.raw_content and embedding is None:
try:
document_service.process_document_embedding(doc.id)
# Also process chunk embeddings
document_service.generate_document_chunk_embeddings(doc.id)
documents_fixed += 1
except Exception as e:
logger.error(f"Error processing document {doc.id} embedding: {str(e)}")
# Get final verification results
final_results = document_service.verify_document_embeddings(verbose=False)
return jsonify(APIResponse.success(
data={
"analysis_fixed": fixed_analysis_count,
"embeddings_fixed": documents_fixed,
"initial_state": verification_results,
"final_state": final_results
}
))
except Exception as e:
logger.error(f"Error repairing documents: {str(e)}", exc_info=True)
return jsonify(APIResponse.error(message=f"Error repairing documents: {str(e)}"))
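A hedged usage sketch for the two routes added above; BASE stands in for whatever host and prefix document_bp is mounted under, which this diff does not show:
import requests

BASE = "http://localhost:8002"  # placeholder host/port and blueprint prefix

# Verify embeddings (add verbose=true for per-document detail)
stats = requests.get(f"{BASE}/documents/verify-embeddings", params={"verbose": "true"}).json()

# Repair missing analysis and embeddings
report = requests.post(f"{BASE}/documents/repair").json()
print(stats, report)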


@ -3,6 +3,8 @@ import logging
import os
import time
import sys
import torch # Add torch import for CUDA detection
import traceback
from dataclasses import asdict
from flask import Blueprint, jsonify, Response, request
@ -352,6 +354,19 @@ def train2():
model_name = data["model_name"]
paths = get_model_paths(model_name)
# Get optional parameters with defaults
learning_rate = data.get("learning_rate", 2e-4)
num_train_epochs = data.get("number_of_epochs", 3)
concurrency_threads = data.get("concurrency_threads", 2)
data_synthesis_mode = data.get("data_synthesis_mode", "low")
use_cuda = data.get("use_cuda", False)
# Convert use_cuda to string "True" or "False" for the shell script
use_cuda_str = "True" if use_cuda else "False"
logger.info(f"Training configuration: learning_rate={learning_rate}, epochs={num_train_epochs}, "
f"threads={concurrency_threads}, mode={data_synthesis_mode}, use_cuda={use_cuda} ({use_cuda_str})")
# Check if model exists
if not os.path.exists(paths["base_path"]):
return jsonify(APIResponse.error(
@ -385,29 +400,61 @@ def train2():
script_path = os.path.join(os.getcwd(), "lpm_kernel/L2/train_for_user.sh")
# Build command arguments
cmd_args = [
"--lr", str(learning_rate),
"--epochs", str(num_train_epochs),
"--threads", str(concurrency_threads),
"--mode", str(data_synthesis_mode),
"--cuda", use_cuda_str # Use the properly formatted string
]
# Start training
import threading
_training_thread = threading.Thread(
target=start_training,
args=(script_path, log_path),
target=start_training_with_args,
args=(script_path, log_path, cmd_args),
daemon=True
)
_training_thread.start()
return jsonify(APIResponse.success(
data={
"status": "training_started",
"model_name": model_name,
"log_path": log_path,
"personal_dir": paths["personal_dir"],
"merged_dir": paths["merged_dir"]
},
message="Training task started"
message="Training task started successfully"
))
except Exception as e:
error_msg = f"Failed to start training: {str(e)}"
logger.error(error_msg)
return jsonify(APIResponse.error(message=error_msg, code=500))
logger.error(f"Error starting training task: {str(e)}")
traceback.print_exc()
return jsonify(APIResponse.error(message=f"Failed to start training: {str(e)}"))
def start_training_with_args(script_path: str, log_path: str, args: list) -> None:
"""Start training with additional arguments"""
global _training_process
try:
# Convert script path and args to a command
cmd = [script_path] + args
# Use ScriptRunner to execute the script
runner = ScriptRunner(log_path=log_path)
_training_process = runner.execute_script(
script_path=script_path,
script_type="training",
is_python=False, # This is a bash script
args=args
)
logger.info(f"Training process started with args: {args}, process: {_training_process}")
except Exception as e:
logger.error(f"Failed to start training process: {str(e)}")
_training_process = None
raise
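For reference, a hedged sketch of the JSON body the train2 handler above reads; the keys and defaults mirror its data.get(...) calls, and the model name is just an example:
train2_payload = {
    "model_name": "Qwen2.5-0.5B-Instruct",  # required; its base_path must already exist
    "learning_rate": 2e-4,
    "number_of_epochs": 3,
    "concurrency_threads": 2,
    "data_synthesis_mode": "low",
    "use_cuda": True,  # forwarded to train_for_user.sh as --cuda True
}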
@kernel2_bp.route("/merge_weights", methods=["POST"])
@ -550,6 +597,9 @@ def start_llama_server():
return jsonify(APIResponse.error(message="Missing required parameter: model_name", code=400))
model_name = data["model_name"]
# Get optional use_gpu parameter with default value of True
use_gpu = data.get("use_gpu", True)
paths = get_model_paths(model_name)
gguf_path = os.path.join(paths["gguf_dir"], "model.gguf")
@ -564,53 +614,34 @@ def start_llama_server():
server_executable = "llama-server"
server_path = os.path.join(server_path, server_executable)
# Check if service and model file exist
if not os.path.exists(server_path):
return jsonify(APIResponse.error(message="llama-server executable file does not exist", code=400))
# Check if model file exists
if not os.path.exists(gguf_path):
return jsonify(APIResponse.error(
message=f"Model '{model_name}' GGUF file does not exist, please convert model first",
code=400
))
# Check if service is already running
# Start the server using the LocalLLMService with GPU acceleration if requested
success = local_llm_service.start_server(gguf_path, use_gpu=use_gpu)
if not success:
return jsonify(APIResponse.error(message="Failed to start llama-server", code=500))
# Get updated service status
status = local_llm_service.get_server_status()
if status.is_running:
return jsonify(
APIResponse.error(
message=f"llama-server is already running, PID: {status.process_info.pid}",
code=400
)
)
# Build parameters
args = [server_path, "-m", gguf_path, "--port", "8080"]
# Use thread to start service asynchronously
def start_server():
script_executor.execute(
script_path=server_path,
script_type="llama_server",
args=args[1:], # Remove first parameter (executable file path)
shell=False,
)
# Start new thread to run service
from threading import Thread
thread = Thread(target=start_server)
thread.daemon = True
thread.start()
# Return start status immediately
# Return success response with GPU info
gpu_info = "with GPU acceleration" if use_gpu and torch.cuda.is_available() else "with CPU only"
return jsonify(
APIResponse.success(
data={
"model_name": model_name,
"gguf_path": gguf_path,
"status": "starting"
"status": "running" if status.is_running else "starting",
"use_gpu": use_gpu and torch.cuda.is_available(),
"gpu_info": gpu_info
},
message="llama-server service is starting"
message=f"llama-server service started {gpu_info}"
)
)
@ -805,3 +836,31 @@ def chat(body: ChatRequest):
if not getattr(body, 'stream', True): # Default to stream if attribute missing
return jsonify(error_response), 500
return local_llm_service.handle_stream_response(iter([error_response]))
@kernel2_bp.route("/cuda/available", methods=["GET"])
def check_cuda_available():
"""Check if CUDA is available for model training/inference"""
try:
import torch
cuda_available = torch.cuda.is_available()
cuda_info = {}
if cuda_available:
cuda_info = {
"device_count": torch.cuda.device_count(),
"current_device": torch.cuda.current_device(),
"device_name": torch.cuda.get_device_name(0)
}
return jsonify(APIResponse.success(
data={
"cuda_available": cuda_available,
"cuda_info": cuda_info
},
message="CUDA availability check completed"
))
except Exception as e:
error_msg = f"Error checking CUDA availability: {str(e)}"
logger.error(error_msg)
return jsonify(APIResponse.error(message=error_msg, code=500))
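A hedged usage sketch for the CUDA check route above; BASE is a placeholder for the kernel2 blueprint's mount point, and the response shape assumes APIResponse.success wraps the payload under "data":
import requests

BASE = "http://localhost:8002"  # placeholder host/port and blueprint prefix
resp = requests.get(f"{BASE}/cuda/available").json()
print(resp["data"]["cuda_available"], resp["data"].get("cuda_info"))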


@ -24,6 +24,7 @@ def start_process():
number_of_epochs: Number of training epochs (optional)
concurrency_threads: Number of threads for concurrent processing (optional)
data_synthesis_mode: Mode for data synthesis (optional)
use_cuda: Whether to use CUDA for training (optional)
Includes the following steps:
1. Health check
@ -63,6 +64,7 @@ def start_process():
number_of_epochs = data.get("number_of_epochs", None)
concurrency_threads = data.get("concurrency_threads", None)
data_synthesis_mode = data.get("data_synthesis_mode", None)
use_cuda = data.get("use_cuda", False) # Default to False if not provided
is_cot = data.get("is_cot", None)
# Log the received parameters
@ -90,6 +92,7 @@ def start_process():
"number_of_epochs": number_of_epochs,
"concurrency_threads": concurrency_threads,
"data_synthesis_mode": data_synthesis_mode,
"use_cuda": use_cuda, # Make sure to include use_cuda parameter
"is_cot": is_cot
}
@ -113,6 +116,7 @@ def start_process():
"number_of_epochs": number_of_epochs,
"concurrency_threads": concurrency_threads,
"data_synthesis_mode": data_synthesis_mode,
"use_cuda": use_cuda, # Include in response
"is_cot": is_cot
}
)


@ -4,6 +4,7 @@ import logging
import psutil
import time
import subprocess
import torch # Add torch import for CUDA detection
import threading
import queue
from typing import Iterator, Any, Optional, Generator, Dict
@ -38,9 +39,16 @@ class LocalLLMService:
)
return self._client
def start_server(self, model_path: str) -> bool:
def start_server(self, model_path: str, use_gpu: bool = True) -> bool:
"""
Start the llama-server service
Start the llama-server service with GPU acceleration when available
Args:
model_path: Path to the GGUF model file
use_gpu: Whether to use GPU acceleration if available
Returns:
bool: True if server started successfully, False otherwise
"""
try:
# Check if server is already running
@ -49,27 +57,163 @@ class LocalLLMService:
logger.info("LLama server is already running")
return True
# Start server
# Check for CUDA availability if GPU was requested
cuda_available = torch.cuda.is_available() if use_gpu else False
gpu_info = ""
if use_gpu and cuda_available:
gpu_device = torch.cuda.current_device()
gpu_info = f" using GPU: {torch.cuda.get_device_name(gpu_device)}"
gpu_memory = torch.cuda.get_device_properties(gpu_device).total_memory / (1024**3)
logger.info(f"CUDA is available. Using GPU acceleration{gpu_info}")
logger.info(f"CUDA device capabilities: {torch.cuda.get_device_capability(gpu_device)}")
logger.info(f"CUDA memory: {gpu_memory:.2f} GB")
# Pre-initialize CUDA to speed up first inference
logger.info("Pre-initializing CUDA context to speed up first inference")
torch.cuda.init()
torch.cuda.empty_cache()
elif use_gpu and not cuda_available:
logger.warning("CUDA was requested but is not available. Using CPU instead.")
else:
logger.info("Using CPU for inference (GPU not requested)")
# Check for GPU optimization marker
gpu_optimized = False
model_dir = os.path.dirname(model_path)
gpu_marker_path = os.path.join(model_dir, "gpu_optimized.json")
if os.path.exists(gpu_marker_path):
try:
with open(gpu_marker_path, 'r') as f:
gpu_data = json.load(f)
if gpu_data.get("gpu_optimized", False):
gpu_optimized = True
logger.info(f"Found GPU optimization marker created on {gpu_data.get('optimized_on', 'unknown date')}")
except Exception as e:
logger.warning(f"Error reading GPU marker file: {e}")
# Get the correct path to the llama-server executable
base_dir = os.getcwd()
server_path = os.path.join(base_dir, "llama.cpp", "build", "bin", "llama-server")
# For Windows, add .exe extension if needed
if os.name == 'nt' and not server_path.endswith('.exe'):
server_path += '.exe'
# Verify executable exists
if not os.path.exists(server_path):
logger.error(f"llama-server executable not found at: {server_path}")
return False
# Start server with optimal parameters for faster startup
cmd = [
"llama-server",
server_path,
"-m", model_path,
"--host", "0.0.0.0",
"--port", "8000"
"--port", "8080",
"--ctx-size", "2048", # Default context size (adjust based on needs)
"--parallel", "2", # Enable request parallelism
"--cont-batching" # Enable continuous batching
]
# Set up environment with CUDA variables to ensure GPU detection
env = os.environ.copy()
# Add GPU-related parameters if CUDA is available
if cuda_available and use_gpu:
# Force GPU usage with optimal parameters for faster loads
cmd.extend([
"--n-gpu-layers", "999", # Use all layers on GPU
"--tensor-split", "0", # Use the first GPU for all operations
"--main-gpu", "0", # Use GPU 0 as the primary device
"--mlock" # Lock memory to prevent swapping during inference
])
# Set CUDA environment variables to help with GPU detection
env["CUDA_VISIBLE_DEVICES"] = "0" # Force using first GPU
# Ensure comprehensive library paths for CUDA
cuda_lib_paths = [
"/usr/local/cuda/lib64",
"/usr/lib/cuda/lib64",
"/usr/local/lib",
"/usr/lib/x86_64-linux-gnu",
"/usr/lib/wsl/lib" # For Windows WSL environments
]
# Build a comprehensive LD_LIBRARY_PATH
current_ld_path = env.get("LD_LIBRARY_PATH", "")
for path in cuda_lib_paths:
if os.path.exists(path) and path not in current_ld_path:
current_ld_path = f"{path}:{current_ld_path}" if current_ld_path else path
env["LD_LIBRARY_PATH"] = current_ld_path
logger.info(f"Setting LD_LIBRARY_PATH to: {current_ld_path}")
# On Windows, use a different approach for CUDA libraries
if os.name == 'nt':
# Windows typically has CUDA in PATH already if installed
logger.info("Windows system detected, using system CUDA libraries")
else:
# On Linux, try to find CUDA libraries in common locations
for cuda_path in [
# Common CUDA paths
"/usr/local/cuda/lib64",
"/usr/lib/cuda/lib64",
"/usr/local/lib/python3.12/site-packages/nvidia/cuda_runtime/lib",
"/usr/local/lib/python3.10/site-packages/nvidia/cuda_runtime/lib",
]:
if os.path.exists(cuda_path):
# Add CUDA path to library path
env["LD_LIBRARY_PATH"] = f"{cuda_path}:{env.get('LD_LIBRARY_PATH', '')}"
env["CUDA_HOME"] = os.path.dirname(cuda_path)
logger.info(f"Found CUDA at {cuda_path}, setting environment variables")
break
# NOTE: CUDA support and rebuild should be handled at build/setup time (e.g., Docker build or setup script).
# The runtime check and rebuild logic has been removed for efficiency and reliability.
# Ensure llama.cpp is built with CUDA support before running the server if GPU is required.
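Since the rebuild logic now lives at build time, a minimal sketch of one way a setup step could confirm the prebuilt llama-server binary was linked against CUDA, by inspecting its shared-library dependencies with ldd (Linux only; this helper is illustrative, not part of this commit):
import subprocess
def binary_linked_against_cuda(server_path: str) -> bool:
    """Best-effort check: look for CUDA runtime libraries in the binary's ldd output."""
    try:
        result = subprocess.run(
            ["ldd", server_path], capture_output=True, text=True, check=True
        )
    except (OSError, subprocess.CalledProcessError):
        return False  # ldd unavailable (e.g. Windows) or binary could not be inspected
    return any(lib in result.stdout for lib in ("libcudart", "libcublas", "libggml-cuda"))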
# Pre-heat GPU to ensure faster initial response
if cuda_available and use_gpu:
logger.info("Pre-warming GPU to reduce initial latency...")
dummy_tensor = torch.zeros(1, 1).cuda()
del dummy_tensor
torch.cuda.synchronize()
torch.cuda.empty_cache()
logger.info("GPU warm-up complete")
logger.info("Using GPU acceleration for inference with optimized settings")
else:
# If GPU isn't available or supported, optimize for CPU
cmd.extend([
"--threads", str(max(1, os.cpu_count() - 1)), # Use all CPU cores except one
])
logger.info(f"Using CPU-only mode with {max(1, os.cpu_count() - 1)} threads")
logger.info(f"Starting llama-server with command: {' '.join(cmd)}")
process = subprocess.Popen(
cmd,
stdout=subprocess.PIPE,
stderr=subprocess.PIPE,
universal_newlines=True
universal_newlines=True,
env=env
)
# Wait for server to start
time.sleep(2)
# Wait for server to start (longer wait for GPU initialization)
wait_time = 5 if cuda_available and use_gpu else 3
logger.info(f"Waiting {wait_time} seconds for server to start...")
time.sleep(wait_time)
# Check if process started successfully
# Check if process is still running
if process.poll() is None:
logger.info("LLama server started successfully")
# Log initialization success
if cuda_available and use_gpu:
logger.info(f"✅ LLama server started successfully with GPU acceleration{gpu_info}")
else:
logger.info("✅ LLama server started successfully in CPU-only mode")
return True
else:
stdout, stderr = process.communicate()
@ -151,10 +295,15 @@ class LocalLLMService:
Returns: ServerStatus object
"""
try:
base_dir = os.getcwd()
server_path = os.path.join(base_dir, "llama.cpp", "build", "bin", "llama-server")
server_exec_name = os.path.basename(server_path)
for proc in psutil.process_iter(["pid", "name", "cmdline"]):
try:
cmdline = proc.cmdline()
if any("llama-server" in cmd for cmd in cmdline):
# Match either the resolved executable name or a generic "llama-server" reference
if any(server_exec_name in cmd for cmd in cmdline) or any("llama-server" in cmd for cmd in cmdline):
with proc.oneshot():
process_info = ProcessInfo(
pid=proc.pid,

View File

@ -6,6 +6,7 @@ from .api.file_server.handler import FileServerHandler
from .database.migration_manager import MigrationManager
import os
import atexit
import subprocess
def create_app():

View File

@ -7,6 +7,13 @@ from lpm_kernel.configs.logging import get_train_process_logger
logger = get_train_process_logger()
import lpm_kernel.common.strategy.classification as classification
from sentence_transformers import SentenceTransformer
import json
class EmbeddingError(Exception):
"""Custom exception class for embedding-related errors"""
def __init__(self, message, original_error=None):
super().__init__(message)
self.original_error = original_error
class LLMClient:
"""LLM client utility class"""
@ -55,10 +62,8 @@ class LLMClient:
user_llm_config = self.user_llm_config_service.get_available_llm()
if not user_llm_config:
raise Exception("No LLM configuration found")
# Prepare request data
raise EmbeddingError("No LLM configuration found")
try:
# Send request to embedding endpoint
embeddings_array = classification.strategy_classification(user_llm_config, chunked_texts)
@ -81,7 +86,25 @@ class LLMClient:
return embeddings_array
except requests.exceptions.RequestException as e:
raise Exception(f"Failed to get embeddings: {str(e)}")
# Handle request errors
error_msg = f"Request error getting embeddings: {str(e)}"
logger.error(error_msg)
raise EmbeddingError(error_msg, e)
except json.JSONDecodeError as e:
# Handle JSON parsing errors
error_msg = f"Invalid JSON response from embedding API: {str(e)}"
logger.error(error_msg)
raise EmbeddingError(error_msg, e)
except (KeyError, IndexError, ValueError) as e:
# Handle response structure errors
error_msg = f"Invalid response structure from embedding API: {str(e)}"
logger.error(error_msg)
raise EmbeddingError(error_msg, e)
except Exception as e:
# Fallback for any other errors
error_msg = f"Unexpected error getting embeddings: {str(e)}"
logger.error(error_msg, exc_info=True)
raise EmbeddingError(error_msg, e)
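A short sketch of how callers can handle the new exception type and inspect the preserved original error; the failing call below is simulated so the example stands alone:
import logging
logging.basicConfig(level=logging.INFO)
log = logging.getLogger("embedding-example")
def fake_embedding_call():
    """Stand-in for an embedding request that fails downstream."""
    try:
        raise ValueError("malformed response payload")
    except ValueError as e:
        raise EmbeddingError(f"Unexpected error getting embeddings: {e}", e)
try:
    fake_embedding_call()
except EmbeddingError as e:
    cause = type(e.original_error).__name__ if e.original_error else "unknown"
    log.error(f"Embedding failed ({cause}): {e}")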
@property
def chat_credentials(self):

View File

@ -148,6 +148,37 @@ class DocumentService:
self._update_analyze_status_failed(doc.id)
raise
def analyze_document(self, document_id: int) -> DocumentDTO:
"""
Analyze a single document by ID
Args:
document_id (int): ID of document to analyze
Returns:
DocumentDTO: The analyzed document
Raises:
ValueError: If document not found
Exception: If analysis fails
"""
try:
# Get document
document = self._repository.find_one(document_id)
if not document:
raise ValueError(f"Document not found with id: {document_id}")
# Perform analysis
return self._analyze_document(document)
except ValueError as e:
logger.error(f"Document {document_id} not found: {str(e)}")
raise
except Exception as e:
logger.error(f"Error analyzing document {document_id}: {str(e)}", exc_info=True)
self._update_analyze_status_failed(document_id)
raise
def _update_analyze_status_failed(self, doc_id: int) -> None:
"""update status as failed"""
try:
@ -593,6 +624,120 @@ class DocumentService:
logger.error(f"Error deleting file: {str(e)}", exc_info=True)
raise
def fix_missing_document_analysis(self) -> int:
"""Fix documents with missing insights or summaries
Returns:
int: Number of documents fixed
"""
try:
# Find all documents that have analysis issues
docs = self._repository.list()
fixed_count = 0
for doc in docs:
needs_fixing = False
# Check if document needs analysis
if not doc.analyze_status or doc.analyze_status != ProcessStatus.SUCCESS:
needs_fixing = True
logger.info(f"Document {doc.id} needs analysis (status: {doc.analyze_status})")
# Check if document has missing insights or summaries
elif not doc.insight or not doc.summary:
needs_fixing = True
logger.info(f"Document {doc.id} has missing insight or summary")
# Process documents that need fixing
if needs_fixing:
try:
# Process document analysis
self.analyze_document(doc.id)
fixed_count += 1
logger.info(f"Fixed document {doc.id} analysis")
except Exception as e:
logger.error(f"Error fixing document {doc.id} analysis: {str(e)}")
logger.info(f"Fixed {fixed_count} documents with missing analysis")
return fixed_count
except Exception as e:
logger.error(f"Error in fix_missing_document_analysis: {str(e)}")
raise FileProcessingError(f"Failed to fix document analysis: {str(e)}")
def verify_document_embeddings(self, verbose=True) -> Dict:
"""
Verify all document embeddings and return statistics
Args:
verbose (bool): Whether to log detailed information
Returns:
Dict: Statistics about document embeddings
"""
try:
docs = self._repository.list()
results = {
"total_documents": len(docs),
"documents_with_embedding": 0,
"documents_without_embedding": 0,
"documents_with_content": 0,
"documents_without_content": 0,
"documents_with_summary": 0,
"documents_without_summary": 0,
"documents_with_insight": 0,
"documents_without_insight": 0,
"documents_needing_repair": 0,
}
documents_needing_repair = []
for doc in docs:
# Check if document has content
if doc.raw_content:
results["documents_with_content"] += 1
else:
results["documents_without_content"] += 1
# Check if document has summary
if doc.summary:
results["documents_with_summary"] += 1
else:
results["documents_without_summary"] += 1
# Check if document has insight
if doc.insight:
results["documents_with_insight"] += 1
else:
results["documents_without_insight"] += 1
# Check if embeddings exist in ChromaDB
embedding = self.get_document_embedding(doc.id)
if embedding is not None:
results["documents_with_embedding"] += 1
if verbose:
logger.info(f"Document {doc.id}: '{doc.name}' has embedding of dimension {len(embedding)}")
else:
results["documents_without_embedding"] += 1
if verbose:
logger.warning(f"Document {doc.id}: '{doc.name}' missing embedding")
# Check if document needs repair (has content but missing embedding or analysis)
if doc.raw_content and (embedding is None or not doc.summary or not doc.insight):
documents_needing_repair.append(doc.id)
results["documents_needing_repair"] += 1
# Log statistics
logger.info(f"Document embedding verification results: {results}")
if documents_needing_repair and verbose:
logger.info(f"Documents needing repair: {documents_needing_repair}")
return results
except Exception as e:
logger.error(f"Error verifying document embeddings: {str(e)}", exc_info=True)
raise
# create service
document_service = DocumentService()
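A brief usage sketch for the two new maintenance helpers, combining the statistics from verify_document_embeddings with the repair pass (illustrative only, not part of this commit):
stats = document_service.verify_document_embeddings(verbose=False)
if stats["documents_needing_repair"] > 0:
    fixed = document_service.fix_missing_document_analysis()
    print(f"Repaired {fixed} of {stats['documents_needing_repair']} documents flagged for repair")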

View File

@ -418,4 +418,4 @@ class EmbeddingService:
raise
except Exception as e:
logger.error(f"Error searching similar chunks: {str(e)}")
raise
raise

View File

@ -23,6 +23,7 @@ class TrainingParamsManager:
"number_of_epochs": 3,
"concurrency_threads": 2,
"data_synthesis_mode": "low",
"use_cuda": False, # Default to using CUDA when available
"is_cot": False
}

View File

@ -42,7 +42,7 @@ class TrainProcessService:
_instance = None
_initialized = False
def __new__(cls, *args, **kwargs):
if cls._instance is None:
cls._instance = super().__new__(cls)
@ -633,6 +633,7 @@ class TrainProcessService:
num_train_epochs = training_params.get("number_of_epochs")
concurrency_threads = training_params.get("concurrency_threads")
data_synthesis_mode = training_params.get("data_synthesis_mode")
use_cuda = training_params.get("use_cuda", False)
is_cot = training_params.get("is_cot", False)
# Log training parameters
@ -641,6 +642,8 @@ class TrainProcessService:
logger.info(f" Number of epochs: {num_train_epochs}")
logger.info(f" Concurrency threads: {concurrency_threads}")
logger.info(f" Data synthesis mode: {data_synthesis_mode}")
logger.info(f" Use CUDA: {use_cuda}")
logger.info(f" Is CoT: {is_cot}")
# Prepare arguments for the script
# Build command line arguments, need to include script path as the first parameter
@ -650,6 +653,7 @@ class TrainProcessService:
"--epochs", str(num_train_epochs),
"--threads", str(concurrency_threads),
"--mode", str(data_synthesis_mode),
"--cuda", str(use_cuda),
"--is_cot", str(is_cot)
]
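Because use_cuda is forwarded as the string "True" or "False", the receiving training script must convert it back to a boolean explicitly (argparse's type=bool would treat any non-empty string, including "False", as truthy). A minimal sketch of that conversion, with argument names mirroring the command above; the training script's actual parser is not shown in this commit:
import argparse
def str2bool(value: str) -> bool:
    """Interpret forwarded 'True'/'False' strings as booleans."""
    return value.strip().lower() in ("1", "true", "yes", "y")
parser = argparse.ArgumentParser()
parser.add_argument("--cuda", type=str2bool, default=False)
parser.add_argument("--is_cot", type=str2bool, default=False)
args = parser.parse_args(["--cuda", "True", "--is_cot", "False"])
assert args.cuda is True and args.is_cot is False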

69
scripts/prompt_cuda.bat Normal file
View File

@ -0,0 +1,69 @@
@echo off
REM Script to prompt user for CUDA support preference
echo === CUDA Support Selection ===
echo.
echo Do you want to build with NVIDIA GPU (CUDA) support?
echo This requires an NVIDIA GPU and proper NVIDIA Docker runtime configuration.
echo.
set /p choice="Build with CUDA support? (y/n): "
if /i "%choice%"=="y" goto cuda
if /i "%choice%"=="yes" goto cuda
goto nocuda
:cuda
echo Selected: Build WITH CUDA support
REM Create or update .env file with the Dockerfile selection
if exist .env (
REM Check if variable already exists in file
findstr /c:"DOCKER_BACKEND_DOCKERFILE" .env >nul
if not errorlevel 1 (
REM Update existing variable
powershell -Command "(Get-Content .env) -replace '^DOCKER_BACKEND_DOCKERFILE=.*', 'DOCKER_BACKEND_DOCKERFILE=Dockerfile.backend.cuda' | Set-Content .env"
) else (
REM Append to file with newline before
echo.>> .env
echo DOCKER_BACKEND_DOCKERFILE=Dockerfile.backend.cuda>> .env
)
) else (
REM Create new file
echo DOCKER_BACKEND_DOCKERFILE=Dockerfile.backend.cuda> .env
)
REM Create a flag file to indicate GPU use
echo GPU > .gpu_selected
echo Environment set to build with CUDA support
goto end
:nocuda
echo Selected: Build WITHOUT CUDA support (CPU only)
REM Create or update .env file with the Dockerfile selection
if exist .env (
REM Check if variable already exists in file
findstr /c:"DOCKER_BACKEND_DOCKERFILE" .env >nul
if not errorlevel 1 (
REM Update existing variable
powershell -Command "(Get-Content .env) -replace '^DOCKER_BACKEND_DOCKERFILE=.*', 'DOCKER_BACKEND_DOCKERFILE=Dockerfile.backend' | Set-Content .env"
) else (
REM Append to file with newline before
echo.>> .env
echo DOCKER_BACKEND_DOCKERFILE=Dockerfile.backend>> .env
)
) else (
REM Create new file
echo DOCKER_BACKEND_DOCKERFILE=Dockerfile.backend> .env
)
REM Remove any GPU flag file if it exists
if exist .gpu_selected (
del .gpu_selected
)
echo Environment set to build without CUDA support
:end
echo === CUDA Selection Complete ===

62
scripts/prompt_cuda.sh Normal file
View File

@ -0,0 +1,62 @@
#!/bin/bash
# Script to prompt user for CUDA support preference and record the matching Dockerfile selection in .env
echo "=== CUDA Support Selection ==="
echo ""
echo "Do you want to build with NVIDIA GPU (CUDA) support?"
echo "This requires an NVIDIA GPU and proper NVIDIA Docker runtime configuration."
echo ""
read -p "Build with CUDA support? (y/n): " choice
case "$choice" in
y|Y|yes|YES|Yes )
echo "Selected: Build WITH CUDA support"
# Create or update .env file with the Dockerfile selection
if [ -f .env ]; then
# Update existing file
if grep -q "DOCKER_BACKEND_DOCKERFILE" .env; then
sed -i 's/^DOCKER_BACKEND_DOCKERFILE=.*/DOCKER_BACKEND_DOCKERFILE=Dockerfile.backend.cuda/' .env
else
# Add a newline before appending new content
echo "" >> .env
echo "DOCKER_BACKEND_DOCKERFILE=Dockerfile.backend.cuda" >> .env
fi
else
# Create new file
echo "DOCKER_BACKEND_DOCKERFILE=Dockerfile.backend.cuda" > .env
fi
# Create a flag file to indicate GPU use
echo "GPU" > .gpu_selected
echo "Environment set to build with CUDA support"
;;
* )
echo "Selected: Build WITHOUT CUDA support (CPU only)"
# Create or update .env file with the Dockerfile selection
if [ -f .env ]; then
# Update existing file
if grep -q "DOCKER_BACKEND_DOCKERFILE" .env; then
sed -i 's/^DOCKER_BACKEND_DOCKERFILE=.*/DOCKER_BACKEND_DOCKERFILE=Dockerfile.backend/' .env
else
# Add a newline before appending new content
echo "" >> .env
echo "DOCKER_BACKEND_DOCKERFILE=Dockerfile.backend" >> .env
fi
else
# Create new file
echo "DOCKER_BACKEND_DOCKERFILE=Dockerfile.backend" > .env
fi
# Remove any GPU flag file if it exists
if [ -f .gpu_selected ]; then
rm .gpu_selected
fi
echo "Environment set to build without CUDA support"
;;
esac
echo "=== CUDA Selection Complete ==="