Added CUDA support (#228)
* Add CUDA support: CUDA detection, memory handling, and Ollama model release after training
* Fix logging issue: added a CUDA support flag so the log accurately reflects the CUDA toggle
* Update llama.cpp rebuild: when CUDA support is enabled, llama.cpp is rebuilt only during the first build rather than on each run
* Improve VRAM management: enabled memory pinning and optimizer state offload
* Fix CUDA check: rewrote the llama.cpp rebuild logic and added a manual y/n toggle for enabling CUDA support
* Add fast restart and fix the CUDA check command: added make docker-restart-backend-fast to restart the backend and pick up code changes without a full llama.cpp rebuild; fixed make docker-check-cuda to correctly report CUDA support
* Add docker-compose.gpu.yml: fixes errors on machines without an NVIDIA GPU, and ensure "\n" is added before modifying .env
* Fix CUDA toggle: the previous push accidentally broke it
* Code review fixes: added return save_path to the end of the save_hf_model function and rolled back the download_file_with_progress function
* Update Makefile: use CUDA by default when using docker-restart-backend-fast
* Minor cleanup: removed an unnecessary Makefile command and fixed GPU logging
* Delete .gpu_selected
* Simplify CUDA training code: removed the explicit dtype setting (torch handles it automatically), removed VRAM logging, and removed unnecessary/old comments
* Fix GPU/CPU selection: made the make docker-use-gpu/cpu commands work with the .gpu_selected flag, and changed make docker-restart-backend-fast to respect the flag instead of always using the GPU
* Fix Ollama embedding error: added a custom exception class for Ollama embeddings, which appear to return keyword arguments while the base Python exception class only accepts positional ones
* Fix model selection and memory error: training no longer defaults to the 0.5B model regardless of selection, and the "free(): double free detected in tcache 2" error caused by the CUDA flag being passed incorrectly is resolved
Parent: 71d54a5b0b
Commit: 053090937d

.env (2 lines changed)
@@ -65,3 +65,5 @@ DOCUMENT_CHUNK_OVERLAP=200
# Embedding configurations
EMBEDDING_MAX_TEXT_LENGTH=3072

DOCKER_BACKEND_DOCKERFILE=Dockerfile.backend.cuda

@@ -75,3 +75,8 @@ run/*
.backend.pid
.frontend.pid
logs/train/
llama_cpp_backup/llama.cpp.zip
scripts/check_cuda_status.ps1
scripts/test_cuda_detection.bat
.env
.gpu_selected
|
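The DOCKER_BACKEND_DOCKERFILE setting above works together with a .gpu_selected flag file that the Makefile later in this diff checks to decide between docker-compose.yml and docker-compose-gpu.yml. A minimal Python sketch of that selection logic (illustrative only, not part of the PR):

```python
from pathlib import Path

def compose_file(repo_root: Path = Path(".")) -> str:
    # Mirrors the Makefile logic in this PR: `make docker-use-gpu` creates a
    # .gpu_selected flag file, and its presence switches later docker commands
    # to the GPU compose file.
    return "docker-compose-gpu.yml" if (repo_root / ".gpu_selected").exists() else "docker-compose.yml"
```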
@ -1,3 +1,6 @@
|
|||
# Base image selection is handled by the Makefile or build script
|
||||
# For CUDA support: FROM nvidia/cuda:12.8.1-devel-ubuntu22.04
|
||||
# For CPU-only: FROM python:3.12
|
||||
FROM python:3.12
|
||||
|
||||
# Set working directory
|
||||
|
@ -19,28 +22,31 @@ RUN mkdir -p /app/dependencies /app/data/sqlite /app/data/chroma_db /app/logs /a
|
|||
COPY dependencies/graphrag-1.2.1.dev27.tar.gz /app/dependencies/
|
||||
COPY dependencies/llama.cpp.zip /app/dependencies/
|
||||
|
||||
# Build llama.cpp
|
||||
# Copy GPU checker script (only used for status reporting, not rebuilding)
|
||||
COPY docker/app/check_gpu_support.sh /app/
|
||||
COPY docker/app/check_torch_cuda.py /app/
|
||||
RUN chmod +x /app/check_gpu_support.sh
|
||||
|
||||
# Unpack prebuilt llama.cpp CPU binary (no build or GPU detection here)
|
||||
RUN LLAMA_LOCAL_ZIP="dependencies/llama.cpp.zip" \
|
||||
&& echo "Using local llama.cpp archive..." \
|
||||
&& unzip -q "$LLAMA_LOCAL_ZIP" \
|
||||
&& cd llama.cpp \
|
||||
&& mkdir -p build && cd build \
|
||||
&& cmake .. \
|
||||
&& cmake --build . --config Release \
|
||||
&& if [ ! -f "bin/llama-server" ]; then \
|
||||
echo "Build failed: llama-server executable not found" && exit 1; \
|
||||
else \
|
||||
echo "Successfully built llama-server"; \
|
||||
fi
|
||||
&& mkdir -p build \
|
||||
&& cp -r bin build/ 2>/dev/null || true \
|
||||
&& chmod +x /app/llama.cpp/build/bin/llama-server /app/llama.cpp/build/bin/llama-cli 2>/dev/null || true
|
||||
|
||||
# Mark as CPU-only build for runtime reference
|
||||
RUN mkdir -p /app/data && \
|
||||
echo "{ \"gpu_optimized\": false, \"optimized_on\": \"$(date -u +\"%Y-%m-%dT%H:%M:%SZ\")\" }" > /app/data/gpu_optimized.json && \
|
||||
echo "Created CPU-only marker file"
|
||||
|
||||
#
|
||||
# Copy project configuration - Files that occasionally change
|
||||
COPY pyproject.toml README.md /app/
|
||||
|
||||
RUN poetry install --no-interaction --no-root
|
||||
RUN pip install --force-reinstall dependencies/graphrag-1.2.1.dev27.tar.gz
|
||||
|
||||
|
||||
# Copy source code - Files that frequently change
|
||||
COPY docker/ /app/docker/
|
||||
COPY lpm_kernel/ /app/lpm_kernel/
|
||||
|
@ -61,5 +67,5 @@ ENV PYTHONUNBUFFERED=1 \
|
|||
# Expose ports
|
||||
EXPOSE 8002 8080
|
||||
|
||||
# Set the startup command
|
||||
CMD ["bash", "-c", "echo \"Checking SQLite database...\" && if [ ! -s /app/data/sqlite/lpm.db ]; then echo \"SQLite database not found or empty, initializing...\" && mkdir -p /app/data/sqlite && sqlite3 /app/data/sqlite/lpm.db \".read /app/docker/sqlite/init.sql\" && echo \"SQLite database initialized successfully\" && echo \"Tables created:\" && sqlite3 /app/data/sqlite/lpm.db \".tables\"; else echo \"SQLite database already exists, skipping initialization\"; fi && echo \"Checking ChromaDB...\" && if [ ! -d /app/data/chroma_db/documents ] || [ ! -d /app/data/chroma_db/document_chunks ]; then echo \"ChromaDB collections not found, initializing...\" && python /app/docker/app/init_chroma.py && echo \"ChromaDB initialized successfully\"; else echo \"ChromaDB already exists, skipping initialization\"; fi && echo \"Starting application at $(date)\" >> /app/logs/backend.log && cd /app && python -m flask run --host=0.0.0.0 --port=${LOCAL_APP_PORT:-8002} >> /app/logs/backend.log 2>&1"]
|
||||
# Set the startup command - CUDA check/rebuild removed since it's now handled at build time
|
||||
CMD ["bash", "-c", "echo 'Checking SQLite database...' && if [ ! -s /app/data/sqlite/lpm.db ]; then echo 'SQLite database not found or empty, initializing...' && mkdir -p /app/data/sqlite && sqlite3 /app/data/sqlite/lpm.db '.read /app/docker/sqlite/init.sql' && echo 'SQLite database initialized successfully' && echo 'Tables created:' && sqlite3 /app/data/sqlite/lpm.db '.tables'; else echo 'SQLite database already exists, skipping initialization'; fi && echo 'Checking ChromaDB...' && if [ ! -d /app/data/chroma_db/documents ] || [ ! -d /app/data/chroma_db/document_chunks ]; then echo 'ChromaDB collections not found, initializing...' && python /app/docker/app/init_chroma.py && echo 'ChromaDB initialized successfully'; else echo 'ChromaDB already exists, skipping initialization'; fi && echo 'Starting application at ' $(date) >> /app/logs/backend.log && cd /app && python -m flask run --host=0.0.0.0 --port=${LOCAL_APP_PORT:-8002} >> /app/logs/backend.log 2>&1"]
|
||||
|
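Both backend Dockerfiles (and the runtime rebuild script further down) write a small gpu_optimized.json marker that check_gpu_support.sh later reads. A sketch of writing and reading that marker in Python; the helper names are mine, only the JSON shape comes from the diff:

```python
import datetime
import json
from pathlib import Path

MARKER = Path("/app/data/gpu_optimized.json")

def write_marker(gpu_optimized: bool) -> None:
    # Same JSON shape the Dockerfiles emit at build time.
    MARKER.parent.mkdir(parents=True, exist_ok=True)
    MARKER.write_text(json.dumps({
        "gpu_optimized": gpu_optimized,
        "optimized_on": datetime.datetime.utcnow().strftime("%Y-%m-%dT%H:%M:%SZ"),
    }))

def read_marker() -> bool:
    return MARKER.exists() and json.loads(MARKER.read_text()).get("gpu_optimized", False)
```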
|
|
@ -0,0 +1,111 @@
|
|||
FROM nvidia/cuda:12.8.1-devel-ubuntu24.04
|
||||
|
||||
# Set working directory
|
||||
WORKDIR /app
|
||||
|
||||
# Add build argument to conditionally skip llama.cpp build
|
||||
ARG SKIP_LLAMA_BUILD=false
|
||||
|
||||
# Install system dependencies with noninteractive mode to avoid prompts
|
||||
RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y \
|
||||
build-essential cmake git curl wget lsof vim unzip sqlite3 \
|
||||
python3-pip python3-venv python3-full python3-poetry pipx \
|
||||
&& apt-get clean \
|
||||
&& rm -rf /var/lib/apt/lists/* \
|
||||
&& ln -sf /usr/bin/python3 /usr/bin/python
|
||||
|
||||
# Create a virtual environment to avoid PEP 668 restrictions
|
||||
RUN python -m venv /app/venv
|
||||
ENV PATH="/app/venv/bin:$PATH"
|
||||
ENV VIRTUAL_ENV="/app/venv"
|
||||
|
||||
# Use the virtual environment's pip to install packages
|
||||
RUN pip install --upgrade pip \
|
||||
&& pip install poetry \
|
||||
&& poetry config virtualenvs.create false
|
||||
|
||||
# Create directories
|
||||
RUN mkdir -p /app/dependencies /app/data/sqlite /app/data/chroma_db /app/logs /app/run /app/resources
|
||||
|
||||
# Copy dependency files - Files that rarely change
|
||||
COPY dependencies/graphrag-1.2.1.dev27.tar.gz /app/dependencies/
|
||||
COPY dependencies/llama.cpp.zip /app/dependencies/
|
||||
|
||||
# Copy GPU checker script
|
||||
COPY docker/app/check_gpu_support.sh /app/
|
||||
COPY docker/app/check_torch_cuda.py /app/
|
||||
RUN chmod +x /app/check_gpu_support.sh
|
||||
|
||||
# Unpack llama.cpp and build with CUDA support (conditionally, based on SKIP_LLAMA_BUILD)
|
||||
RUN if [ "$SKIP_LLAMA_BUILD" = "false" ]; then \
|
||||
echo "=====================================================================" && \
|
||||
echo "STARTING LLAMA.CPP BUILD WITH CUDA SUPPORT - THIS WILL TAKE SOME TIME" && \
|
||||
echo "=====================================================================" && \
|
||||
LLAMA_LOCAL_ZIP="dependencies/llama.cpp.zip" && \
|
||||
echo "Using local llama.cpp archive..." && \
|
||||
unzip -q "$LLAMA_LOCAL_ZIP" && \
|
||||
cd llama.cpp && \
|
||||
mkdir -p build && \
|
||||
cd build && \
|
||||
echo "Starting CMake configuration with CUDA support..." && \
|
||||
cmake -DGGML_CUDA=ON \
|
||||
-DCMAKE_BUILD_TYPE=Release \
|
||||
-DBUILD_SHARED_LIBS=OFF \
|
||||
-DLLAMA_NATIVE=OFF \
|
||||
-DCMAKE_CUDA_FLAGS="-Wno-deprecated-gpu-targets" \
|
||||
.. && \
|
||||
echo "Starting build process (this will take several minutes)..." && \
|
||||
cmake --build . --config Release -j --verbose && \
|
||||
echo "Build completed successfully" && \
|
||||
chmod +x /app/llama.cpp/build/bin/llama-server /app/llama.cpp/build/bin/llama-cli && \
|
||||
echo "====================================================================" && \
|
||||
echo "CUDA BUILD COMPLETED SUCCESSFULLY! GPU ACCELERATION IS NOW AVAILABLE" && \
|
||||
echo "===================================================================="; \
|
||||
else \
|
||||
echo "=====================================================================" && \
|
||||
echo "SKIPPING LLAMA.CPP BUILD (SKIP_LLAMA_BUILD=$SKIP_LLAMA_BUILD)" && \
|
||||
echo "Using existing llama.cpp build from Docker volume" && \
|
||||
echo "=====================================================================" && \
|
||||
LLAMA_LOCAL_ZIP="dependencies/llama.cpp.zip" && \
|
||||
echo "Just unpacking llama.cpp archive (no build)..." && \
|
||||
unzip -q "$LLAMA_LOCAL_ZIP" && \
|
||||
cd llama.cpp && \
|
||||
mkdir -p build; \
|
||||
fi
|
||||
|
||||
# Mark as GPU-optimized build for runtime reference
|
||||
RUN mkdir -p /app/data && \
|
||||
echo "{ \"gpu_optimized\": true, \"optimized_on\": \"$(date -u +\"%Y-%m-%dT%H:%M:%SZ\")\" }" > /app/data/gpu_optimized.json && \
|
||||
echo "Created GPU-optimized marker file"
|
||||
|
||||
# Copy project configuration - Files that occasionally change
|
||||
COPY pyproject.toml README.md /app/
|
||||
|
||||
# Fix for potential package installation issues with Poetry
|
||||
RUN pip install --upgrade setuptools wheel
|
||||
RUN poetry install --no-interaction --no-root || poetry install --no-interaction --no-root --without dev
|
||||
RUN pip install --force-reinstall dependencies/graphrag-1.2.1.dev27.tar.gz
|
||||
|
||||
# Copy source code - Files that frequently change
|
||||
COPY docker/ /app/docker/
|
||||
COPY lpm_kernel/ /app/lpm_kernel/
|
||||
|
||||
# Check module import
|
||||
RUN python -c "import lpm_kernel; print('Module import check passed')"
|
||||
|
||||
# Set environment variables
|
||||
ENV PYTHONUNBUFFERED=1 \
|
||||
PYTHONPATH=/app \
|
||||
BASE_DIR=/app/data \
|
||||
LOCAL_LOG_DIR=/app/logs \
|
||||
RUN_DIR=/app/run \
|
||||
RESOURCES_DIR=/app/resources \
|
||||
APP_ROOT=/app \
|
||||
FLASK_APP=lpm_kernel.app \
|
||||
LD_LIBRARY_PATH=/usr/local/cuda/lib64:$LD_LIBRARY_PATH
|
||||
|
||||
# Expose ports
|
||||
EXPOSE 8002 8080
|
||||
|
||||
# Set the startup command
|
||||
CMD ["bash", "-c", "echo 'Checking SQLite database...' && if [ ! -s /app/data/sqlite/lpm.db ]; then echo 'SQLite database not found or empty, initializing...' && mkdir -p /app/data/sqlite && sqlite3 /app/data/sqlite/lpm.db '.read /app/docker/sqlite/init.sql' && echo 'SQLite database initialized successfully' && echo 'Tables created:' && sqlite3 /app/data/sqlite/lpm.db '.tables'; else echo 'SQLite database already exists, skipping initialization'; fi && echo 'Checking ChromaDB...' && if [ ! -d /app/data/chroma_db/documents ] || [ ! -d /app/data/chroma_db/document_chunks ]; then echo 'ChromaDB collections not found, initializing...' && python /app/docker/app/init_chroma.py && echo 'ChromaDB initialized successfully'; else echo 'ChromaDB already exists, skipping initialization'; fi && echo 'Starting application at ' $(date) >> /app/logs/backend.log && cd /app && python -m flask run --host=0.0.0.0 --port=${LOCAL_APP_PORT:-8002} >> /app/logs/backend.log 2>&1"]
|
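The CUDA Dockerfile above accepts a SKIP_LLAMA_BUILD build argument so the expensive llama.cpp CUDA build can be skipped, and the Makefile's docker-restart-backend-fast target (below) passes it through. Roughly what that target does, sketched in Python under the assumption that docker compose is on PATH:

```python
import subprocess

def fast_restart_backend(compose_file: str = "docker-compose-gpu.yml") -> None:
    # Rebuild the backend image without redoing the llama.cpp CUDA build,
    # then recreate the container (mirrors `make docker-restart-backend-fast`).
    base = ["docker", "compose", "-f", compose_file]
    subprocess.run(base + ["stop", "backend"], check=True)
    subprocess.run(base + ["rm", "-f", "backend"], check=True)
    subprocess.run(base + ["build", "--build-arg", "SKIP_LLAMA_BUILD=true", "backend"], check=True)
    subprocess.run(base + ["up", "-d", "backend"], check=True)
```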
Makefile (141 lines changed)
|
@ -1,4 +1,13 @@
|
|||
.PHONY: install test format lint all setup start stop restart restart-backend restart-force help docker-build docker-up docker-down docker-build-backend docker-build-frontend docker-restart-backend docker-restart-frontend docker-restart-all
|
||||
.PHONY: install test format lint all setup start stop restart restart-backend restart-force help docker-build docker-up docker-down docker-build-backend docker-build-frontend docker-restart-backend docker-restart-backend-fast docker-restart-backend-smart docker-restart-frontend docker-restart-all docker-check-cuda docker-use-gpu docker-use-cpu
|
||||
|
||||
# Check for GPU flag file and set Docker Compose file accordingly
|
||||
ifeq ($(wildcard .gpu_selected),)
|
||||
# No GPU flag file found, use CPU configuration
|
||||
DOCKER_COMPOSE_FILE := docker-compose.yml
|
||||
else
|
||||
# GPU flag file found, use GPU configuration
|
||||
DOCKER_COMPOSE_FILE := docker-compose-gpu.yml
|
||||
endif
|
||||
|
||||
# Detect operating system and set environment
|
||||
ifeq ($(OS),Windows_NT)
|
||||
|
@ -39,6 +48,9 @@ else
|
|||
COLOR_RED := \033[1;31m
|
||||
endif
|
||||
|
||||
# Default Docker Compose configuration (non-GPU)
|
||||
DOCKER_COMPOSE_FILE := docker-compose.yml
|
||||
|
||||
# Show help message
|
||||
help:
|
||||
ifeq ($(WINDOWS),1)
|
||||
|
@ -69,8 +81,12 @@ ifeq ($(WINDOWS),1)
|
|||
@echo make docker-build-backend - Build only backend Docker image
|
||||
@echo make docker-build-frontend - Build only frontend Docker image
|
||||
@echo make docker-restart-backend - Restart only backend container
|
||||
@echo make docker-restart-backend-fast - Restart backend+cuda without rebuilding llama.cpp
|
||||
@echo make docker-restart-frontend - Restart only frontend container
|
||||
@echo make docker-restart-all - Restart all Docker containers
|
||||
@echo make docker-check-cuda - Check CUDA support in containers
|
||||
@echo make docker-use-gpu - Switch to GPU configuration
|
||||
@echo make docker-use-cpu - Switch to CPU-only configuration
|
||||
@echo.
|
||||
@echo All Available Commands:
|
||||
@echo make help - Show this help message
|
||||
|
@ -106,9 +122,13 @@ else
|
|||
@echo " make docker-down - Stop all Docker containers"
|
||||
@echo " make docker-build-backend - Build only backend Docker image"
|
||||
@echo " make docker-build-frontend - Build only frontend Docker image"
|
||||
@echo " make docker-restart-backend - Restart only backend container"
|
||||
@echo " make docker-restart-backend - Restart only backend container (with rebuild)"
|
||||
@echo " make docker-restart-backend-fast - Restart backend+cuda without rebuilding llama.cpp"
|
||||
@echo " make docker-restart-frontend - Restart only frontend container"
|
||||
@echo " make docker-restart-all - Restart all Docker containers"
|
||||
@echo " make docker-check-cuda - Check CUDA support in containers"
|
||||
@echo " make docker-use-gpu - Switch to GPU configuration"
|
||||
@echo " make docker-use-cpu - Switch to CPU-only configuration"
|
||||
@echo ""
|
||||
@echo "$(COLOR_BOLD)All Available Commands:$(COLOR_RESET)"
|
||||
@echo " make help - Show this help message"
|
||||
|
@ -124,6 +144,27 @@ else
|
|||
fi
|
||||
endif
|
||||
|
||||
# Configuration switchers for Docker
|
||||
docker-use-gpu:
|
||||
@echo "Switching to GPU configuration..."
|
||||
ifeq ($(WINDOWS),1)
|
||||
@echo GPU mode enabled. Docker commands will use docker-compose-gpu.yml
|
||||
@echo gpu > .gpu_selected
|
||||
else
|
||||
@echo "$(COLOR_GREEN)GPU mode enabled. Docker commands will use docker-compose-gpu.yml$(COLOR_RESET)"
|
||||
@echo "gpu" > .gpu_selected
|
||||
endif
|
||||
|
||||
docker-use-cpu:
|
||||
@echo "Switching to CPU-only configuration..."
|
||||
ifeq ($(WINDOWS),1)
|
||||
@echo CPU-only mode enabled. Docker commands will use docker-compose.yml
|
||||
@rm -f .gpu_selected
|
||||
else
|
||||
@echo "$(COLOR_GREEN)CPU-only mode enabled. Docker commands will use docker-compose.yml$(COLOR_RESET)"
|
||||
@rm -f .gpu_selected
|
||||
endif
|
||||
|
||||
setup:
|
||||
./scripts/setup.sh
|
||||
|
||||
|
@ -156,37 +197,99 @@ DOCKER_COMPOSE_CMD := $(shell if command -v docker-compose >/dev/null 2>&1; then
|
|||
endif
|
||||
|
||||
docker-build:
|
||||
$(DOCKER_COMPOSE_CMD) build
|
||||
ifeq ($(WINDOWS),1)
|
||||
@echo "Prompting for CUDA preference..."
|
||||
@scripts\prompt_cuda.bat
|
||||
else
|
||||
@echo "Prompting for CUDA preference..."
|
||||
@chmod +x ./scripts/prompt_cuda.sh
|
||||
@./scripts/prompt_cuda.sh
|
||||
endif
|
||||
$(DOCKER_COMPOSE_CMD) -f $(DOCKER_COMPOSE_FILE) build
|
||||
|
||||
docker-up:
|
||||
$(DOCKER_COMPOSE_CMD) up -d
|
||||
@echo "Building and starting Docker containers..."
|
||||
ifeq ($(WINDOWS),1)
|
||||
@echo "Prompting for CUDA preference..."
|
||||
@scripts\prompt_cuda.bat
|
||||
@echo "Checking CUDA preference..."
|
||||
@cmd /c "if exist .gpu_selected ( echo CUDA support detected, using GPU configuration... & docker compose -f docker-compose-gpu.yml build --no-cache & docker compose -f docker-compose-gpu.yml up -d ) else ( echo No CUDA support selected, using CPU-only configuration... & docker compose -f docker-compose.yml build --no-cache & docker compose -f docker-compose.yml up -d )"
|
||||
else
|
||||
@echo "Prompting for CUDA preference..."
|
||||
@chmod +x ./scripts/prompt_cuda.sh
|
||||
@./scripts/prompt_cuda.sh
|
||||
@echo "Checking CUDA preference..."
|
||||
@if [ -f .gpu_selected ]; then \
|
||||
echo "CUDA support detected, using GPU configuration..."; \
|
||||
$(DOCKER_COMPOSE_CMD) -f docker-compose-gpu.yml build; \
|
||||
$(DOCKER_COMPOSE_CMD) -f docker-compose-gpu.yml up -d; \
|
||||
else \
|
||||
echo "No CUDA support selected, using CPU-only configuration..."; \
|
||||
$(DOCKER_COMPOSE_CMD) -f docker-compose.yml build; \
|
||||
$(DOCKER_COMPOSE_CMD) -f docker-compose.yml up -d; \
|
||||
fi
|
||||
endif
|
||||
@echo "Container startup complete"
|
||||
@echo "Check CUDA support with: make docker-check-cuda"
|
||||
|
||||
docker-down:
|
||||
$(DOCKER_COMPOSE_CMD) down
|
||||
$(DOCKER_COMPOSE_CMD) -f $(DOCKER_COMPOSE_FILE) down
|
||||
|
||||
docker-build-backend:
|
||||
$(DOCKER_COMPOSE_CMD) build backend
|
||||
$(DOCKER_COMPOSE_CMD) -f $(DOCKER_COMPOSE_FILE) build backend
|
||||
|
||||
docker-build-frontend:
|
||||
$(DOCKER_COMPOSE_CMD) build frontend
|
||||
$(DOCKER_COMPOSE_CMD) -f $(DOCKER_COMPOSE_FILE) build frontend
|
||||
|
||||
# Standard backend restart with complete rebuild
|
||||
docker-restart-backend:
|
||||
$(DOCKER_COMPOSE_CMD) stop backend
|
||||
$(DOCKER_COMPOSE_CMD) rm -f backend
|
||||
$(DOCKER_COMPOSE_CMD) build backend || { echo "$(COLOR_RED)❌ Backend build failed! Aborting operation...$(COLOR_RESET)"; exit 1; }
|
||||
$(DOCKER_COMPOSE_CMD) up -d backend
|
||||
$(DOCKER_COMPOSE_CMD) -f $(DOCKER_COMPOSE_FILE) stop backend
|
||||
$(DOCKER_COMPOSE_CMD) -f $(DOCKER_COMPOSE_FILE) rm -f backend
|
||||
$(DOCKER_COMPOSE_CMD) -f $(DOCKER_COMPOSE_FILE) build backend || { echo "$(COLOR_RED)❌ Backend build failed! Aborting operation...$(COLOR_RESET)"; exit 1; }
|
||||
$(DOCKER_COMPOSE_CMD) -f $(DOCKER_COMPOSE_FILE) up -d backend
|
||||
|
||||
|
||||
# Fast backend restart: preserves llama.cpp build
|
||||
docker-restart-backend-fast:
|
||||
@echo "Smart restarting backend container (preserving llama.cpp build)..."
|
||||
@echo "Stopping backend container..."
|
||||
$(DOCKER_COMPOSE_CMD) -f $(DOCKER_COMPOSE_FILE) stop backend
|
||||
@echo "Removing backend container..."
|
||||
$(DOCKER_COMPOSE_CMD) -f $(DOCKER_COMPOSE_FILE) rm -f backend
|
||||
@echo "Building backend image with build-arg to skip llama.cpp build..."
|
||||
ifeq ($(wildcard .gpu_selected),)
|
||||
@echo "Using CPU configuration (docker-compose.yml)..."
|
||||
else
|
||||
@echo "Using GPU configuration (docker-compose-gpu.yml)..."
|
||||
endif
|
||||
$(DOCKER_COMPOSE_CMD) -f $(DOCKER_COMPOSE_FILE) build --build-arg SKIP_LLAMA_BUILD=true backend || { echo "$(COLOR_RED)❌ Backend build failed! Aborting operation...$(COLOR_RESET)"; exit 1; }
|
||||
@echo "Starting backend container..."
|
||||
$(DOCKER_COMPOSE_CMD) -f $(DOCKER_COMPOSE_FILE) up -d backend
|
||||
@echo "Backend container smart-restarted successfully"
|
||||
@echo "Check CUDA support with: make docker-check-cuda"
|
||||
|
||||
docker-restart-frontend:
|
||||
$(DOCKER_COMPOSE_CMD) stop frontend
|
||||
$(DOCKER_COMPOSE_CMD) rm -f frontend
|
||||
$(DOCKER_COMPOSE_CMD) build frontend || { echo "$(COLOR_RED)❌ Frontend build failed! Aborting operation...$(COLOR_RESET)"; exit 1; }
|
||||
$(DOCKER_COMPOSE_CMD) up -d frontend
|
||||
$(DOCKER_COMPOSE_CMD) -f $(DOCKER_COMPOSE_FILE) stop frontend
|
||||
$(DOCKER_COMPOSE_CMD) -f $(DOCKER_COMPOSE_FILE) rm -f frontend
|
||||
$(DOCKER_COMPOSE_CMD) -f $(DOCKER_COMPOSE_FILE) build frontend || { echo "$(COLOR_RED)❌ Frontend build failed! Aborting operation...$(COLOR_RESET)"; exit 1; }
|
||||
$(DOCKER_COMPOSE_CMD) -f $(DOCKER_COMPOSE_FILE) up -d frontend
|
||||
|
||||
docker-restart-all:
|
||||
$(DOCKER_COMPOSE_CMD) stop
|
||||
$(DOCKER_COMPOSE_CMD) rm -f
|
||||
$(DOCKER_COMPOSE_CMD) build || { echo "$(COLOR_RED)❌ Build failed! Aborting operation...$(COLOR_RESET)"; exit 1; }
|
||||
$(DOCKER_COMPOSE_CMD) up -d
|
||||
$(DOCKER_COMPOSE_CMD) -f $(DOCKER_COMPOSE_FILE) stop
|
||||
$(DOCKER_COMPOSE_CMD) -f $(DOCKER_COMPOSE_FILE) rm -f
|
||||
$(DOCKER_COMPOSE_CMD) -f $(DOCKER_COMPOSE_FILE) build || { echo "$(COLOR_RED)❌ Build failed! Aborting operation...$(COLOR_RESET)"; exit 1; }
|
||||
$(DOCKER_COMPOSE_CMD) -f $(DOCKER_COMPOSE_FILE) up -d
|
||||
|
||||
# New command to check CUDA support in containers
|
||||
docker-check-cuda:
|
||||
@echo "Checking CUDA support in Docker containers..."
|
||||
ifeq ($(WINDOWS),1)
|
||||
@echo Running CUDA support check in backend container
|
||||
@docker exec second-me-backend /app/check_gpu_support.sh || echo No GPU support detected in backend container
|
||||
else
|
||||
@echo "$(COLOR_CYAN)Running CUDA support check in backend container:$(COLOR_RESET)"
|
||||
@docker exec second-me-backend /app/check_gpu_support.sh || echo "$(COLOR_RED)No GPU support detected in backend container$(COLOR_RESET)"
|
||||
endif
|
||||
|
||||
install:
|
||||
poetry install
|
||||
|
|
|
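Before the compose files, a quick illustration of what make docker-check-cuda (defined above) amounts to: run the bundled check script inside the backend container and report the result. A sketch only; the container name comes from the compose files below:

```python
import subprocess

def docker_check_cuda(container: str = "second-me-backend") -> bool:
    # check_gpu_support.sh exits 0 only when a CUDA-linked llama-server and a
    # visible NVIDIA GPU are both present.
    result = subprocess.run(
        ["docker", "exec", container, "/app/check_gpu_support.sh"],
        capture_output=True, text=True,
    )
    print(result.stdout, end="")
    return result.returncode == 0
```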
@@ -0,0 +1,74 @@
services:
  backend:
    build:
      context: .
      dockerfile: ${DOCKER_BACKEND_DOCKERFILE:-Dockerfile.backend.cuda}
    container_name: second-me-backend
    restart: unless-stopped
    ports:
      - "8002:8002"
      - "8080:8080"
    volumes:
      - ./data:/app/data
      - ./logs:/app/logs
      - ./run:/app/run
      - ./resources:/app/resources
      - ./docker:/app/docker
      - ./.env:/app/.env
      - llama-cpp-build:/app/llama.cpp/build  # Persist the llama.cpp build
    environment:
      # Environment variables
      - LOCAL_APP_PORT=8002
      - IN_DOCKER_ENV=1
      - PLATFORM=${PLATFORM:-linux}
      - USE_CUDA=1
    extra_hosts:
      - "host.docker.internal:host-gateway"
    deploy:
      resources:
        limits:
          # Set container memory limit to 64GB
          memory: 64G
        reservations:
          # Memory reservation
          memory: 6G
          devices:
            - driver: nvidia
              count: all
              capabilities: [gpu]
    networks:
      - second-me-network

  frontend:
    build:
      context: .
      dockerfile: Dockerfile.frontend
    container_name: second-me-frontend
    restart: unless-stopped
    ports:
      - "3000:3000"
    volumes:
      - ./logs:/app/logs
      - ./resources:/app/resources
    environment:
      - VITE_API_BASE_URL=http://backend:8002
    depends_on:
      - backend
    deploy:
      resources:
        limits:
          # Set container memory limit to 2GB
          memory: 2G
        reservations:
          # Memory reservation
          memory: 1G
    networks:
      - second-me-network

networks:
  second-me-network:
    driver: bridge

volumes:
  llama-cpp-build:
    driver: local
|
@@ -15,6 +15,7 @@ services:
      - ./resources:/app/resources
      - ./docker:/app/docker
      - ./.env:/app/.env
      - llama-cpp-build:/app/llama.cpp/build  # Persist the llama.cpp build
    environment:
      # Environment variables
      - LOCAL_APP_PORT=8002

@@ -62,3 +63,7 @@ services:
networks:
  second-me-network:
    driver: bridge

volumes:
  llama-cpp-build:
    driver: local
|
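The GPU compose file above requests an NVIDIA device reservation, which fails on machines without the NVIDIA Container Toolkit; that is why the CPU-only docker-compose.yml remains the default. A rough host-side precheck (assumption: a working nvidia-smi implies a usable GPU) before opting into the GPU configuration:

```python
import shutil
import subprocess

def host_has_nvidia_gpu() -> bool:
    # Conservative check to run before `make docker-use-gpu`: require nvidia-smi
    # to exist on PATH and to exit cleanly.
    if shutil.which("nvidia-smi") is None:
        return False
    return subprocess.run(["nvidia-smi"], capture_output=True).returncode == 0
```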
@@ -0,0 +1,57 @@
#!/bin/bash
# Helper script to check if GPU support is available at runtime

echo "=== GPU Support Check ==="

# Check if llama-server binary exists and is linked to CUDA libraries
if [ -f "/app/llama.cpp/build/bin/llama-server" ]; then
    echo "llama-server binary found, checking for CUDA linkage..."
    CUDA_LIBS=$(ldd /app/llama.cpp/build/bin/llama-server | grep -i "cuda\|nvidia")

    if [ -n "$CUDA_LIBS" ]; then
        echo "✅ llama-server is built with CUDA support:"
        echo "$CUDA_LIBS"
        echo "GPU acceleration is available"

        # Check for GPU optimization marker file (optional, not required)
        GPU_MARKER_FILE="/app/data/gpu_optimized.json"
        if [ -f "$GPU_MARKER_FILE" ]; then
            GPU_OPTIMIZED=$(grep -o '"gpu_optimized": *true' "$GPU_MARKER_FILE" || echo "false")
            OPTIMIZED_DATE=$(grep -o '"optimized_on": *"[^"]*"' "$GPU_MARKER_FILE" | cut -d'"' -f4)

            if [[ "$GPU_OPTIMIZED" == *"true"* ]]; then
                echo "📝 GPU-optimized build marker found (built on: $OPTIMIZED_DATE)"
            else
                echo "📝 GPU marker file found but not marked as optimized (built on: $OPTIMIZED_DATE)"
            fi
        else
            echo "📝 No GPU optimization marker file found, but CUDA support is detected in binary"
        fi

        # Check if NVIDIA GPU is accessible at runtime
        if nvidia-smi &>/dev/null; then
            echo "🔍 NVIDIA GPU is available at runtime"
            echo "=== GPU ACCELERATION IS READY TO USE ==="
            exit 0
        else
            echo "⚠️ WARNING: llama-server has CUDA support, but NVIDIA GPU is not accessible"
            echo "Check that Docker is running with GPU access (--gpus all)"
            exit 1
        fi
    else
        echo "❌ llama-server is not linked with CUDA libraries"
        echo "Container was built without CUDA support"
    fi
else
    echo "❌ llama-server binary not found at /app/llama.cpp/build/bin/llama-server"
fi

# Final check for GPU hardware
if nvidia-smi &>/dev/null; then
    echo "🔍 NVIDIA GPU is available at runtime, but llama-server doesn't support CUDA"
    echo "To enable GPU support, rebuild using: make docker-up (and select CUDA support when prompted)"
    exit 1
else
    echo "❌ No NVIDIA GPU detected at runtime"
    exit 1
fi
|
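The shell script above decides CUDA support from the llama-server binary's shared-library linkage. The same check expressed in Python, for reference (a sketch, not part of the PR):

```python
import subprocess

def llama_server_has_cuda(binary: str = "/app/llama.cpp/build/bin/llama-server") -> bool:
    # Equivalent to: ldd llama-server | grep -i "cuda\|nvidia"
    ldd = subprocess.run(["ldd", binary], capture_output=True, text=True)
    return any("cuda" in line.lower() or "nvidia" in line.lower()
               for line in ldd.stdout.splitlines())
```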
@@ -0,0 +1,52 @@
#!/usr/bin/env python3

import torch
import subprocess
import sys
import os

print("=== PyTorch CUDA Version Information ===")
print(f"PyTorch version: {torch.__version__}")

if torch.cuda.is_available():
    print(f"CUDA available: Yes")
    print(f"CUDA version used by PyTorch: {torch.version.cuda}")
    print(f"cuDNN version: {torch.backends.cudnn.version() if torch.backends.cudnn.is_available() else 'Not available'}")
    print(f"GPU device name: {torch.cuda.get_device_name(0)}")

    # Try to check system CUDA version
    try:
        nvcc_output = subprocess.check_output(["nvcc", "--version"]).decode("utf-8")
        print("\nSystem NVCC version:")
        print(nvcc_output)
    except:
        print("\nNVCC not found in PATH")

    # Check CUDA libraries
    try:
        print("\nChecking required CUDA libraries:")
        for lib in ["libcudart.so", "libcublas.so", "libcublasLt.so"]:
            print(f"\nSearching for {lib}:")
            find_result = subprocess.run(f"find /usr -name '{lib}*'", shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
            if find_result.returncode == 0 and find_result.stdout:
                print(find_result.stdout.decode("utf-8"))
            else:
                print(f"No {lib} found in /usr")
    except Exception as e:
        print(f"Error checking libraries: {e}")

    # Check LD_LIBRARY_PATH
    print("\nLD_LIBRARY_PATH:")
    print(os.environ.get("LD_LIBRARY_PATH", "Not set"))

else:
    print("CUDA not available")

# Check system CUDA installation
print("\n=== System CUDA Information ===")
try:
    nvidia_smi = subprocess.check_output(["nvidia-smi"]).decode("utf-8")
    print("NVIDIA-SMI output:")
    print(nvidia_smi)
except:
    print("nvidia-smi not found or not working")
|
@@ -0,0 +1,132 @@
#!/bin/bash
# Script to rebuild llama.cpp with CUDA support at runtime
# This ensures the build happens with full knowledge of the GPU environment

set -e  # Exit on error but don't print each command (for cleaner logs)
cd /app

echo "========== STARTING LLAMA.CPP CUDA REBUILD PROCESS =========="
echo "Current directory: $(pwd)"

# First check if CUDA is actually available in the container
echo "Verifying NVIDIA drivers and CUDA availability..."
if ! command -v nvidia-smi &> /dev/null; then
    echo "WARNING: NVIDIA drivers not found. Cannot build with CUDA support!"
    echo "Make sure the container has access to the GPU and NVIDIA Container Toolkit is installed."
    echo "Consider running Docker with: --gpus all"
    exit 0  # Exit without error as there's no point trying to build with CUDA when no GPU is detected
fi

# Run nvidia-smi to check GPU access
echo "Detected NVIDIA GPU:"
nvidia-smi || {
    echo "ERROR: nvidia-smi command failed. GPU is not properly accessible from the container."
    echo "Make sure you're running Docker with GPU access enabled (--gpus all)"
    exit 0  # Exit without error since there's no GPU access
}

# Install build dependencies
echo "Installing build dependencies..."
apt-get update && apt-get install -y --no-install-recommends \
    build-essential \
    wget \
    cmake \
    git \
    ca-certificates \
    gnupg \
    libopenblas-dev

# Clean up apt cache to free space
apt-get clean
rm -rf /var/lib/apt/lists/*

# Install CUDA using NVIDIA's official Debian 12 network installation method
echo "Installing CUDA using NVIDIA's official method for Debian 12..."
wget https://developer.download.nvidia.com/compute/cuda/repos/debian12/x86_64/cuda-keyring_1.1-1_all.deb
dpkg -i cuda-keyring_1.1-1_all.deb
rm cuda-keyring_1.1-1_all.deb
apt-get update

# Install CUDA packages needed for building llama.cpp with CUDA support
apt-get install -y --fix-missing --no-install-recommends cuda-compiler-12-8
apt-get clean
rm -rf /var/lib/apt/lists/*

apt-get update
apt-get install -y --fix-missing --no-install-recommends cuda-runtime-12-8
apt-get clean
rm -rf /var/lib/apt/lists/*

apt-get update
apt-get install -y --fix-missing --no-install-recommends cuda-libraries-dev-12-8
apt-get clean
rm -rf /var/lib/apt/lists/*

# Set up environment for build
export PATH=/usr/local/cuda-12.8/bin:${PATH}
export LD_LIBRARY_PATH=/usr/local/cuda-12.8/lib64:${LD_LIBRARY_PATH}
export CUDA_HOME=/usr/local/cuda-12.8
# Set CUDACXX environment variable explicitly to help CMake find the CUDA compiler
export CUDACXX=/usr/local/cuda-12.8/bin/nvcc
export CMAKE_CUDA_COMPILER=/usr/local/cuda-12.8/bin/nvcc

# Verify CUDA compiler is available
echo "Verifying CUDA compiler (nvcc) is available:"
which nvcc || echo "ERROR: nvcc not found in PATH!"
nvcc --version || echo "ERROR: nvcc not working properly!"

echo "CUDA environment:"
echo "- CUDA_HOME: $CUDA_HOME"
echo "- CUDACXX: $CUDACXX"
echo "- CMAKE_CUDA_COMPILER: $CMAKE_CUDA_COMPILER"
echo "- PATH includes CUDA: $PATH"
echo "- LD_LIBRARY_PATH: $LD_LIBRARY_PATH"

# Show available disk space
echo "Available disk space:"
df -h

# Use local build approach to avoid volume mount issues
echo "Building llama.cpp with CUDA in a local directory..."
cd /tmp
rm -rf llama_build
mkdir -p llama_build
cd llama_build

# Clone a fresh copy of llama.cpp - this avoids volume mount issues
echo "Cloning fresh copy of llama.cpp..."
git clone https://github.com/ggerganov/llama.cpp.git .

# Configure and build with CUDA support
mkdir -p build
cd build
echo "Configuring with CMake..."
cmake -DGGML_CUDA=ON \
    -DCMAKE_CUDA_ARCHITECTURES=all \
    -DCMAKE_BUILD_TYPE=Release \
    -DBUILD_SHARED_LIBS=OFF \
    -DLLAMA_NATIVE=OFF \
    -DCMAKE_CUDA_FLAGS="-Wno-deprecated-gpu-targets" \
    ..

echo "Building llama.cpp with CUDA support..."
cmake --build . --config Release --target all -j $(nproc)

if [ -f "bin/llama-server" ]; then
    echo "Build successful! Copying binaries to /app/llama.cpp/build/bin/"
    mkdir -p /app/llama.cpp/build/bin
    cp bin/llama-server /app/llama.cpp/build/bin/
    cp bin/llama-cli /app/llama.cpp/build/bin/ 2>/dev/null || true
    chmod +x /app/llama.cpp/build/bin/llama-server /app/llama.cpp/build/bin/llama-cli

    # Create GPU optimized marker
    echo "{ \"gpu_optimized\": true, \"optimized_on\": \"$(date -u +\"%Y-%m-%dT%H:%M:%SZ\")\" }" > /app/data/gpu_optimized.json

    echo "Testing CUDA support in built binary..."
    LD_LIBRARY_PATH=/usr/local/cuda/lib64:$LD_LIBRARY_PATH /app/llama.cpp/build/bin/llama-server --version
    echo ""
    echo "========== CUDA BUILD COMPLETED SUCCESSFULLY =========="
else
    echo "ERROR: Build failed - llama-server executable not found!"
    exit 1
fi
|
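The rebuild script's key step is exporting the CUDA toolchain location before invoking CMake so nvcc is found inside the container. The same environment preparation sketched in Python (paths assume the CUDA 12.8 install shown above):

```python
import os
import subprocess

def cuda_build_env(cuda_home: str = "/usr/local/cuda-12.8") -> dict:
    # Mirrors the exports in the rebuild script: CUDA_HOME, CUDACXX,
    # CMAKE_CUDA_COMPILER, plus PATH and LD_LIBRARY_PATH extensions.
    env = os.environ.copy()
    env["CUDA_HOME"] = cuda_home
    env["CUDACXX"] = f"{cuda_home}/bin/nvcc"
    env["CMAKE_CUDA_COMPILER"] = f"{cuda_home}/bin/nvcc"
    env["PATH"] = f"{cuda_home}/bin:" + env.get("PATH", "")
    env["LD_LIBRARY_PATH"] = f"{cuda_home}/lib64:" + env.get("LD_LIBRARY_PATH", "")
    return env

# Example: subprocess.run(["cmake", "-DGGML_CUDA=ON", ".."], env=cuda_build_env(), check=True)
```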
@ -4,7 +4,7 @@ import { useState, useEffect, useRef } from 'react';
|
|||
import { useRouter } from 'next/navigation';
|
||||
import InfoModal from '@/components/InfoModal';
|
||||
import type { TrainingConfig } from '@/service/train';
|
||||
import { startTrain, stopTrain, retrain, getTrainingParams, resetProgress } from '@/service/train';
|
||||
import { startTrain, stopTrain, retrain, getTrainingParams, checkCudaAvailability, resetProgress } from '@/service/train';
|
||||
import { useTrainingStore } from '@/store/useTrainingStore';
|
||||
import { getMemoryList } from '@/service/memory';
|
||||
import { message, Modal } from 'antd';
|
||||
|
@ -90,6 +90,7 @@ export default function TrainingPage() {
|
|||
const firstLoadRef = useRef<boolean>(true);
|
||||
const pollingStopRef = useRef<boolean>(false);
|
||||
|
||||
const [cudaAvailable, setCudaAvailable] = useState<boolean>(false);
|
||||
const [isResume, setIsResume] = useState(
|
||||
trainingProgress.status === 'suspended' || trainingProgress.status === 'failed'
|
||||
);
|
||||
|
@ -104,6 +105,29 @@ export default function TrainingPage() {
|
|||
fetchModelConfig();
|
||||
}, []);
|
||||
|
||||
useEffect(() => {
|
||||
// Check CUDA availability once on load
|
||||
checkCudaAvailability()
|
||||
.then(res => {
|
||||
if (res.data.code === 0) {
|
||||
const { cuda_available, cuda_info } = res.data.data;
|
||||
setCudaAvailable(cuda_available);
|
||||
|
||||
if (cuda_available) {
|
||||
console.log('CUDA is available:', cuda_info);
|
||||
} else {
|
||||
console.log('CUDA is not available on this system');
|
||||
}
|
||||
} else {
|
||||
message.error(res.data.message || 'Failed to check CUDA availability');
|
||||
}
|
||||
})
|
||||
.catch(err => {
|
||||
console.error('CUDA availability check failed', err);
|
||||
message.error('CUDA availability check failed');
|
||||
});
|
||||
}, []);
|
||||
|
||||
// Start polling training progress
|
||||
const startPolling = () => {
|
||||
if (pollingStopRef.current) {
|
||||
|
@ -280,39 +304,23 @@ export default function TrainingPage() {
|
|||
const eventSource = new EventSource('/api/trainprocess/logs');
|
||||
|
||||
eventSource.onmessage = (event) => {
|
||||
try {
|
||||
const data = JSON.parse(event.data);
|
||||
// Don't try to parse as JSON, just use the raw text data directly
|
||||
const logMessage = event.data;
|
||||
|
||||
setTrainingDetails((prev) => {
|
||||
const newLogs = [
|
||||
...prev.slice(-500), // Keep more log entries (500 instead of 100)
|
||||
{
|
||||
message: logMessage,
|
||||
timestamp: new Date().toLocaleTimeString([], { hour: '2-digit', minute: '2-digit', second: '2-digit' })
|
||||
}
|
||||
];
|
||||
|
||||
setTrainingDetails((prev) => {
|
||||
const newLogs = [
|
||||
...prev.slice(-100),
|
||||
{
|
||||
message: data.message,
|
||||
timestamp: new Date().toISOString()
|
||||
}
|
||||
];
|
||||
// Save logs to localStorage for persistence between page refreshes
|
||||
localStorage.setItem('trainingLogs', JSON.stringify(newLogs));
|
||||
|
||||
// Save logs to localStorage
|
||||
// localStorage.setItem('trainingLogs', JSON.stringify(newLogs));
|
||||
|
||||
return newLogs;
|
||||
});
|
||||
} catch {
|
||||
setTrainingDetails((prev) => {
|
||||
const newLogs = [
|
||||
...prev.slice(-100),
|
||||
{
|
||||
message: event.data,
|
||||
timestamp: new Date().toISOString()
|
||||
}
|
||||
];
|
||||
|
||||
// Save logs to localStorage
|
||||
// localStorage.setItem('trainingLogs', JSON.stringify(newLogs));
|
||||
|
||||
return newLogs;
|
||||
});
|
||||
}
|
||||
return newLogs;
|
||||
});
|
||||
};
|
||||
|
||||
eventSource.onerror = (error) => {
|
||||
|
@ -535,6 +543,7 @@ export default function TrainingPage() {
|
|||
trainActionLoading={trainActionLoading}
|
||||
trainingParams={trainingParams}
|
||||
updateTrainingParams={updateTrainingParams}
|
||||
cudaAvailable={cudaAvailable}
|
||||
/>
|
||||
|
||||
{/* Only show training progress after training starts */}
|
||||
|
|
|
@ -26,6 +26,14 @@ interface ModelConfig {
|
|||
[key: string]: any;
|
||||
}
|
||||
|
||||
interface TrainingParams {
|
||||
data_synthesis_mode: string;
|
||||
learning_rate: number;
|
||||
number_of_epochs: number;
|
||||
concurrency_threads: number;
|
||||
use_cuda: boolean;
|
||||
}
|
||||
|
||||
interface TrainingConfigurationProps {
|
||||
baseModelOptions: BaseModelOption[];
|
||||
modelConfig: ModelConfig | null;
|
||||
|
@ -40,6 +48,7 @@ interface TrainingConfigurationProps {
|
|||
trainActionLoading: boolean;
|
||||
setSelectedInfo: React.Dispatch<React.SetStateAction<boolean>>;
|
||||
trainingParams: TrainingConfig;
|
||||
cudaAvailable: boolean;
|
||||
}
|
||||
|
||||
const synthesisModeOptions = [
|
||||
|
@ -61,7 +70,8 @@ const TrainingConfiguration: React.FC<TrainingConfigurationProps> = ({
|
|||
changeBaseModel,
|
||||
trainActionLoading,
|
||||
handleTrainingAction,
|
||||
setSelectedInfo
|
||||
setSelectedInfo,
|
||||
cudaAvailable
|
||||
}) => {
|
||||
const [disabledChangeParams, setDisabledChangeParams] = useState<boolean>(false);
|
||||
const [openThinkingModel, setOpenThinkingModel] = useState<boolean>(false);
|
||||
|
@ -394,6 +404,47 @@ const TrainingConfiguration: React.FC<TrainingConfigurationProps> = ({
|
|||
Enter an integer between 1 and 10 (recommended: 2)
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<div className="flex flex-col gap-2 mt-4">
|
||||
<div className="flex gap-3 items-center">
|
||||
<div className="font-medium">Enable CUDA GPU Acceleration</div>
|
||||
<Tooltip title="When enabled, training will use CUDA GPU acceleration if available on your system. This can significantly speed up training but requires compatible NVIDIA hardware and drivers.">
|
||||
<QuestionCircleOutlined className="cursor-pointer" />
|
||||
</Tooltip>
|
||||
</div>
|
||||
<div className="flex items-center">
|
||||
<label className="inline-flex items-center cursor-pointer">
|
||||
<input
|
||||
type="checkbox"
|
||||
checked={
|
||||
disabledChangeParams && nowTrainingParams && !changeBaseModel
|
||||
? nowTrainingParams.use_cuda
|
||||
: trainingParams.use_cuda
|
||||
}
|
||||
className="sr-only peer"
|
||||
disabled={disabledChangeParams || !cudaAvailable}
|
||||
onChange={(e) => {
|
||||
updateTrainingParams({ ...trainingParams, use_cuda: e.target.checked });
|
||||
}}
|
||||
/>
|
||||
<div className={`relative w-11 h-6 ${!cudaAvailable ? 'bg-gray-300' : 'bg-gray-200'} peer-focus:outline-none peer-focus:ring-4 peer-focus:ring-blue-300 rounded-full peer peer-checked:after:translate-x-full rtl:peer-checked:after:-translate-x-full peer-checked:after:border-white after:content-[''] after:absolute after:top-[2px] after:start-[2px] after:bg-white after:border-gray-300 after:border after:rounded-full after:h-5 after:w-5 after:transition-all ${cudaAvailable ? 'peer-checked:bg-blue-600' : 'peer-checked:bg-gray-400'}`}></div>
|
||||
<span className={`ms-3 text-sm font-medium ${!cudaAvailable ? 'text-gray-500' : 'text-gray-700'}`}>
|
||||
{disabledChangeParams && nowTrainingParams && !changeBaseModel
|
||||
? nowTrainingParams.use_cuda
|
||||
? 'Enabled'
|
||||
: 'Disabled'
|
||||
: trainingParams.use_cuda
|
||||
? 'Enabled'
|
||||
: 'Disabled'}
|
||||
</span>
|
||||
</label>
|
||||
</div>
|
||||
<div className="text-xs text-gray-500">
|
||||
{cudaAvailable
|
||||
? 'Enable for faster training on NVIDIA GPUs.'
|
||||
: 'CUDA acceleration is not available on this system.'}
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
|
|
|
@ -11,6 +11,7 @@ const TrainingLog: React.FC<TrainingLogProps> = ({ trainingDetails }: TrainingLo
|
|||
const consoleEndRef = useRef<HTMLDivElement>(null);
|
||||
const [isUserScrolling, setIsUserScrolling] = useState(false);
|
||||
const userScrollTimeout = useRef<NodeJS.Timeout | null>(null);
|
||||
const [isAutoScrollEnabled, setIsAutoScrollEnabled] = useState(true);
|
||||
|
||||
// Smooth scroll console to bottom
|
||||
const smoothScrollConsole = () => {
|
||||
|
@ -29,17 +30,37 @@ const TrainingLog: React.FC<TrainingLogProps> = ({ trainingDetails }: TrainingLo
|
|||
useEffect(() => {
|
||||
// Set up scroll event listener to detect user scrolling
|
||||
const handleUserScroll = () => {
|
||||
setIsUserScrolling(true);
|
||||
if (!consoleEndRef.current) return;
|
||||
|
||||
const consoleContainer = consoleEndRef.current.closest('.overflow-y-auto');
|
||||
|
||||
if (!(consoleContainer instanceof HTMLElement)) return;
|
||||
|
||||
// Check if scrolled away from bottom
|
||||
const isScrolledToBottom =
|
||||
Math.abs((consoleContainer.scrollHeight - consoleContainer.scrollTop) - consoleContainer.clientHeight) < 50;
|
||||
|
||||
// If scrolled away from bottom, consider it manual scrolling
|
||||
if (!isScrolledToBottom) {
|
||||
setIsUserScrolling(true);
|
||||
|
||||
// Clear any existing timeout
|
||||
if (userScrollTimeout.current) {
|
||||
clearTimeout(userScrollTimeout.current);
|
||||
}
|
||||
// Clear any existing timeout
|
||||
if (userScrollTimeout.current) {
|
||||
clearTimeout(userScrollTimeout.current);
|
||||
}
|
||||
|
||||
// Reset the flag after a short delay
|
||||
userScrollTimeout.current = setTimeout(() => {
|
||||
// Reset the flag after a delay
|
||||
userScrollTimeout.current = setTimeout(() => {
|
||||
setIsUserScrolling(false);
|
||||
}, 5000); // 5 seconds delay before allowing auto-scroll again
|
||||
} else {
|
||||
// If at bottom, not considered manual scrolling
|
||||
setIsUserScrolling(false);
|
||||
}, 2000); // 2 seconds delay before allowing auto-scroll again
|
||||
if (userScrollTimeout.current) {
|
||||
clearTimeout(userScrollTimeout.current);
|
||||
userScrollTimeout.current = null;
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
// Find the console container and attach the scroll listener
|
||||
|
@ -65,7 +86,16 @@ const TrainingLog: React.FC<TrainingLogProps> = ({ trainingDetails }: TrainingLo
|
|||
if (trainingDetails.length > 0) {
|
||||
smoothScrollConsole();
|
||||
}
|
||||
}, [trainingDetails]);
|
||||
}, [trainingDetails, isAutoScrollEnabled]);
|
||||
|
||||
const toggleAutoScroll = () => {
|
||||
setIsAutoScrollEnabled(!isAutoScrollEnabled);
|
||||
if (!isAutoScrollEnabled) {
|
||||
// If we're re-enabling auto-scroll, scroll to bottom immediately
|
||||
setIsUserScrolling(false);
|
||||
setTimeout(smoothScrollConsole, 50);
|
||||
}
|
||||
};
|
||||
|
||||
return (
|
||||
<div className="mt-4">
|
||||
|
|
|
@@ -138,3 +138,17 @@ export const getTrainingParams = () => {
    url: `/api/trainprocess/training_params`
  });
};

export const checkCudaAvailability = () => {
  return Request<CommonResponse<{
    cuda_available: boolean;
    cuda_info: {
      device_count?: number;
      current_device?: number;
      device_name?: string;
    };
  }>>({
    method: 'get',
    url: '/api/kernel2/cuda/available'
  });
};
|
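The checkCudaAvailability service above expects /api/kernel2/cuda/available to return a code/data/message envelope containing cuda_available and cuda_info. The backend route itself is not shown in this diff; a hypothetical Flask handler with that response shape might look like:

```python
from flask import Blueprint, jsonify
import torch

cuda_bp = Blueprint("cuda", __name__)

@cuda_bp.get("/api/kernel2/cuda/available")
def cuda_available():
    # Hypothetical handler: report torch's view of CUDA in the envelope the
    # frontend service expects (code / message / data).
    info = {}
    if torch.cuda.is_available():
        info = {
            "device_count": torch.cuda.device_count(),
            "current_device": torch.cuda.current_device(),
            "device_name": torch.cuda.get_device_name(0),
        }
    return jsonify({"code": 0, "message": "success",
                    "data": {"cuda_available": bool(info), "cuda_info": info}})
```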
@@ -169,6 +169,18 @@ Domain Timelines:
        content = response.choices[0].message.content
        shift_pattern = r"\{.*\}"
        shift_perspective_result = self.__parse_json_response(content, shift_pattern)

        # Check if result is None and provide default values to avoid TypeError
        if shift_perspective_result is None:
            logger.warning(f"Failed to parse perspective shift result, using default values: {content}")
            # Create a default mapping with expected parameters
            shift_perspective_result = {
                "domainDesc": f"You have knowledge and experience related to {shade_info.name}.",
                "domainContent": shade_info.content_third_view,
                "domainTimeline": []
            }

        # Now it's safe to pass shift_perspective_result as kwargs
        shade_info.add_second_view(**shift_perspective_result)
        return shade_info
|
@@ -53,8 +53,18 @@ class PreferenceQAGenerator:
            bio: Biography or context information to use in prompt generation.
            preference_language: Language for prompts ("Chinese/中文" or otherwise English).
        """
        # Ensure the filename is actually a string
        if filename is None:
            raise ValueError("Filename cannot be None")

        self.filename = filename
        self.is_cot = is_cot
        # Convert is_cot to bool if it's a string
        if isinstance(is_cot, str):
            self.is_cot = is_cot.lower() == 'true'
        else:
            self.is_cot = bool(is_cot)

        logger.info(f"PreferenceQAGenerator initialized with is_cot={self.is_cot}")

        with open(self.filename, "r", encoding="utf-8") as f:
            self.pre_msg = json.load(f)
|
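Both PreferenceQAGenerator above and L2Generator below apply the same string-to-bool coercion to is_cot, since the value can arrive as a "true"/"false" string. The shared pattern, extracted as a sketch (no such helper exists in the diff):

```python
def coerce_bool(value) -> bool:
    # "true"/"True" -> True, any other string -> False, otherwise plain bool().
    if isinstance(value, str):
        return value.lower() == "true"
    return bool(value)
```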
@@ -1 +1,12 @@
python lpm_kernel/L2/utils.py Qwen2.5-0.5B-Instruct
#!/bin/bash
# Script to download model from Hugging Face
# Usage: ./download_model.sh [model_name]
# If no model name is provided, will attempt to get it from config

if [ "$1" != "" ]; then
    # Use provided model name
    python lpm_kernel/L2/utils.py "$1"
else
    # No model name provided, let utils.py determine from config
    python lpm_kernel/L2/utils.py
fi
|
@@ -35,11 +35,19 @@ class L2Generator:
        Args:
            data_path: Path to the raw data directory. Defaults to "../raw_data".
            preferred_lang: Preferred language for data processing. Defaults to "English".
            is_cot: Whether to use Chain of Thought reasoning. Can be bool or string.
        """
        self.data_path = data_path
        self.data_processor = L2DataProcessor(data_path, preferred_lang)
        self.preferred_lang = preferred_lang
        self.is_cot = is_cot

        # Convert is_cot to bool if it's a string
        if isinstance(is_cot, str):
            self.is_cot = is_cot.lower() == 'true'
        else:
            self.is_cot = bool(is_cot)

        logging.info(f"L2Generator initialized with is_cot={self.is_cot}")

    def data_preprocess(self, note_list: List[Note], basic_info: Dict):
        """Preprocess the input notes and basic information.
|
@@ -60,39 +68,50 @@ class L2Generator:
        graph_path: str,
        config_path: str,
    ):
        """Generate subjective data based on input notes and user information.
        """Generate subjective data for personalization.

        This method orchestrates the generation of subjective data including preferences,
        diversity, self-Q&A data, and graph indexing.

        Args:
            note_list: List of Note objects.
            basic_info: Dictionary containing basic user information.
            note_list: List of Note objects to process.
            basic_info: Dictionary containing user information.
            data_output_base_dir: Base directory for output data.
            topics_path: Path to topics data.
            entities_path: Path to entities data.
            entities_path: Path to entity data.
            graph_path: Path to graph data.
            config_path: Path to configuration file.
        """
        global_bio = basic_info["globalBio"]
        user_name = basic_info["username"]
        user_intro = basic_info["aboutMe"]
        if not os.path.exists(data_output_base_dir):
            os.makedirs(data_output_base_dir)

        preference_output_path = "preference.json"
        diversity_output_path = "diversity.json"
        selfqa_output_path = "selfqa.json"
        # Check if the file exists
        if not os.path.exists(topics_path):
            # Create an empty file
            with open(topics_path, "w") as f:
                f.write(json.dumps([]))

        # Generate subjective data
        self.data_processor.gen_subjective_data(
            note_list,
            data_output_base_dir,
            preference_output_path,
            diversity_output_path,
            selfqa_output_path,
            global_bio,
            topics_path,
            entities_path,
            graph_path,
            user_name,
            config_path,
            user_intro,
            note_list=note_list,
            data_output_base_dir=data_output_base_dir,
            preference_output_path="preference.json",
            diversity_output_path="diversity.json",
            selfqa_output_path="selfqa.json",
            global_bio=basic_info["globalBio"],
            topics_path=topics_path,
            entitys_path=entities_path,
            graph_path=graph_path,
            user_name=basic_info["username"],
            config_path=config_path,
            user_intro=basic_info["aboutMe"],
        )

        # Merge JSON files for training
        self.merge_json_files(data_output_base_dir)

        # Release Ollama models from memory after data synthesis is complete
        self._release_ollama_models()

    def gen_preference_data(
        self,

@@ -187,6 +206,20 @@ class L2Generator:
        with open(merged_output_path, 'w', encoding='utf-8') as f:
            json.dump(merged_data, f, ensure_ascii=False, indent=2)

    def _release_ollama_models(self):
        """Release Ollama models from memory to free up VRAM for training.

        This method calls the release function defined in the train module.
        It's important to release models after data synthesis and before training
        to ensure VRAM is properly freed.
        """
        try:
            from lpm_kernel.L2.train import release_ollama_models
            release_ollama_models()
        except Exception as e:
            import logging
            logging = logging.getLogger(__name__)
            logging.warning(f"Failed to release Ollama models: {str(e)}")

    def clean_graphrag_keys(self):
        GRAPH_CONFIG = os.path.join(
|
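_release_ollama_models defers to release_ollama_models in lpm_kernel.L2.train, which is not part of this diff. One common way to ask an Ollama server to unload a model and free VRAM is to send a request with keep_alive set to 0; whether the train module does exactly this is an assumption, and the requests dependency is assumed too:

```python
import requests  # assumption: the requests package is available

def release_ollama_model(model: str, host: str = "http://localhost:11434") -> None:
    # keep_alive=0 asks the Ollama server to unload the model immediately
    # (illustrative only; not necessarily how release_ollama_models is implemented).
    requests.post(f"{host}/api/generate", json={"model": model, "keep_alive": 0}, timeout=30)
```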
@@ -0,0 +1,149 @@
"""Memory management utilities for PyTorch training.

This module provides lightweight utilities to monitor memory usage
and configure PyTorch's built-in memory management features.
"""

import os
import gc
import logging
import psutil
import torch
from typing import Dict, Any

# Configure logging
logger = logging.getLogger(__name__)

class MemoryManager:
    """Simple memory manager that leverages PyTorch's built-in memory optimizations."""

    def __init__(self):
        """Initialize the memory manager."""
        self.cuda_available = torch.cuda.is_available()
        self.process = psutil.Process(os.getpid())

        # Remove redundant environment variable setting - now handled in train_for_user.sh

    def get_memory_info(self) -> Dict[str, Any]:
        """Get current memory usage information."""
        info = {
            "ram_used_percent": psutil.virtual_memory().percent,
            "ram_used_gb": psutil.virtual_memory().used / (1024**3),
            "ram_available_gb": psutil.virtual_memory().available / (1024**3),
            "ram_total_gb": psutil.virtual_memory().total / (1024**3),
        }

        if self.cuda_available:
            try:
                info.update({
                    "vram_used_gb": torch.cuda.memory_allocated() / (1024**3),
                    "vram_reserved_gb": torch.cuda.memory_reserved() / (1024**3),
                    "vram_total_gb": torch.cuda.get_device_properties(0).total_memory / (1024**3),
                })
            except RuntimeError as e:
                logger.warning(f"Error getting CUDA memory info: {str(e)}")
                self.cuda_available = False

        return info

    def cleanup_memory(self, force: bool = False) -> None:
        """Free up memory by garbage collection and emptying CUDA cache."""
        # Run Python garbage collection
        gc.collect()

        # Empty CUDA cache if available
        if self.cuda_available:
            torch.cuda.empty_cache()

        # Log memory status after cleanup
        if force:
            info = self.get_memory_info()
            logger.info(
                f"Memory after cleanup: RAM: {info['ram_used_gb']:.2f}GB / {info['ram_total_gb']:.2f}GB, "
                f"VRAM: {info.get('vram_used_gb', 0):.2f}GB / {info.get('vram_total_gb', 0):.2f}GB"
            )

    def get_optimal_training_config(self) -> Dict[str, Any]:
        """Get recommended configurations for model training based on hardware capabilities."""
        # Default configs that rely on PyTorch's automatic memory management
        config = {
            "device_map": "auto",
            "fp16": False,
            "bf16": False,
            "gradient_checkpointing": True,
            "gradient_accumulation_steps": 1,
        }

        # Enable mixed precision based on hardware support
        if self.cuda_available:
            capability = torch.cuda.get_device_capability()
            if capability[0] >= 8:  # Ampere or newer (supports BF16)
                config["bf16"] = True
            elif capability[0] >= 7:  # Volta or newer (supports FP16)
                config["fp16"] = True

            # Adjust accumulation steps based on available memory
            vram_gb = self.get_memory_info().get("vram_total_gb", 0)
            if vram_gb < 8:  # Small GPUs
                config["gradient_accumulation_steps"] = 4
            elif vram_gb < 16:  # Medium GPUs
                config["gradient_accumulation_steps"] = 2

        return config

    def optimize_model_for_training(self, model):
        """Apply PyTorch's built-in memory optimizations for training."""
        # Enable gradient checkpointing if available
        if hasattr(model, "gradient_checkpointing_enable"):
            logger.info("Enabling gradient checkpointing for memory efficiency")
            model.gradient_checkpointing_enable()

        # Enable memory-efficient attention for PyTorch 2.0+
        if hasattr(model, "config"):
            try:
                model.config.use_memory_efficient_attention = True
            except:
                pass

        # Enable flash attention for compatible GPUs
        if self.cuda_available and torch.cuda.get_device_capability()[0] >= 8:
            try:
                model.config.attn_implementation = "flash_attention_2"
            except:
                pass

        return model

    def optimize_training_args(self, training_args):
        """Configure training arguments for efficient memory usage."""
        if not training_args:
            return None

        # Get optimal configuration based on hardware
        config = self.get_optimal_training_config()

        # Apply configurations to training arguments
        if not getattr(training_args, "fp16", False) and not getattr(training_args, "bf16", False):
            training_args.fp16 = config["fp16"]
            training_args.bf16 = config["bf16"]

        if not getattr(training_args, "gradient_checkpointing", False):
            training_args.gradient_checkpointing = config["gradient_checkpointing"]

        if training_args.gradient_accumulation_steps == 1:
            training_args.gradient_accumulation_steps = config["gradient_accumulation_steps"]

        logger.info("Training configuration optimized for memory efficiency:")
        logger.info(f"  Mixed precision: FP16={training_args.fp16}, BF16={training_args.bf16}")
        logger.info(f"  Gradient checkpointing: {training_args.gradient_checkpointing}")
        logger.info(f"  Gradient accumulation steps: {training_args.gradient_accumulation_steps}")

        return training_args


# Global memory manager instance
memory_manager = MemoryManager()

def get_memory_manager() -> MemoryManager:
    """Get the global memory manager instance."""
    return memory_manager
|
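For reference, a minimal usage sketch of the manager above, showing how its recommendations would typically be mapped onto transformers.TrainingArguments. This is illustrative only; the output directory is an assumption, while the batch size matches the value used in train_for_user.sh.

```python
# Hedged usage sketch: feed the recommended settings into TrainingArguments.
from transformers import TrainingArguments
from lpm_kernel.L2.memory_manager import get_memory_manager

mm = get_memory_manager()
cfg = mm.get_optimal_training_config()

training_args = TrainingArguments(
    output_dir="./out",                      # hypothetical path
    per_device_train_batch_size=2,
    gradient_accumulation_steps=cfg["gradient_accumulation_steps"],
    gradient_checkpointing=cfg["gradient_checkpointing"],
    fp16=cfg["fp16"],
    bf16=cfg["bf16"],
)
# On a GPU with 8-16GB of VRAM this yields an effective batch of 2 * 2 = 4 samples per step.
```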
@@ -6,36 +6,123 @@ LoRA architecture during inference.
"""

import argparse
import os
import gc
import sys
import logging
import traceback
import torch
import datetime
from typing import Optional, Dict, Any

from peft import PeftModel
from transformers import AutoModelForCausalLM, AutoTokenizer
import logging
from lpm_kernel.L2.memory_manager import get_memory_manager

# Configure logging
logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO)

def merge_lora_weights(base_model_path, lora_adapter_path, output_model_path):
    """Merge LoRA weights into a base model and save the result.

    This function loads a base model and a LoRA adapter, merges them together,
    and saves the resulting model to the specified output path.
    and saves the resulting model to the specified output path. It leverages
    PyTorch's built-in memory management features.

    Args:
        base_model_path: Path to the base model directory.
        lora_adapter_path: Path to the LoRA adapter directory.
        output_model_path: Path where the merged model will be saved.
    """
    # Load the base model
    logging.info(f"Loading base model from {base_model_path}")
    base_model = AutoModelForCausalLM.from_pretrained(base_model_path)
    tokenizer = AutoTokenizer.from_pretrained(base_model_path)

    # Load the LoRA adapter and apply it to the base model
    lora_model = PeftModel.from_pretrained(base_model, lora_adapter_path)

    # Merge LoRA weights into the base model
    merged_model = lora_model.merge_and_unload()

    # Save the merged model and tokenizer
    merged_model.save_pretrained(output_model_path)
    tokenizer.save_pretrained(output_model_path)
    # Get memory manager
    memory_manager = get_memory_manager()

    try:
        # Log initial memory state
        memory_info = memory_manager.get_memory_info()
        logger.info(f"Initial memory state: RAM used: {memory_info['ram_used_gb']:.2f}GB, "
                    f"available: {memory_info['ram_available_gb']:.2f}GB")

        # Determine if CUDA is available and should be used
        use_cuda = memory_manager.cuda_available
        device = "cuda" if use_cuda else "cpu"

        if use_cuda:
            logger.info(f"CUDA is available. VRAM used: {memory_info.get('vram_used_gb', 0):.2f}GB")
        else:
            logger.warning("CUDA not available or not enabled. Using CPU for model operations.")

        # Clean up memory before starting
        memory_manager.cleanup_memory(force=True)

        # Explicitly set device configuration based on available hardware
        device_map = "auto" if use_cuda else None
        dtype = torch.float16 if use_cuda else torch.float32

        logger.info(f"Loading base model from {base_model_path} with device_map={device_map}, dtype={dtype}")

        # Use explicit configuration for GPU utilization
        base_model = AutoModelForCausalLM.from_pretrained(
            base_model_path,
            torch_dtype=dtype,
            device_map=device_map
        )

        # Load tokenizer - this doesn't consume much memory
        tokenizer = AutoTokenizer.from_pretrained(base_model_path)

        # Load the LoRA adapter and apply it to the base model
        logger.info(f"Loading LoRA adapter from {lora_adapter_path}")
        lora_model = PeftModel.from_pretrained(base_model, lora_adapter_path)

        # Merge weights - this is done automatically by PyTorch on appropriate devices
        logger.info(f"Merging LoRA weights into base model on {device}")
        merged_model = lora_model.merge_and_unload()

        # Clean up before saving
        memory_manager.cleanup_memory()

        # Add inference optimization config to the merged model for faster startup
        if use_cuda:
            # Set inference-specific configuration in model config
            if hasattr(merged_model.config, "torch_dtype"):
                merged_model.config.torch_dtype = "float16"  # Prefer float16 for inference
            if not hasattr(merged_model.config, "pretraining_tp"):
                merged_model.config.pretraining_tp = 1  # For tensor parallelism during inference

            # Set default inference device
            if not hasattr(merged_model.config, "_default_inference_device"):
                merged_model.config._default_inference_device = "cuda:0"

            logger.info("Added GPU optimization settings to model configuration")

        # Save merged model with shard size to prevent OOM errors during save
        logger.info(f"Saving merged model to {output_model_path}")
        merged_model.save_pretrained(
            output_model_path,
            safe_serialization=True,
            max_shard_size="2GB"  # Sharded saving to avoid memory spikes
        )
        tokenizer.save_pretrained(output_model_path)

        # Save a special marker file to indicate this model should use GPU for inference
        if use_cuda:
            with open(os.path.join(output_model_path, "gpu_optimized.json"), "w") as f:
                import json
                json.dump({"gpu_optimized": True, "optimized_on": datetime.datetime.now().isoformat()}, f)
            logger.info("Added GPU optimization marker file for faster service startup")

        logger.info("Model successfully merged and saved!")

    except Exception as e:
        logger.error(f"Error during model merge: {str(e)}")
        logger.error(traceback.format_exc())
        # Force cleanup
        gc.collect()
        if torch.cuda.is_available():
            torch.cuda.empty_cache()
        raise


def merge_model_weights(
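A hedged example of driving merge_lora_weights above directly from Python; the module path and the adapter/output directories are assumptions, while the base-model directory follows the resources/L2/base_models layout used elsewhere in this commit.

```python
# Illustrative call only - the real CLI wiring (argparse) sits outside this hunk.
from lpm_kernel.L2.merge_weights import merge_lora_weights  # module name assumed

merge_lora_weights(
    base_model_path="resources/L2/base_models/Qwen2.5-0.5B-Instruct",
    lora_adapter_path="resources/L2/lora_adapters/my_run",    # hypothetical
    output_model_path="resources/L2/merged_models/my_run",    # hypothetical
)
```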
@@ -28,10 +28,13 @@ from lpm_kernel.L2.utils import (
    create_and_prepare_model,
    formatting_prompts_func,
    create_chat_data,
    release_ollama_models_early,
)
from lpm_kernel.configs.logging import LOGGING_CONFIG
import logging.config
from lpm_kernel.configs.logging import get_train_process_logger
from lpm_kernel.L2.memory_manager import get_memory_manager

logger = get_train_process_logger()


@@ -45,6 +48,25 @@ class LogTqdm(tqdm):
# Replace the default tqdm
sys.modules["tqdm"].tqdm = LogTqdm

# Debug callback for logging training progress
class DebugCallback(transformers.TrainerCallback):
    def __init__(self):
        self.total_time = 0
        self.last_time = time.time()

    def on_step_end(self, args, state, control, **kwargs):
        if state.global_step % 10 == 0:
            current_time = time.time()
            step_time = current_time - self.last_time
            self.total_time += step_time
            self.last_time = current_time

            # Log step time and training progress
            logger.info(f"Step {state.global_step}: {step_time:.2f}s - Total training time: {self.total_time:.2f}s")

    def on_epoch_end(self, args, state, control, **kwargs):
        logger.info(f"Epoch {state.epoch} completed")


@dataclass
class ModelArguments:

@@ -112,6 +134,10 @@ class ModelArguments:
        default=False,
        metadata={"help": "Enables UnSloth for training."},
    )
    use_cuda: Optional[bool] = field(
        default=False,
        metadata={"help": "Enables CUDA GPU acceleration for training and inference when available."},
    )


@dataclass
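A hedged sketch of how the new use_cuda field is populated; the parser setup lives outside this hunk, so the exact dataclass tuple is an assumption, but transformers.HfArgumentParser is the usual mechanism for field definitions like the one above (ModelArguments here refers to the class defined in this diff).

```python
# Sketch: --use_cuda flows from the shell script into ModelArguments via HfArgumentParser.
from transformers import HfArgumentParser, TrainingArguments

parser = HfArgumentParser((ModelArguments, TrainingArguments))  # class tuple assumed
model_args, training_args = parser.parse_args_into_dataclasses(args=[
    "--model_name_or_path", "Qwen2.5-0.5B-Instruct",
    "--output_dir", "./out",          # required by TrainingArguments
    "--use_cuda", "True",
])
assert model_args.use_cuda is True
```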
@ -162,80 +188,117 @@ def main(model_args, data_args, training_args):
|
|||
for handler in logging.getLogger().handlers:
|
||||
handler.flush()
|
||||
|
||||
logger.info("start 1")
|
||||
set_seed(training_args.seed)
|
||||
logger.info("start 2")
|
||||
# model
|
||||
model, peft_config, tokenizer = create_and_prepare_model(
|
||||
model_args, data_args, training_args
|
||||
)
|
||||
logger.info("start 3")
|
||||
# gradient ckpt
|
||||
model.config.use_cache = not training_args.gradient_checkpointing
|
||||
training_args.gradient_checkpointing = (
|
||||
training_args.gradient_checkpointing and not model_args.use_unsloth
|
||||
)
|
||||
logger.info("start 4")
|
||||
if training_args.gradient_checkpointing:
|
||||
training_args.gradient_checkpointing_kwargs = {
|
||||
"use_reentrant": model_args.use_reentrant
|
||||
}
|
||||
|
||||
# Configure system resources for optimal performance
|
||||
def configure_system_resources(num_cores=None):
|
||||
"""
|
||||
Configure system resources to optimize training performance
|
||||
|
||||
Args:
|
||||
num_cores: Number of CPU cores to use, if None, automatically detect
|
||||
"""
|
||||
# Automatically detect available cores, if not specified
|
||||
if num_cores is None:
|
||||
num_cores = min(os.cpu_count(), 6) # Limit to 6 cores, match Docker configuration
|
||||
|
||||
logger.info(f"Configuring system to use {num_cores} CPU cores")
|
||||
|
||||
# Set environment variables
|
||||
os.environ["OMP_NUM_THREADS"] = str(num_cores)
|
||||
os.environ["MKL_NUM_THREADS"] = str(num_cores)
|
||||
os.environ["NUMEXPR_NUM_THREADS"] = str(num_cores)
|
||||
|
||||
# Set PyTorch thread count
|
||||
torch.set_num_threads(num_cores)
|
||||
|
||||
# If supported, set PyTorch multi-thread optimization
|
||||
if hasattr(torch, "set_num_interop_threads"):
|
||||
torch.set_num_interop_threads(num_cores)
|
||||
|
||||
# Enable memory-optimized garbage collection
|
||||
# import gc
|
||||
# gc.enable()
|
||||
|
||||
# # Monitor memory usage and clean up periodically
|
||||
# def schedule_gc():
|
||||
# gc.collect()
|
||||
# torch.cuda.empty_cache() if torch.cuda.is_available() else None
|
||||
# return schedule_gc
|
||||
|
||||
# If CUDA is available, set CUDA device
|
||||
if torch.cuda.is_available():
|
||||
torch.cuda.set_device(0)
|
||||
logger.info(f"CUDA is available. Using device: {torch.cuda.get_device_name(0)}")
|
||||
# Display CUDA memory information
|
||||
logger.info(f"CUDA memory allocated: {torch.cuda.memory_allocated(0) / 1024**2:.2f} MB")
|
||||
logger.info(f"CUDA memory reserved: {torch.cuda.memory_reserved(0) / 1024**2:.2f} MB")
|
||||
# Get memory manager for optimization
|
||||
memory_manager = get_memory_manager()
|
||||
memory_manager.cleanup_memory(force=True)
|
||||
|
||||
# Call function to configure system resources
|
||||
configure_system_resources()
|
||||
# Release Ollama models if they exist to free up VRAM
|
||||
if torch.cuda.is_available() and model_args.use_cuda:
|
||||
release_ollama_models_early()
|
||||
|
||||
logger.info("Initializing training with memory optimizations")
|
||||
set_seed(training_args.seed)
|
||||
|
||||
# Apply PyTorch memory optimizations to training arguments
|
||||
logger.info("Applying memory optimizations to training configuration")
|
||||
training_args = memory_manager.optimize_training_args(training_args)
|
||||
|
||||
# --- Accelerate optimizer state offloading logic ---
|
||||
# Enable optimizer state offload to CPU if VRAM is low and not using DeepSpeed
|
||||
vram_total = memory_manager.get_memory_info().get("vram_total_gb", 0)
|
||||
use_accelerate_offload = False
|
||||
if torch.cuda.is_available() and model_args.use_cuda and vram_total > 0 and vram_total < 16:
|
||||
# Only set if not already using DeepSpeed
|
||||
if not hasattr(training_args, "deepspeed") or training_args.deepspeed is None:
|
||||
logger.info("Enabling Hugging Face Accelerate optimizer state offload to CPU for low VRAM GPUs")
|
||||
accelerate_config = {
|
||||
"compute_environment": "LOCAL_MACHINE",
|
||||
"deepspeed_config": None,
|
||||
"distributed_type": "NO",
|
||||
"downcast_bf16": False,
|
||||
"fsdp_config": {},
|
||||
"main_training_function": "main",
|
||||
"mixed_precision": "no",
|
||||
"num_machines": 1,
|
||||
"num_processes": 1,
|
||||
"use_cpu": False,
|
||||
"zero3_init_flag": False,
|
||||
"offload_optimizer_device": "cpu",
|
||||
"offload_param_device": "none"
|
||||
}
|
||||
training_args.accelerate_config = accelerate_config
|
||||
use_accelerate_offload = True
|
||||
|
||||
# Model loading with device_map="auto" for automatic offloading
|
||||
logger.info(f"Loading model with automatic memory management from {model_args.model_name_or_path}")
|
||||
|
||||
# Create model arguments dict with automatic offloading
|
||||
model_kwargs = {
|
||||
# Don't use "auto" device_map initially to avoid meta tensor issues
|
||||
"device_map": None,
|
||||
"trust_remote_code": True
|
||||
}
|
||||
|
||||
# Configure quantization if requested
|
||||
if model_args.use_4bit_quantization:
|
||||
from transformers import BitsAndBytesConfig
|
||||
compute_dtype = getattr(torch, model_args.bnb_4bit_compute_dtype)
|
||||
quant_storage_dtype = getattr(torch, model_args.bnb_4bit_quant_storage_dtype)
|
||||
|
||||
model_kwargs["quantization_config"] = BitsAndBytesConfig(
|
||||
load_in_4bit=model_args.use_4bit_quantization,
|
||||
bnb_4bit_quant_type=model_args.bnb_4bit_quant_type,
|
||||
bnb_4bit_compute_dtype=compute_dtype,
|
||||
bnb_4bit_use_double_quant=model_args.use_nested_quant,
|
||||
bnb_4bit_quant_storage=quant_storage_dtype,
|
||||
)
|
||||
# For 4-bit models, we can use device_map="auto"
|
||||
model_kwargs["device_map"] = "auto"
|
||||
logger.info("Using 4-bit quantization for memory efficiency")
|
||||
elif model_args.use_8bit_quantization:
|
||||
from transformers import BitsAndBytesConfig
|
||||
model_kwargs["quantization_config"] = BitsAndBytesConfig(
|
||||
load_in_8bit=model_args.use_8bit_quantization
|
||||
)
|
||||
# For 8-bit models, we can use device_map="auto"
|
||||
model_kwargs["device_map"] = "auto"
|
||||
logger.info("Using 8-bit quantization for memory efficiency")
|
||||
|
||||
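For orientation, 4-bit NF4 quantization roughly quarters the weight footprint: a 0.5B-parameter model drops from about 1 GB in fp16 to roughly 0.3 GB including overhead. A standalone sketch of this load path with typical values that are not taken from this repository's configuration (requires the bitsandbytes package and a CUDA device):

```python
# Standalone 4-bit load sketch - typical settings, not this repo's exact config.
import torch
from transformers import AutoModelForCausalLM, BitsAndBytesConfig

bnb = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_use_double_quant=False,
)
model = AutoModelForCausalLM.from_pretrained(
    "Qwen/Qwen2.5-0.5B-Instruct",   # model id pattern used elsewhere in this commit
    quantization_config=bnb,
    device_map="auto",
    trust_remote_code=True,
)
```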
# Flash attention for memory efficiency when supported
|
||||
if model_args.use_flash_attn and torch.cuda.is_available() and model_args.use_cuda:
|
||||
model_kwargs["attn_implementation"] = "flash_attention_2"
|
||||
logger.info("Using Flash Attention 2 for memory efficiency")
|
||||
|
||||
# Load model with built-in memory management features
|
||||
model, peft_config, tokenizer = create_and_prepare_model(
|
||||
model_args, data_args, training_args, model_kwargs=model_kwargs
|
||||
)
|
||||
|
||||
# If model has meta tensors, handle them properly
|
||||
if hasattr(model, "is_meta") and model.is_meta:
|
||||
logger.info("Model has meta tensors, using to_empty() to properly initialize")
|
||||
device = "cuda" if torch.cuda.is_available() and model_args.use_cuda else "cpu"
|
||||
model = model.to_empty(device=device)
|
||||
|
||||
# Apply gradient checkpointing for memory efficiency
|
||||
if training_args.gradient_checkpointing and hasattr(model, "gradient_checkpointing_enable"):
|
||||
logger.info("Enabling gradient checkpointing for memory efficiency")
|
||||
model.gradient_checkpointing_enable()
|
||||
model.config.use_cache = False
|
||||
|
||||
# Allow only one full forward/backward pass at a time (if needed for memory)
|
||||
if torch.cuda.is_available() and memory_manager.get_memory_info().get("vram_total_gb", 0) < 8:
|
||||
torch.cuda.set_per_process_memory_fraction(0.9)
|
||||
logger.info("Setting memory fraction limit to avoid OOM errors")
|
||||
|
||||
# datasets
|
||||
train_dataset = create_chat_data(
|
||||
data_args,
|
||||
tokenizer,
|
||||
)
|
||||
|
||||
|
||||
response_template = "\n<|im_start|>assistant\n"
|
||||
|
||||
|
||||
collator = DataCollatorForCompletionOnlyLM(response_template, tokenizer=tokenizer)
|
||||
|
||||
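The collator created just above is what restricts the loss to assistant turns; a hedged sketch of the idea using the trl API (the tokenizer id is a stand-in based on the default model name used elsewhere in this commit):

```python
# Hedged sketch: completion-only collation masks the prompt, trains on the reply.
from transformers import AutoTokenizer
from trl import DataCollatorForCompletionOnlyLM

tok = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-0.5B-Instruct")
collator = DataCollatorForCompletionOnlyLM("\n<|im_start|>assistant\n", tokenizer=tok)
# When batches are built, every token before the response template gets label -100
# (ignored by the loss), so only the assistant's tokens contribute to training.
```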
training_args.dataset_kwargs = {
|
||||
|
@@ -243,6 +306,45 @@ def main(model_args, data_args, training_args):
        "add_special_tokens": data_args.add_special_tokens,
    }

    # Use DeepSpeed to handle meta tensors if available
    try:
        # Only configure DeepSpeed if meta tensors are present and DeepSpeed is available
        if hasattr(model, "is_meta") and model.is_meta:
            logger.info("Model has meta tensors, checking DeepSpeed availability")
            # First verify DeepSpeed is properly installed and importable
            try:
                import deepspeed
                logger.info("DeepSpeed is available, configuring for meta tensor handling")

                # Configure with appropriate settings for meta tensors
                training_args.deepspeed = {
                    "zero_stage": 3,
                    "offload_optimizer": {
                        "device": "cpu"
                    },
                    "offload_param": {
                        "device": "cpu"
                    },
                    "zero3_init_flag": True,
                    "zero_force_ds_cpu_optimizer": False
                }
                logger.info("DeepSpeed configured for meta tensor handling")
            except ImportError:
                logger.warning("DeepSpeed is not available, meta tensors will be handled differently")
                # If DeepSpeed isn't available, use alternative approach to handle meta tensors
                if torch.cuda.is_available() and model_args.use_cuda:
                    logger.info("Initializing meta tensors on GPU")
                    # Use device_map instead of DeepSpeed for meta tensor initialization
                    from accelerate import init_empty_weights
                    with init_empty_weights():
                        model.to_empty(device="cuda")
                else:
                    logger.info("Initializing meta tensors on CPU")
                    model.to_empty(device="cpu")
    except Exception as e:
        logger.warning(f"Could not configure meta tensor handling: {e}")
        logger.warning(traceback.format_exc())

    trainer = SFTTrainer(
        model=model,
        tokenizer=tokenizer,
@ -252,145 +354,46 @@ def main(model_args, data_args, training_args):
|
|||
formatting_func=formatting_prompts_func,
|
||||
data_collator=collator,
|
||||
)
|
||||
|
||||
# Print model details
|
||||
trainer.accelerator.print(f"{trainer.model}")
|
||||
trainer.model.print_trainable_parameters()
|
||||
|
||||
logger.info("start 6")
|
||||
# train
|
||||
checkpoint = None
|
||||
if training_args.resume_from_checkpoint is not None:
|
||||
logger.info("start 6.1")
|
||||
checkpoint = training_args.resume_from_checkpoint
|
||||
logger.info("start 6.2")
|
||||
|
||||
class DebugCallback(transformers.TrainerCallback):
|
||||
"""
|
||||
Debug callback to monitor training process
|
||||
"""
|
||||
|
||||
|
||||
if hasattr(trainer.model, "print_trainable_parameters"):
|
||||
trainer.model.print_trainable_parameters()
|
||||
|
||||
# Memory usage tracking callback
|
||||
class MemoryMonitorCallback(transformers.TrainerCallback):
|
||||
def __init__(self):
|
||||
self.step_times = {}
|
||||
self.current_step_start = None
|
||||
|
||||
def on_train_begin(self, args, state, control, **kwargs):
|
||||
logger.info("=== Training Begin ===")
|
||||
logger.info("Checking initial conditions:")
|
||||
trainer = kwargs.get("trainer")
|
||||
if trainer:
|
||||
# Check model status
|
||||
logger.info(f"Model device: {trainer.model.device}")
|
||||
logger.info(f"Model dtype: {next(trainer.model.parameters()).dtype}")
|
||||
|
||||
# Check data loader
|
||||
if hasattr(trainer, "train_dataset"):
|
||||
logger.info(f"Training dataset size: {len(trainer.train_dataset)}")
|
||||
|
||||
# Check optimizer
|
||||
if hasattr(trainer, "optimizer"):
|
||||
logger.info("Optimizer configuration:")
|
||||
for i, group in enumerate(trainer.optimizer.param_groups):
|
||||
logger.info(
|
||||
f"Group {i}: lr={group['lr']}, weight_decay={group['weight_decay']}"
|
||||
)
|
||||
|
||||
def on_step_begin(self, args, state, control, **kwargs):
|
||||
self.current_step_start = time.time()
|
||||
logger.info(f"\n=== Starting Step {state.global_step + 1} ===")
|
||||
|
||||
# Check system status every 10 steps
|
||||
if state.global_step % 10 == 0:
|
||||
process = psutil.Process()
|
||||
with process.oneshot():
|
||||
logger.info(f"CPU Usage: {process.cpu_percent()}%")
|
||||
logger.info(
|
||||
f"Memory Usage: {process.memory_info().rss / 1024**2:.2f}MB"
|
||||
)
|
||||
logger.info(f"Thread Count: {process.num_threads()}")
|
||||
|
||||
self.memory_manager = get_memory_manager()
|
||||
|
||||
def on_step_end(self, args, state, control, **kwargs):
|
||||
if self.current_step_start:
|
||||
step_time = time.time() - self.current_step_start
|
||||
self.step_times[state.global_step] = step_time
|
||||
avg_time = sum(self.step_times.values()) / len(self.step_times)
|
||||
logger.info(
|
||||
f"Step {state.global_step + 1} completed in {step_time:.2f}s (avg: {avg_time:.2f}s)"
|
||||
)
|
||||
|
||||
# Check if step time is much longer than average
|
||||
if step_time > avg_time * 2 and len(self.step_times) > 1:
|
||||
logger.warning(
|
||||
f"Step {state.global_step + 1} took {step_time:.2f}s, which is much longer than average!"
|
||||
)
|
||||
|
||||
trainer = kwargs.get("trainer")
|
||||
if trainer and hasattr(trainer, "optimizer"):
|
||||
# Check gradient status
|
||||
grad_norms = []
|
||||
for name, param in trainer.model.named_parameters():
|
||||
if param.grad is not None:
|
||||
grad_norms.append(param.grad.norm().item())
|
||||
|
||||
if grad_norms:
|
||||
avg_grad_norm = sum(grad_norms) / len(grad_norms)
|
||||
logger.info(f"Average gradient norm: {avg_grad_norm:.5f}")
|
||||
else:
|
||||
logger.warning("No gradients found in this step!")
|
||||
|
||||
def on_log(self, args, state, control, logs=None, **kwargs):
|
||||
if logs:
|
||||
logger.info(f"=== Logs for Step {state.global_step} ===")
|
||||
for key, value in logs.items():
|
||||
logger.info(f"{key}: {value}")
|
||||
|
||||
def on_train_end(self, args, state, control, **kwargs):
|
||||
logger.info("=== Training Ended ===")
|
||||
logger.info(f"Total steps completed: {state.global_step}")
|
||||
if self.step_times:
|
||||
avg_time = sum(self.step_times.values()) / len(self.step_times)
|
||||
logger.info(f"Average step time: {avg_time:.2f}s")
|
||||
# Check memory every 5 steps
|
||||
if state.global_step % 5 == 0 and torch.cuda.is_available():
|
||||
info = self.memory_manager.get_memory_info()
|
||||
vram_usage_pct = info.get("vram_used_gb", 0) / info.get("vram_total_gb", 1) * 100
|
||||
|
||||
if vram_usage_pct > 90:
|
||||
logger.info(f"VRAM usage high ({vram_usage_pct:.1f}%), cleaning cache")
|
||||
self.memory_manager.cleanup_memory()
|
||||
|
||||
def on_save(self, args, state, control, **kwargs):
|
||||
# Free up memory before saving
|
||||
self.memory_manager.cleanup_memory(force=True)
|
||||
|
||||
# Add memory monitoring
|
||||
trainer.add_callback(MemoryMonitorCallback())
|
||||
|
||||
# Add existing debug callback
|
||||
trainer.add_callback(DebugCallback())
|
||||
|
||||
# Add more detailed logs
|
||||
logger.info("Starting training preparation...")
|
||||
# Resume from checkpoint if specified
|
||||
checkpoint = None
|
||||
if training_args.resume_from_checkpoint is not None:
|
||||
checkpoint = training_args.resume_from_checkpoint
|
||||
|
||||
# Training with automatic memory management
|
||||
try:
|
||||
logger.info("Initializing training process...")
|
||||
# Check model loading and structure
|
||||
logger.info("Analyzing model structure...")
|
||||
model = trainer.model
|
||||
|
||||
def print_model_structure(model, prefix=""):
|
||||
logger.info(f"{prefix}Model class: {model.__class__.__name__}")
|
||||
for name, child in model.named_children():
|
||||
logger.info(f"{prefix}Child: {name} ({child.__class__.__name__})")
|
||||
if len(list(child.named_children())) > 0:
|
||||
print_model_structure(child, prefix + " ")
|
||||
|
||||
# print_model_structure(model)
|
||||
|
||||
# Check model size
|
||||
total_params = sum(p.numel() for p in trainer.model.parameters())
|
||||
trainable_params = sum(
|
||||
p.numel() for p in trainer.model.parameters() if p.requires_grad
|
||||
)
|
||||
logger.info(f"Total parameters: {total_params:,}")
|
||||
logger.info(f"Trainable parameters: {trainable_params:,}")
|
||||
|
||||
# Check optimizer settings
|
||||
logger.info("Checking optimizer settings...")
|
||||
|
||||
# Check data loader
|
||||
train_dataloader = trainer.get_train_dataloader()
|
||||
logger.info(f"Train dataloader created with {len(train_dataloader)} batches")
|
||||
|
||||
process = psutil.Process()
|
||||
memory_info = process.memory_info()
|
||||
logger.info(f"Memory usage details:")
|
||||
logger.info(f"RSS (Resident Set Size): {memory_info.rss / 1024**2:.2f}MB")
|
||||
logger.info(f"VMS (Virtual Memory Size): {memory_info.vms / 1024**2:.2f}MB")
|
||||
|
||||
# Start training
|
||||
logger.info("Starting actual training process...")
|
||||
logger.info("Starting training with memory-optimized configuration")
|
||||
trainer.train(resume_from_checkpoint=checkpoint)
|
||||
except Exception as e:
|
||||
logger.error(f"Error during training: {str(e)}")
|
||||
|
@ -398,9 +401,13 @@ def main(model_args, data_args, training_args):
|
|||
logger.error(f"Traceback: {traceback.format_exc()}")
|
||||
raise
|
||||
|
||||
logger.info("start 7")
|
||||
# Save the model
|
||||
if trainer.is_fsdp_enabled:
|
||||
trainer.accelerator.state.fsdp_plugin.set_state_dict_type("FULL_STATE_DICT")
|
||||
|
||||
# Clean up before saving
|
||||
memory_manager.cleanup_memory(force=True)
|
||||
|
||||
trainer.save_model()
|
||||
logger.info("Training completed successfully")
|
||||
|
||||
|
|
|
@@ -6,6 +6,7 @@ NUM_TRAIN_EPOCHS="3"
CONCURRENCY_THREADS="2"
DATA_SYNTHESIS_MODE="low"
HALF=False
USE_CUDA=False  # Default to False, will be overridden by parameter
IS_COT=False

# Process parameters
@@ -15,33 +16,75 @@ while [[ "$#" -gt 0 ]]; do
        --epochs) NUM_TRAIN_EPOCHS="$2"; shift ;;
        --threads) CONCURRENCY_THREADS="$2"; shift ;;
        --mode) DATA_SYNTHESIS_MODE="$2"; shift ;;
        --cuda)
            # Convert string to lowercase for consistent comparison
            cuda_value=$(echo "$2" | tr '[:upper:]' '[:lower:]')
            if [[ "$cuda_value" == "true" || "$cuda_value" == "1" || "$cuda_value" == "yes" ]]; then
                USE_CUDA=True
                echo "CUDA enabled by user configuration."
            else
                USE_CUDA=False
                echo "CUDA disabled by user configuration."
            fi
            shift ;;
        --is_cot) IS_COT="$2"; shift ;;
        *) echo "Unknown parameter: $1"; exit 1 ;;
    esac
    shift
done

# Explicitly log the CUDA setting passed from the command line
echo "CUDA parameter received: $USE_CUDA"

# Verify CUDA availability if enabled
if [[ "$USE_CUDA" == "True" ]]; then
    # Set CUDA environment variables to ensure PyTorch detects GPU
    export CUDA_VISIBLE_DEVICES=0
    echo "CUDA_VISIBLE_DEVICES set to 0"

    # Set CUDA_LAUNCH_BLOCKING to 0 for async operations (better performance)
    export CUDA_LAUNCH_BLOCKING=0
    echo "CUDA_LAUNCH_BLOCKING set to 0 for better performance"
else
    # Explicitly disable CUDA
    export CUDA_VISIBLE_DEVICES=""
    echo "CUDA_VISIBLE_DEVICES explicitly disabled"
fi

# Log the parameters being used
echo "Using training parameters:"
echo "  Learning rate: $LEARNING_RATE"
echo "  Number of epochs: $NUM_TRAIN_EPOCHS"
echo "  Concurrency threads: $CONCURRENCY_THREADS"
echo "  Data synthesis mode: $DATA_SYNTHESIS_MODE"
echo "  Use CUDA: $USE_CUDA"
echo "  Is chain of thought: $IS_COT"

# If concurrency threads are set, configure related environment variables
if [ "$CONCURRENCY_THREADS" != "1" ]; then
    # Limit the number of parallel threads to avoid memory issues
    export OMP_NUM_THREADS=$CONCURRENCY_THREADS
    export MKL_NUM_THREADS=$CONCURRENCY_THREADS
    export NUMEXPR_NUM_THREADS=$CONCURRENCY_THREADS
    # Add torch-specific threading controls
    export PYTORCH_CUDA_ALLOC_CONF=max_split_size_mb:128
    echo "Set thread environment variables to $CONCURRENCY_THREADS"
fi

# Add BF16 option based on the platform
if [ "$PLATFORM" != "apple" ]; then
# Add BF16 option based on the platform and CUDA availability
if [ "$PLATFORM" != "apple" ] && [ "$USE_CUDA" == "True" ]; then
    HALF=True
    echo "Enabling BF16 half precision for non-Apple platform with CUDA"
else
    echo "Using standard precision (not using BF16)"
fi

# Print environment for debugging
echo "Environment configuration:"
echo "  CUDA_VISIBLE_DEVICES: ${CUDA_VISIBLE_DEVICES}"
echo "  PYTORCH_CUDA_ALLOC_CONF: ${PYTORCH_CUDA_ALLOC_CONF}"
echo "  Using half precision: ${HALF}"

# Execute training script with parameters from environment variables
python lpm_kernel/L2/train.py \
    --seed 42 \
@@ -70,7 +113,7 @@ python lpm_kernel/L2/train.py \
    --per_device_train_batch_size 2 \
    --gradient_accumulation_steps $CONCURRENCY_THREADS \
    --gradient_checkpointing True \
    --use_reentrant True \
    --use_reentrant False \
    --use_peft_lora True \
    --lora_r 8 \
    --lora_alpha 16 \
@@ -78,6 +121,7 @@ python lpm_kernel/L2/train.py \
    --lora_target_modules "all-linear" \
    --use_4bit_quantization False \
    --use_nested_quant False \
    --bnb_4bit_compute_dtype "bfloat16"\
    --is_cot $IS_COT
    --bnb_4bit_compute_dtype "bfloat16" \
    --is_cot $IS_COT \
    --use_cuda $USE_CUDA
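Why disabling CUDA works through CUDA_VISIBLE_DEVICES: PyTorch only enumerates the devices that variable exposes, so exporting an empty value before the interpreter initializes CUDA makes torch.cuda.is_available() return False in the training process. A small illustrative check:

```python
# Probe CUDA visibility in a child process with CUDA_VISIBLE_DEVICES cleared.
import os
import subprocess
import sys

probe = "import torch; print(torch.cuda.is_available())"
env = dict(os.environ, CUDA_VISIBLE_DEVICES="")
out = subprocess.run([sys.executable, "-c", probe], env=env,
                     capture_output=True, text=True).stdout.strip()
print(out)  # "False", even on a machine with an NVIDIA GPU
```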
@ -33,12 +33,171 @@ from lpm_kernel.L2.training_prompt import (
|
|||
MEMORY_COT_PROMPT,
|
||||
)
|
||||
|
||||
# Add import for memory manager
|
||||
from .memory_manager import get_memory_manager
|
||||
import gc
|
||||
import requests
|
||||
|
||||
# Initialize the logger
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# Default chat templates for different model formats
|
||||
DEFAULT_CHATML_CHAT_TEMPLATE = "{% for message in messages %}\n{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% if loop.last and add_generation_prompt %}{{'<|im_start|>assistant\n' }}{% endif %}{% endfor %}"
|
||||
DEFAULT_ZEPHYR_CHAT_TEMPLATE = "{% for message in messages %}\n{% if message['role'] == 'user' %}\n{{ '<|user|>\n' + message['content'] + eos_token }}\n{% elif message['role'] == 'system' %}\n{{ '<|system|>\n' + message['content'] + eos_token }}\n{% elif message['role'] == 'assistant' %}\n{{ '<|assistant|>\n' + message['content'] + eos_token }}\n{% endif %}\n{% if loop.last and add_generation_prompt %}\n{{ '<|assistant|>' }}\n{% endif %}\n{% endfor %}"
|
||||
|
||||
|
||||
def release_ollama_models_early():
|
||||
"""Release Ollama models from memory as early as possible before model loading.
|
||||
|
||||
This function uses the Ollama API with keep_alive=0 parameter to properly unload models
|
||||
and free up VRAM before loading the training model.
|
||||
"""
|
||||
try:
|
||||
from lpm_kernel.api.services.user_llm_config_service import UserLLMConfigService
|
||||
import json
|
||||
|
||||
logger.info("Early release of Ollama models to free up VRAM for training")
|
||||
|
||||
# Get current user LLM config to identify models to release
|
||||
user_llm_config_service = UserLLMConfigService()
|
||||
user_llm_config = user_llm_config_service.get_available_llm()
|
||||
|
||||
if not user_llm_config:
|
||||
logger.warning("No user LLM configuration found. Skipping Ollama model release.")
|
||||
return
|
||||
|
||||
# Track which models have been released
|
||||
released_models = set()
|
||||
|
||||
def get_generate_url(base_endpoint):
|
||||
"""Helper function to get the API endpoint for unloading models"""
|
||||
if not base_endpoint:
|
||||
return None
|
||||
|
||||
base_url = base_endpoint.rstrip("/")
|
||||
|
||||
# Convert to API base URL if needed (may be v1 format or direct ollama format)
|
||||
if "/v1/" in base_url:
|
||||
api_base = base_url.split("/v1/")[0]
|
||||
return f"{api_base}/api/generate"
|
||||
else:
|
||||
# Check if this is a non-localhost Ollama instance
|
||||
if "ollama" in base_url.lower():
|
||||
if "localhost" in base_url or "127.0.0.1" in base_url:
|
||||
return "http://localhost:11434/api/generate"
|
||||
else:
|
||||
# Extract the base URL and use it
|
||||
parts = base_url.split("//")
|
||||
if len(parts) > 1:
|
||||
host = parts[1].split("/")[0]
|
||||
return f"{parts[0]}//{host}/api/generate"
|
||||
|
||||
# Default ollama endpoint as fallback
|
||||
return "http://localhost:11434/api/generate"
|
||||
|
||||
# Release chat model if using Ollama
|
||||
if "ollama" in user_llm_config.chat_endpoint.lower() and user_llm_config.chat_model_name:
|
||||
chat_model = user_llm_config.chat_model_name
|
||||
generate_url = get_generate_url(user_llm_config.chat_endpoint)
|
||||
|
||||
if not generate_url:
|
||||
logger.warning(f"Could not determine API endpoint for chat model: {chat_model}")
|
||||
else:
|
||||
logger.info(f"Releasing Ollama chat model: {chat_model} via {generate_url}")
|
||||
|
||||
try:
|
||||
# Set up headers with API key if provided
|
||||
headers = {
|
||||
"Content-Type": "application/json"
|
||||
}
|
||||
if user_llm_config.chat_api_key:
|
||||
headers["Authorization"] = f"Bearer {user_llm_config.chat_api_key}"
|
||||
|
||||
# Use the proper generate endpoint with keep_alive=0 to unload
|
||||
payload = {
|
||||
"model": chat_model,
|
||||
"keep_alive": 0,
|
||||
"prompt": " " # Minimal prompt needed for request
|
||||
}
|
||||
|
||||
unload_response = requests.post(
|
||||
generate_url,
|
||||
headers=headers,
|
||||
data=json.dumps(payload),
|
||||
timeout=30 # Add timeout to prevent hanging
|
||||
)
|
||||
|
||||
if unload_response.status_code < 300:
|
||||
logger.info(f"✅ Successfully unloaded chat model: {chat_model}")
|
||||
released_models.add(chat_model)
|
||||
else:
|
||||
logger.warning(f"Failed to unload model via API: {unload_response.status_code} - {unload_response.text}")
|
||||
except Exception as e:
|
||||
logger.warning(f"Failed to release chat model {chat_model}: {str(e)}")
|
||||
|
||||
# Release embedding model if different from chat model and using Ollama
|
||||
if (user_llm_config.embedding_model_name and
|
||||
"ollama" in user_llm_config.embedding_endpoint.lower() and
|
||||
user_llm_config.embedding_model_name != user_llm_config.chat_model_name and
|
||||
user_llm_config.embedding_model_name not in released_models):
|
||||
|
||||
embedding_model = user_llm_config.embedding_model_name
|
||||
generate_url = get_generate_url(user_llm_config.embedding_endpoint)
|
||||
|
||||
if not generate_url:
|
||||
logger.warning(f"Could not determine API endpoint for embedding model: {embedding_model}")
|
||||
else:
|
||||
logger.info(f"Releasing Ollama embedding model: {embedding_model} via {generate_url}")
|
||||
|
||||
try:
|
||||
# Set up headers with API key if provided
|
||||
headers = {
|
||||
"Content-Type": "application/json"
|
||||
}
|
||||
if user_llm_config.embedding_api_key:
|
||||
headers["Authorization"] = f"Bearer {user_llm_config.embedding_api_key}"
|
||||
|
||||
# Use the proper generate endpoint with keep_alive=0 to unload
|
||||
payload = {
|
||||
"model": embedding_model,
|
||||
"keep_alive": 0,
|
||||
"prompt": " " # Minimal prompt needed for request
|
||||
}
|
||||
|
||||
unload_response = requests.post(
|
||||
generate_url,
|
||||
headers=headers,
|
||||
data=json.dumps(payload),
|
||||
timeout=30 # Add timeout to prevent hanging
|
||||
)
|
||||
|
||||
if unload_response.status_code < 300:
|
||||
logger.info(f"✅ Successfully unloaded embedding model: {embedding_model}")
|
||||
released_models.add(embedding_model)
|
||||
else:
|
||||
logger.warning(f"Failed to unload model via API: {unload_response.status_code} - {unload_response.text}")
|
||||
except Exception as e:
|
||||
logger.warning(f"Failed to release embedding model {embedding_model}: {str(e)}")
|
||||
|
||||
# Final cleanup and verification
|
||||
if torch.cuda.is_available():
|
||||
torch.cuda.empty_cache()
|
||||
memory_info = get_memory_manager().get_memory_info()
|
||||
vram_used = memory_info.get('vram_used_gb', 0)
|
||||
vram_total = memory_info.get('vram_total_gb', 1)
|
||||
logger.info(f"VRAM after early model release: {vram_used:.2f}GB / {vram_total:.2f}GB ({vram_used/vram_total*100:.1f}%)")
|
||||
|
||||
if released_models:
|
||||
logger.info(f"Early release completed for {len(released_models)} Ollama models: {', '.join(released_models)}")
|
||||
else:
|
||||
logger.info("No Ollama models were released early")
|
||||
|
||||
except Exception as e:
|
||||
import traceback
|
||||
logger.error(f"Error in early Ollama model release: {str(e)}")
|
||||
logger.error(traceback.format_exc())
|
||||
|
||||
|
||||
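After the release above, it can be worth confirming that the models were actually evicted. Assuming the Ollama server exposes its standard GET /api/ps endpoint (which lists currently loaded models) at the default local address, a hedged check looks like this:

```python
# Hedged verification sketch - default local Ollama endpoint assumed.
import requests

resp = requests.get("http://localhost:11434/api/ps", timeout=10)
resp.raise_for_status()
loaded = [m.get("name") for m in resp.json().get("models", [])]
print("Models still resident in Ollama:", loaded or "none")
```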
def count_tokens_from_string(string: str, encoding_name: str = "cl100k_base") -> int:
|
||||
"""Returns the number of tokens in a text string using a specified encoding.
|
||||
|
||||
|
@ -105,21 +264,31 @@ class ZephyrSpecialTokens(str, Enum):
|
|||
"""Returns a list of all special tokens."""
|
||||
return [token.value for token in cls]
|
||||
|
||||
def create_and_prepare_model(args, data_args, training_args):
|
||||
def create_and_prepare_model(args, data_args, training_args, model_kwargs=None):
|
||||
"""Creates and prepares a model for training.
|
||||
|
||||
Args:
|
||||
args: Model arguments containing model configuration.
|
||||
data_args: Data arguments for training.
|
||||
training_args: Training configuration arguments.
|
||||
model_kwargs: Additional kwargs to pass to the model loading function.
|
||||
|
||||
Returns:
|
||||
Tuple of (model, tokenizer) ready for training.
|
||||
Tuple of (model, tokenizer, peft_config) ready for training.
|
||||
"""
|
||||
# Get the memory manager for adaptive loading
|
||||
memory_manager = get_memory_manager()
|
||||
model_kwargs = model_kwargs or {}
|
||||
|
||||
# Release Ollama models early before we load any models
|
||||
if torch.cuda.is_available() and args.use_cuda:
|
||||
release_ollama_models_early()
|
||||
# Force cleanup memory after releasing Ollama models
|
||||
memory_manager.cleanup_memory(force=True)
|
||||
|
||||
if args.use_unsloth:
|
||||
from unsloth import FastLanguageModel
|
||||
bnb_config = None
|
||||
quant_storage_dtype = None
|
||||
|
||||
if (
|
||||
torch.distributed.is_available()
|
||||
|
@ -129,55 +298,142 @@ def create_and_prepare_model(args, data_args, training_args):
|
|||
):
|
||||
raise NotImplementedError("Unsloth is not supported in distributed training")
|
||||
|
||||
if args.use_4bit_quantization:
|
||||
compute_dtype = getattr(torch, args.bnb_4bit_compute_dtype)
|
||||
quant_storage_dtype = getattr(torch, args.bnb_4bit_quant_storage_dtype)
|
||||
# Clean up memory before loading model
|
||||
memory_manager.cleanup_memory()
|
||||
|
||||
# Check for CUDA availability and use it if enabled
|
||||
cuda_available = torch.cuda.is_available()
|
||||
use_cuda_requested = args.use_cuda
|
||||
device = "cpu"
|
||||
|
||||
bnb_config = BitsAndBytesConfig(
|
||||
load_in_4bit=args.use_4bit_quantization,
|
||||
bnb_4bit_quant_type=args.bnb_4bit_quant_type,
|
||||
bnb_4bit_compute_dtype=compute_dtype,
|
||||
bnb_4bit_use_double_quant=args.use_nested_quant,
|
||||
bnb_4bit_quant_storage=quant_storage_dtype,
|
||||
)
|
||||
# Always enable memory-adaptive loading by default (device_map="auto"), unless CUDA is off
|
||||
if cuda_available and use_cuda_requested:
|
||||
device = "cuda"
|
||||
model_kwargs["device_map"] = "auto"
|
||||
else:
|
||||
if use_cuda_requested and not cuda_available:
|
||||
logger.warning("⚠️ CUDA was requested but is not available on this system. Falling back to CPU.")
|
||||
elif cuda_available and not use_cuda_requested:
|
||||
logger.info("ℹ️ CUDA is available but not requested. Using CPU as specified.")
|
||||
else:
|
||||
logger.info("ℹ️ CUDA is not available. Using CPU for training.")
|
||||
# Explicitly remove device_map to force CPU-only
|
||||
if "device_map" in model_kwargs:
|
||||
model_kwargs.pop("device_map")
|
||||
logger.info("Using CPU for model training and inference.")
|
||||
|
||||
if compute_dtype == torch.float16 and args.use_4bit_quantization:
|
||||
major, _ = torch.cuda.get_device_capability()
|
||||
if major >= 8:
|
||||
logging.info("=" * 80)
|
||||
logging.info(
|
||||
"Your GPU supports bfloat16, you can accelerate training with the argument --bf16"
|
||||
)
|
||||
logging.info("=" * 80)
|
||||
# Configure quantization based on available memory
|
||||
# Use model_kwargs quantization_config if provided, otherwise build it
|
||||
if "quantization_config" not in model_kwargs:
|
||||
if args.use_4bit_quantization:
|
||||
compute_dtype = getattr(torch, args.bnb_4bit_compute_dtype)
|
||||
quant_storage_dtype = getattr(torch, args.bnb_4bit_quant_storage_dtype)
|
||||
|
||||
bnb_config = BitsAndBytesConfig(
|
||||
load_in_4bit=args.use_4bit_quantization,
|
||||
bnb_4bit_quant_type=args.bnb_4bit_quant_type,
|
||||
bnb_4bit_compute_dtype=compute_dtype,
|
||||
bnb_4bit_use_double_quant=args.use_nested_quant,
|
||||
bnb_4bit_quant_storage=quant_storage_dtype,
|
||||
)
|
||||
model_kwargs["quantization_config"] = bnb_config
|
||||
|
||||
if compute_dtype == torch.float16 and args.use_4bit_quantization:
|
||||
major, _ = torch.cuda.get_device_capability() if torch.cuda.is_available() else (0, 0)
|
||||
if major >= 8:
|
||||
logger.info("Your GPU supports bfloat16, you can accelerate training with the argument --bf16")
|
||||
elif args.use_8bit_quantization:
|
||||
bnb_config = BitsAndBytesConfig(load_in_8bit=args.use_8bit_quantization)
|
||||
model_kwargs["quantization_config"] = bnb_config
|
||||
|
||||
if args.use_unsloth:
|
||||
# Load model
|
||||
model, _ = FastLanguageModel.from_pretrained(
|
||||
model_name=args.model_name_or_path,
|
||||
max_seq_length=data_args.max_seq_length,
|
||||
dtype=None,
|
||||
load_in_4bit=args.use_4bit_quantization,
|
||||
)
|
||||
else:
|
||||
if os.getenv("PLATFORM") != "apple":
|
||||
model = AutoModelForCausalLM.from_pretrained(
|
||||
args.model_name_or_path,
|
||||
quantization_config=bnb_config,
|
||||
trust_remote_code=True,
|
||||
attn_implementation="flash_attention_2" if args.use_flash_attn else "eager",
|
||||
torch_dtype=torch.bfloat16
|
||||
)
|
||||
# Load model with memory-adaptive approach
|
||||
model = None
|
||||
tokenizer = None
|
||||
peft_config = None
|
||||
|
||||
try:
|
||||
# First try loading the model with the requested configuration
|
||||
if args.use_unsloth:
|
||||
# Load model with Unsloth using memory manager
|
||||
unsloth_kwargs = {
|
||||
"model_name": args.model_name_or_path,
|
||||
"max_seq_length": data_args.max_seq_length,
|
||||
"dtype": None,
|
||||
"load_in_4bit": args.use_4bit_quantization,
|
||||
"load_in_8bit": args.use_8bit_quantization,
|
||||
"trust_remote_code": True,
|
||||
"device_map": model_kwargs.get("device_map", "auto") if args.use_cuda and torch.cuda.is_available() else None,
|
||||
}
|
||||
|
||||
logger.info(f"Loading model with Unsloth with parameters: {unsloth_kwargs}")
|
||||
model, _ = FastLanguageModel.from_pretrained(**unsloth_kwargs)
|
||||
|
||||
else:
|
||||
# Load model with standard approach
|
||||
load_kwargs = {
|
||||
"trust_remote_code": True,
|
||||
}
|
||||
|
||||
# Use any provided model_kwargs
|
||||
load_kwargs.update(model_kwargs)
|
||||
|
||||
if "attn_implementation" not in load_kwargs and args.use_flash_attn:
|
||||
load_kwargs["attn_implementation"] = "flash_attention_2"
|
||||
|
||||
# Set default device_map if not specified
|
||||
if "device_map" not in load_kwargs and args.use_cuda and torch.cuda.is_available():
|
||||
load_kwargs["device_map"] = "auto"
|
||||
|
||||
logger.info(f"Loading model with parameters: {load_kwargs}")
|
||||
model = AutoModelForCausalLM.from_pretrained(args.model_name_or_path, **load_kwargs)
|
||||
|
||||
except (RuntimeError, torch.cuda.OutOfMemoryError, MemoryError) as e:
|
||||
# If standard approaches fail, try progressive fallbacks
|
||||
logger.warning(f"Failed to load model with standard settings: {str(e)}")
|
||||
logger.info("Falling back to adaptive model loading...")
|
||||
|
||||
# First cleanup to ensure maximum memory available
|
||||
memory_manager.cleanup_memory(force=True)
|
||||
|
||||
try:
|
||||
# Try with simpler configuration - float16 instead of bfloat16
|
||||
logger.info("Attempting to load with float16 precision...")
|
||||
model = AutoModelForCausalLM.from_pretrained(
|
||||
args.model_name_or_path,
|
||||
quantization_config=bnb_config,
|
||||
device_map="auto" if torch.cuda.is_available() and args.use_cuda else None,
|
||||
torch_dtype=torch.float16 if torch.cuda.is_available() and args.use_cuda else None,
|
||||
trust_remote_code=True
|
||||
)
|
||||
except (RuntimeError, torch.cuda.OutOfMemoryError, MemoryError) as e:
|
||||
# If that fails too, try even more conservative loading
|
||||
logger.warning(f"Float16 loading failed: {str(e)}")
|
||||
memory_manager.cleanup_memory(force=True)
|
||||
|
||||
try:
|
||||
# Try with CPU offloading and gradual loading
|
||||
logger.info("Attempting most conservative loading with CPU offloading...")
|
||||
model = AutoModelForCausalLM.from_pretrained(
|
||||
args.model_name_or_path,
|
||||
device_map="auto",
|
||||
offload_folder="offload_folder",
|
||||
offload_state_dict=True,
|
||||
torch_dtype=torch.float16 if torch.cuda.is_available() else None,
|
||||
trust_remote_code=True,
|
||||
low_cpu_mem_usage=True
|
||||
)
|
||||
except Exception as e:
|
||||
# If all fallbacks fail, it's a fatal error
|
||||
logger.error(f"All adaptive loading approaches failed: {str(e)}")
|
||||
raise RuntimeError(f"Failed to load model with any memory adaptation technique: {str(e)}")
|
||||
|
||||
peft_config = None
|
||||
chat_template = None
|
||||
# If still not loaded, it's a fatal error
|
||||
if model is None:
|
||||
raise RuntimeError("Failed to load model with any memory adaptation technique")
|
||||
|
||||
# Apply memory optimization to model
|
||||
model = memory_manager.optimize_model_for_training(model)
|
||||
|
||||
# Configure LoRA if requested
|
||||
if args.use_peft_lora and not args.use_unsloth:
|
||||
peft_config = LoraConfig(
|
||||
lora_alpha=args.lora_alpha,
|
||||
|
@ -190,6 +446,7 @@ def create_and_prepare_model(args, data_args, training_args):
|
|||
else args.lora_target_modules,
|
||||
)
|
||||
|
||||
# Load tokenizer - tokenizers are usually small and don't need memory management
|
||||
special_tokens = None
|
||||
chat_template = None
|
||||
if args.chat_template_format == "chatml":
|
||||
|
@ -214,22 +471,43 @@ def create_and_prepare_model(args, data_args, training_args):
|
|||
tokenizer = AutoTokenizer.from_pretrained(
|
||||
args.model_name_or_path, trust_remote_code=True
|
||||
)
|
||||
tokenizer.pad_token = tokenizer.eos_token
|
||||
# Make sure pad_token is set
|
||||
if tokenizer.pad_token is None:
|
||||
tokenizer.pad_token = tokenizer.eos_token
|
||||
|
||||
# Apply Unsloth LoRA if requested and check memory status
|
||||
if args.use_unsloth:
|
||||
# Do model patching and add fast LoRA weights
|
||||
model = FastLanguageModel.get_peft_model(
|
||||
model,
|
||||
lora_alpha=args.lora_alpha,
|
||||
lora_dropout=args.lora_dropout,
|
||||
r=args.lora_r,
|
||||
target_modules=args.lora_target_modules.split(",")
|
||||
if args.lora_target_modules != "all-linear"
|
||||
else args.lora_target_modules,
|
||||
use_gradient_checkpointing=training_args.gradient_checkpointing,
|
||||
random_state=training_args.seed,
|
||||
max_seq_length=data_args.max_seq_length,
|
||||
)
|
||||
try:
|
||||
# Clean up first
|
||||
memory_manager.cleanup_memory()
|
||||
|
||||
# Apply LoRA with memory monitoring
|
||||
model = FastLanguageModel.get_peft_model(
|
||||
model,
|
||||
lora_alpha=args.lora_alpha,
|
||||
lora_dropout=args.lora_dropout,
|
||||
r=args.lora_r,
|
||||
target_modules=args.lora_target_modules.split(",")
|
||||
if args.lora_target_modules != "all-linear"
|
||||
else args.lora_target_modules,
|
||||
use_gradient_checkpointing=training_args.gradient_checkpointing,
|
||||
random_state=training_args.seed,
|
||||
max_seq_length=data_args.max_seq_length,
|
||||
)
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Failed to apply Unsloth LoRA: {str(e)}")
|
||||
# If Unsloth fails, we might need to fall back to standard training
|
||||
if args.use_cuda and torch.cuda.is_available():
|
||||
logger.warning("Low VRAM detected, moving model to CPU")
|
||||
model = model.cpu()
|
||||
torch.cuda.empty_cache()
|
||||
|
||||
# Final memory status check
|
||||
memory_info = memory_manager.get_memory_info()
|
||||
logger.info(f"Memory after model preparation: RAM: {memory_info['ram_used_gb']:.2f}GB / {memory_info['ram_total_gb']:.2f}GB")
|
||||
if torch.cuda.is_available():
|
||||
logger.info(f"VRAM: {memory_info.get('vram_used_gb', 0):.2f}GB / {memory_info.get('vram_total_gb', 0):.2f}GB")
|
||||
|
||||
return model, peft_config, tokenizer
|
||||
|
||||
|
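A hedged sketch of how a caller such as train.py's main() would use the new model_kwargs parameter shown above; the wrapper function name is invented for illustration, and the argument objects are assumed to come from the usual HfArgumentParser setup elsewhere in the training entry point.

```python
# Hedged usage sketch of create_and_prepare_model with explicit model_kwargs.
import torch
from lpm_kernel.L2.utils import create_and_prepare_model

def load_for_training(model_args, data_args, training_args):
    model_kwargs = {"trust_remote_code": True}
    if torch.cuda.is_available() and model_args.use_cuda:
        model_kwargs["device_map"] = "auto"  # let accelerate place layers across GPU/CPU
    # Returns (model, peft_config, tokenizer), matching the function's return order.
    return create_and_prepare_model(
        model_args, data_args, training_args, model_kwargs=model_kwargs
    )
```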
@ -343,11 +621,11 @@ def setup_logger(log_path, logger_name="download_logger"):
|
|||
return logger
|
||||
|
||||
|
||||
def save_hf_model(model_name="Qwen2.5-0.5B-Instruct", log_file_path=None) -> str:
|
||||
def save_hf_model(model_name=None, log_file_path=None) -> str:
|
||||
"""Saves a Hugging Face model locally.
|
||||
|
||||
Args:
|
||||
model_name: Name of the model to save. Defaults to "Qwen2.5-0.5B-Instruct".
|
||||
model_name: Name of the model to save. If None, will attempt to get from config.
|
||||
log_file_path: Path to save download logs. If None, uses default path.
|
||||
|
||||
Returns:
|
||||
|
@ -360,25 +638,39 @@ def save_hf_model(model_name="Qwen2.5-0.5B-Instruct", log_file_path=None) -> str
|
|||
# Setup logging
|
||||
logger = setup_logger(log_file_path)
|
||||
|
||||
# If no model name provided, attempt to get from training configuration
|
||||
if not model_name:
|
||||
try:
|
||||
from lpm_kernel.configs.config import Config
|
||||
config = Config()
|
||||
model_name = config.get("training", {}).get("model_name")
|
||||
if not model_name:
|
||||
logger.warning("No model name provided and none found in config. Using Qwen2.5-0.5B-Instruct as fallback.")
|
||||
model_name = "Qwen2.5-0.5B-Instruct"
|
||||
except Exception as e:
|
||||
logger.warning(f"Failed to get model name from config: {str(e)}. Using Qwen2.5-0.5B-Instruct as fallback.")
|
||||
model_name = "Qwen2.5-0.5B-Instruct"
|
||||
|
||||
base_dir = os.path.join(os.getcwd(), "resources/L2/base_models")
|
||||
# Normalize model name and check for path traversal attempts
|
||||
normalized_model_name = os.path.normpath(model_name)
|
||||
if ".." in normalized_model_name or normalized_model_name.startswith("/"):
|
||||
raise ValueError("Invalid model name")
|
||||
raise ValueError("Invalid model name - potential path traversal attempt")
|
||||
|
||||
# Prepare save path
|
||||
save_path = os.path.join(base_dir, normalized_model_name)
|
||||
os.makedirs(save_path, exist_ok=True)
|
||||
|
||||
from huggingface_hub import list_repo_files, configure_http_backend
|
||||
import requests
|
||||
from huggingface_hub import list_repo_files, configure_http_backend, hf_hub_download
|
||||
from tqdm import tqdm
|
||||
from tqdm.contrib.concurrent import thread_map
|
||||
import shutil
|
||||
from concurrent.futures import ThreadPoolExecutor
|
||||
import traceback
|
||||
|
||||
# Set a higher timeout, but remove the unsupported pool_size parameter
|
||||
# Configure HTTP backend more simply
|
||||
try:
|
||||
# Try using the timeout parameter to configure
|
||||
configure_http_backend(timeout=100.0)
|
||||
except TypeError:
|
||||
# If the timeout parameter is also not supported, do not use any parameters
|
||||
except Exception as e:
|
||||
logger.warning(f"Failed to configure HTTP backend with timeout: {e}")
|
||||
try:
|
||||
configure_http_backend()
|
||||
except Exception as e:
|
||||
|
@ -391,31 +683,31 @@ def save_hf_model(model_name="Qwen2.5-0.5B-Instruct", log_file_path=None) -> str
|
|||
hf_model_name = f"Qwen/{model_name}"
|
||||
|
||||
    try:
        # First get the list of all files in the repository
        # Get list of files to download
        files = list_repo_files(hf_model_name)
        logger.info(f"Found {len(files)} files to download from {hf_model_name}")

        # Define a function for downloading a single file and recording progress
        def download_file_with_progress(file_info):
            filename, file_path = file_info

        def download_file_with_progress(filename):
            """Download a single file from the model repository"""
            local_file_path = os.path.join(save_path, filename)

            # Create directories if they don't exist
            os.makedirs(os.path.dirname(local_file_path), exist_ok=True)

            # Check if file already exists and is not empty
            if os.path.exists(local_file_path) and os.path.getsize(local_file_path) > 0:
                logger.info(f"File already exists: {filename}")
                return True

            try:
                # Build the download URL
                url = f"https://huggingface.co/{hf_model_name}/resolve/main/{filename}"

                # Target file path
                local_file_path = os.path.join(save_path, filename)
                os.makedirs(os.path.dirname(local_file_path), exist_ok=True)

                # Check if the file already exists
                if os.path.exists(local_file_path):
                    logger.info(f"File already exists: {filename}")
                    return filename, True

                # Get file size
                response = requests.head(url)
                total_size = int(response.headers.get('content-length', 0))

                # If the size cannot be obtained, set a default value or do not display the percentage
                # If the size cannot be obtained, set a default value
                if total_size == 0:
                    logger.info(f"Starting download of file: {filename} (Size unknown)")
                else:

@@ -423,7 +715,7 @@ def save_hf_model(model_name="Qwen2.5-0.5B-Instruct", log_file_path=None) -> str

                # Create the file to write to
                with open(local_file_path, 'wb') as f:
                    # Create a progress bar, if the total size is unknown, set it to None
                    # Create a progress bar
                    progress_bar = tqdm(
                        total=total_size if total_size > 0 else None,
                        unit='iB',

@@ -432,84 +724,87 @@ def save_hf_model(model_name="Qwen2.5-0.5B-Instruct", log_file_path=None) -> str
                        disable=False
                    )

                    # Define the progress callback function
                    # Define progress callback
                    def progress_callback(current, total):
                        # Update the progress bar
                        progress_bar.update(current - progress_bar.n)

                        # Record the log every 1MB (or a value close to 1MB)
                        if current % (1024 * 1024) < 8192:  # Record every 1MB
                            # Ensure total is greater than 0 before calculating the percentage
                            if total and total > 0:  # Use and to ensure total is not None and greater than 0
                        # Log progress every ~1MB
                        if current % (1024 * 1024) < 8192:
                            if total and total > 0:
                                percent = current / total * 100
                                logger.info(f"File {filename}: Downloaded {current/1024/1024:.2f} MB / {total/1024/1024:.2f} MB ({percent:.2f}%)")
                            else:
                                # If the total size is unknown or 0, only show the downloaded size
                                logger.info(f"File {filename}: Downloaded {current/1024/1024:.2f} MB (total size unknown)")

                    # Use the request library to download the file and update the progress
                    # Download file with progress tracking
                    response = requests.get(url, stream=True)
                    if response.status_code == 200:
                        downloaded = 0

                        # Check if the response contains the Content-Length header information
                        # Update total size if needed
                        actual_total = int(response.headers.get('content-length', 0))
                        if actual_total > 0 and (total_size == 0 or total_size != actual_total):
                            # If the HEAD request did not return the correct size, but the GET request did, then update the total size
                            total_size = actual_total
                            logger.info(f"Updated file size for {filename}: {total_size / 1024 / 1024:.2f} MB")
                            progress_bar.total = total_size
                            progress_bar.refresh()

                        for chunk in response.iter_content(chunk_size=8192):
                            if chunk:  # Filter out empty chunks that keep the connection alive
                            if chunk:
                                f.write(chunk)
                                downloaded += len(chunk)
                                progress_callback(downloaded, total_size)

                        progress_bar.close()
                        logger.info(f"Completed download of file: {filename}")
                        return filename, True
                        return True
                    else:
                        logger.error(f"Failed to download {filename}: HTTP status {response.status_code}")
                        return filename, False
                        failed_files.append(filename)
                        return False

            except Exception as e:
                logger.error(f"Error downloading {filename}: {str(e)}")
                return filename, False
                logger.error(f"Failed to download {filename}: {str(e)}")
                failed_files.append(filename)
                return False

        # Keep track of failed files for potential retry
        failed_files = []

        # Create a list of file path information
        file_infos = [(filename, os.path.join(save_path, filename)) for filename in files]

        # Use a thread pool to download all files in parallel
        logger.info(f"Starting parallel download of {len(file_infos)} files")

        # Use a thread pool to download all files in parallel
        from concurrent.futures import ThreadPoolExecutor

        # Limit the number of concurrent requests to avoid too many requests
        max_workers = min(10, len(file_infos))
        results = []
        # Use ThreadPoolExecutor for parallel downloads with controlled concurrency
        max_workers = min(8, len(files))  # Limit concurrent downloads to avoid overloading
        successful_downloads = 0

        with ThreadPoolExecutor(max_workers=max_workers) as executor:
            # Submit all download tasks
            future_to_file = {executor.submit(download_file_with_progress, file_info): file_info[0]
                              for file_info in file_infos}

            # Wait for all tasks to complete and collect results
            for future in tqdm(future_to_file, desc="Overall Progress", unit="file"):
                filename = future_to_file[future]
                try:
            # Create a progress bar for overall download progress
            with tqdm(total=len(files), desc="Downloading model files") as progress:
                futures = [executor.submit(download_file_with_progress, file) for file in files]

                for future in futures:
                    result = future.result()
                    results.append(result)
                    logger.info(f"Finished processing {filename}")
                except Exception as exc:
                    logger.error(f'{filename} generated an exception: {exc}')
                    results.append((filename, False))
                    if result:
                        successful_downloads += 1
                    progress.update(1)

                    # Report progress periodically
                    if progress.n % 5 == 0 or progress.n == len(files):
                        logger.info(f"Downloaded {progress.n}/{len(files)} files ({successful_downloads} successful)")

        # Check the download results
        success_count = sum(1 for _, success in results if success)
        logger.info(f"Download completed. Successfully downloaded {success_count}/{len(files)} files.")
        # Handle any failed downloads
        if failed_files:
            logger.warning(f"Failed to download {len(failed_files)} files. First few: {failed_files[:5]}")

            # If most files failed, there might be an issue with the model repository
            if len(failed_files) > len(files) * 0.5:
                logger.error(f"More than 50% of files failed to download. There might be an issue with the model repository.")
                raise RuntimeError("Too many files failed to download")

            # If critical files failed (like model weights or config), warn specifically
            critical_patterns = ['model.safetensors', 'config.json', 'tokenizer.json']
            critical_failed = [f for f in failed_files if any(pattern in f for pattern in critical_patterns)]
            if critical_failed:
                logger.error(f"Failed to download critical files: {critical_failed}")
                raise RuntimeError(f"Failed to download critical model files: {', '.join(critical_failed)}")

        # Record the download completion information
        try:

@@ -527,15 +822,14 @@ def save_hf_model(model_name="Qwen2.5-0.5B-Instruct", log_file_path=None) -> str
            raise
    except KeyboardInterrupt:
        logger.warning(f"Download interrupted by user for model: {model_name}")
        # Clean up partial downloads
        raise
    except Exception as e:
        # Log any errors that occur
        logger.error(f"Error downloading model: {str(e)}")
        logger.error(traceback.format_exc())
        raise

    return save_path


def format_timestr(utc_time_str):
    """Formats a UTC time string to a more readable format.
@@ -349,3 +349,56 @@ def get_document_embedding(document_id: int):
        return jsonify(
            APIResponse.error(message=f"Error getting document embedding: {str(e)}")
        )


@document_bp.route("/documents/verify-embeddings", methods=["GET"])
def verify_document_embeddings():
    """Verify all document embeddings and return statistics"""
    try:
        verbose = request.args.get("verbose", "").lower() == "true"
        results = document_service.verify_document_embeddings(verbose=verbose)
        return jsonify(APIResponse.success(data=results))

    except Exception as e:
        logger.error(f"Error verifying document embeddings: {str(e)}", exc_info=True)
        return jsonify(APIResponse.error(message=f"Error verifying document embeddings: {str(e)}"))


@document_bp.route("/documents/repair", methods=["POST"])
def repair_documents():
    """Repair documents with missing analysis and embeddings"""
    try:
        # First, fix missing document analysis (summaries and insights)
        fixed_analysis_count = document_service.fix_missing_document_analysis()

        # Get verification results after fixing analysis
        verification_results = document_service.verify_document_embeddings(verbose=False)

        # Process documents with missing embeddings
        documents_fixed = 0
        for doc in document_service._repository.list():
            embedding = document_service.get_document_embedding(doc.id)
            if doc.raw_content and embedding is None:
                try:
                    document_service.process_document_embedding(doc.id)
                    # Also process chunk embeddings
                    document_service.generate_document_chunk_embeddings(doc.id)
                    documents_fixed += 1
                except Exception as e:
                    logger.error(f"Error processing document {doc.id} embedding: {str(e)}")

        # Get final verification results
        final_results = document_service.verify_document_embeddings(verbose=False)

        return jsonify(APIResponse.success(
            data={
                "analysis_fixed": fixed_analysis_count,
                "embeddings_fixed": documents_fixed,
                "initial_state": verification_results,
                "final_state": final_results
            }
        ))

    except Exception as e:
        logger.error(f"Error repairing documents: {str(e)}", exc_info=True)
        return jsonify(APIResponse.error(message=f"Error repairing documents: {str(e)}"))
@@ -3,6 +3,8 @@ import logging
import os
import time
import sys
import torch  # Add torch import for CUDA detection
import traceback
from dataclasses import asdict

from flask import Blueprint, jsonify, Response, request

@@ -352,6 +354,19 @@ def train2():
        model_name = data["model_name"]
        paths = get_model_paths(model_name)

        # Get optional parameters with defaults
        learning_rate = data.get("learning_rate", 2e-4)
        num_train_epochs = data.get("number_of_epochs", 3)
        concurrency_threads = data.get("concurrency_threads", 2)
        data_synthesis_mode = data.get("data_synthesis_mode", "low")
        use_cuda = data.get("use_cuda", False)

        # Convert use_cuda to string "True" or "False" for the shell script
        use_cuda_str = "True" if use_cuda else "False"

        logger.info(f"Training configuration: learning_rate={learning_rate}, epochs={num_train_epochs}, "
                    f"threads={concurrency_threads}, mode={data_synthesis_mode}, use_cuda={use_cuda} ({use_cuda_str})")

        # Check if model exists
        if not os.path.exists(paths["base_path"]):
            return jsonify(APIResponse.error(

@@ -385,29 +400,61 @@ def train2():

        script_path = os.path.join(os.getcwd(), "lpm_kernel/L2/train_for_user.sh")

        # Build command arguments
        cmd_args = [
            "--lr", str(learning_rate),
            "--epochs", str(num_train_epochs),
            "--threads", str(concurrency_threads),
            "--mode", str(data_synthesis_mode),
            "--cuda", use_cuda_str  # Use the properly formatted string
        ]

        # Start training
        import threading
        _training_thread = threading.Thread(
            target=start_training,
            args=(script_path, log_path),
            target=start_training_with_args,
            args=(script_path, log_path, cmd_args),
            daemon=True
        )
        _training_thread.start()

        return jsonify(APIResponse.success(
            data={
                "status": "training_started",
                "model_name": model_name,
                "log_path": log_path,
                "personal_dir": paths["personal_dir"],
                "merged_dir": paths["merged_dir"]
            },
            message="Training task started"
            message="Training task started successfully"
        ))

    except Exception as e:
        error_msg = f"Failed to start training: {str(e)}"
        logger.error(error_msg)
        return jsonify(APIResponse.error(message=error_msg, code=500))
        logger.error(f"Error starting training task: {str(e)}")
        traceback.print_exc()
        return jsonify(APIResponse.error(message=f"Failed to start training: {str(e)}"))


def start_training_with_args(script_path: str, log_path: str, args: list) -> None:
    """Start training with additional arguments"""
    global _training_process
    try:
        # Convert script path and args to a command
        cmd = [script_path] + args

        # Use ScriptRunner to execute the script
        runner = ScriptRunner(log_path=log_path)
        _training_process = runner.execute_script(
            script_path=script_path,
            script_type="training",
            is_python=False,  # This is a bash script
            args=args
        )

        logger.info(f"Training process started with args: {args}, process: {_training_process}")

    except Exception as e:
        logger.error(f"Failed to start training process: {str(e)}")
        _training_process = None
        raise
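
For reference, a minimal sketch of the command line these arguments produce when handed to train_for_user.sh; the flag names and script path come from the diff above, while the values are illustrative only:

# Sketch only: values are examples, not taken from a real run.
cmd_args = [
    "--lr", "0.0002",
    "--epochs", "3",
    "--threads", "2",
    "--mode", "low",
    "--cuda", "True",   # string form expected by the shell script
]
print(" ".join(["lpm_kernel/L2/train_for_user.sh"] + cmd_args))
# -> lpm_kernel/L2/train_for_user.sh --lr 0.0002 --epochs 3 --threads 2 --mode low --cuda True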

@kernel2_bp.route("/merge_weights", methods=["POST"])
@@ -550,6 +597,9 @@ def start_llama_server():
        return jsonify(APIResponse.error(message="Missing required parameter: model_name", code=400))

    model_name = data["model_name"]
    # Get optional use_gpu parameter with default value of True
    use_gpu = data.get("use_gpu", True)

    paths = get_model_paths(model_name)
    gguf_path = os.path.join(paths["gguf_dir"], "model.gguf")

@@ -564,53 +614,34 @@ def start_llama_server():
    server_executable = "llama-server"
    server_path = os.path.join(server_path, server_executable)

    # Check if service and model file exist
    if not os.path.exists(server_path):
        return jsonify(APIResponse.error(message="llama-server executable file does not exist", code=400))
    # Check if model file exists
    if not os.path.exists(gguf_path):
        return jsonify(APIResponse.error(
            message=f"Model '{model_name}' GGUF file does not exist, please convert model first",
            code=400
        ))

    # Check if service is already running
    # Start the server using the LocalLLMService with GPU acceleration if requested
    success = local_llm_service.start_server(gguf_path, use_gpu=use_gpu)

    if not success:
        return jsonify(APIResponse.error(message="Failed to start llama-server", code=500))

    # Get updated service status
    status = local_llm_service.get_server_status()
    if status.is_running:
        return jsonify(
            APIResponse.error(
                message=f"llama-server is already running, PID: {status.process_info.pid}",
                code=400
            )
        )

    # Build parameters
    args = [server_path, "-m", gguf_path, "--port", "8080"]

    # Use thread to start service asynchronously
    def start_server():
        script_executor.execute(
            script_path=server_path,
            script_type="llama_server",
            args=args[1:],  # Remove first parameter (executable file path)
            shell=False,
        )

    # Start new thread to run service
    from threading import Thread

    thread = Thread(target=start_server)
    thread.daemon = True
    thread.start()

    # Return start status immediately

    # Return success response with GPU info
    gpu_info = "with GPU acceleration" if use_gpu and torch.cuda.is_available() else "with CPU only"
    return jsonify(
        APIResponse.success(
            data={
                "model_name": model_name,
                "gguf_path": gguf_path,
                "status": "starting"
                "status": "running" if status.is_running else "starting",
                "use_gpu": use_gpu and torch.cuda.is_available(),
                "gpu_info": gpu_info
            },
            message="llama-server service is starting"
            message=f"llama-server service started {gpu_info}"
        )
    )
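
A request to this route only needs the model name plus the optional GPU flag; the body below is illustrative (the URL prefix under which kernel2_bp is mounted is not shown in this diff):

# Hypothetical request body for start_llama_server; values are examples.
payload = {
    "model_name": "Qwen2.5-0.5B-Instruct",  # example model
    "use_gpu": True,                        # optional, defaults to True
}
# The route falls back to CPU automatically when CUDA is absent:
# effective_gpu = payload["use_gpu"] and torch.cuda.is_available()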
@@ -805,3 +836,31 @@ def chat(body: ChatRequest):
        if not getattr(body, 'stream', True):  # Default to stream if attribute missing
            return jsonify(error_response), 500
        return local_llm_service.handle_stream_response(iter([error_response]))


@kernel2_bp.route("/cuda/available", methods=["GET"])
def check_cuda_available():
    """Check if CUDA is available for model training/inference"""
    try:
        import torch
        cuda_available = torch.cuda.is_available()
        cuda_info = {}

        if cuda_available:
            cuda_info = {
                "device_count": torch.cuda.device_count(),
                "current_device": torch.cuda.current_device(),
                "device_name": torch.cuda.get_device_name(0)
            }

        return jsonify(APIResponse.success(
            data={
                "cuda_available": cuda_available,
                "cuda_info": cuda_info
            },
            message="CUDA availability check completed"
        ))
    except Exception as e:
        error_msg = f"Error checking CUDA availability: {str(e)}"
        logger.error(error_msg)
        return jsonify(APIResponse.error(message=error_msg, code=500))
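
Clients can call this endpoint before deciding whether to request CUDA training; a sketch, assuming the backend is reachable on localhost and kernel2_bp is mounted under /api/kernel2 (both assumptions, as is the response envelope shape):

import requests

# Hypothetical URL; adjust host, port and prefix to your deployment.
resp = requests.get("http://localhost:8002/api/kernel2/cuda/available")
info = resp.json()
use_cuda = bool(info.get("data", {}).get("cuda_available"))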
@@ -24,6 +24,7 @@ def start_process():
        number_of_epochs: Number of training epochs (optional)
        concurrency_threads: Number of threads for concurrent processing (optional)
        data_synthesis_mode: Mode for data synthesis (optional)
        use_cuda: Whether to use CUDA for training (optional)

    Includes the following steps:
    1. Health check

@@ -63,6 +64,7 @@ def start_process():
    number_of_epochs = data.get("number_of_epochs", None)
    concurrency_threads = data.get("concurrency_threads", None)
    data_synthesis_mode = data.get("data_synthesis_mode", None)
    use_cuda = data.get("use_cuda", False)  # Default to False if not provided
    is_cot = data.get("is_cot", None)

    # Log the received parameters

@@ -90,6 +92,7 @@ def start_process():
        "number_of_epochs": number_of_epochs,
        "concurrency_threads": concurrency_threads,
        "data_synthesis_mode": data_synthesis_mode,
        "use_cuda": use_cuda,  # Make sure to include use_cuda parameter
        "is_cot": is_cot
    }

@@ -113,6 +116,7 @@ def start_process():
            "number_of_epochs": number_of_epochs,
            "concurrency_threads": concurrency_threads,
            "data_synthesis_mode": data_synthesis_mode,
            "use_cuda": use_cuda,  # Include in response
            "is_cot": is_cot
        }
    )
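
A start_process request that opts into CUDA training might look like this (only the keys are taken from the code above; the values are examples):

# Illustrative payload for start_process.
payload = {
    "number_of_epochs": 3,
    "concurrency_threads": 2,
    "data_synthesis_mode": "low",
    "use_cuda": True,   # only effective if the backend was built with CUDA support
    "is_cot": False,
}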
@@ -4,6 +4,7 @@ import logging
import psutil
import time
import subprocess
import torch  # Add torch import for CUDA detection
import threading
import queue
from typing import Iterator, Any, Optional, Generator, Dict

@@ -38,9 +39,16 @@ class LocalLLMService:
        )
        return self._client

    def start_server(self, model_path: str) -> bool:
    def start_server(self, model_path: str, use_gpu: bool = True) -> bool:
        """
        Start the llama-server service
        Start the llama-server service with GPU acceleration when available

        Args:
            model_path: Path to the GGUF model file
            use_gpu: Whether to use GPU acceleration if available

        Returns:
            bool: True if server started successfully, False otherwise
        """
        try:
            # Check if server is already running

@@ -49,27 +57,163 @@ class LocalLLMService:
                logger.info("LLama server is already running")
                return True

            # Start server
            # Check for CUDA availability if GPU was requested
            cuda_available = torch.cuda.is_available() if use_gpu else False
            gpu_info = ""

            if use_gpu and cuda_available:
                gpu_device = torch.cuda.current_device()
                gpu_info = f" using GPU: {torch.cuda.get_device_name(gpu_device)}"
                gpu_memory = torch.cuda.get_device_properties(gpu_device).total_memory / (1024**3)

                logger.info(f"CUDA is available. Using GPU acceleration{gpu_info}")
                logger.info(f"CUDA device capabilities: {torch.cuda.get_device_capability(gpu_device)}")
                logger.info(f"CUDA memory: {gpu_memory:.2f} GB")

                # Pre-initialize CUDA to speed up first inference
                logger.info("Pre-initializing CUDA context to speed up first inference")
                torch.cuda.init()
                torch.cuda.empty_cache()
            elif use_gpu and not cuda_available:
                logger.warning("CUDA was requested but is not available. Using CPU instead.")
            else:
                logger.info("Using CPU for inference (GPU not requested)")

            # Check for GPU optimization marker
            gpu_optimized = False
            model_dir = os.path.dirname(model_path)
            gpu_marker_path = os.path.join(model_dir, "gpu_optimized.json")
            if os.path.exists(gpu_marker_path):
                try:
                    with open(gpu_marker_path, 'r') as f:
                        gpu_data = json.load(f)
                        if gpu_data.get("gpu_optimized", False):
                            gpu_optimized = True
                            logger.info(f"Found GPU optimization marker created on {gpu_data.get('optimized_on', 'unknown date')}")
                except Exception as e:
                    logger.warning(f"Error reading GPU marker file: {e}")

            # Get the correct path to the llama-server executable
            base_dir = os.getcwd()
            server_path = os.path.join(base_dir, "llama.cpp", "build", "bin", "llama-server")

            # For Windows, add .exe extension if needed
            if os.name == 'nt' and not server_path.endswith('.exe'):
                server_path += '.exe'

            # Verify executable exists
            if not os.path.exists(server_path):
                logger.error(f"llama-server executable not found at: {server_path}")
                return False

            # Start server with optimal parameters for faster startup
            cmd = [
                "llama-server",
                server_path,
                "-m", model_path,
                "--host", "0.0.0.0",
                "--port", "8000"
                "--port", "8080",
                "--ctx-size", "2048",  # Default context size (adjust based on needs)
                "--parallel", "2",  # Enable request parallelism
                "--cont-batching"  # Enable continuous batching
            ]

            # Set up environment with CUDA variables to ensure GPU detection
            env = os.environ.copy()

            # Add GPU-related parameters if CUDA is available
            if cuda_available and use_gpu:
                # Force GPU usage with optimal parameters for faster loads
                cmd.extend([
                    "--n-gpu-layers", "999",  # Use all layers on GPU
                    "--tensor-split", "0",  # Use the first GPU for all operations
                    "--main-gpu", "0",  # Use GPU 0 as the primary device
                    "--mlock"  # Lock memory to prevent swapping during inference
                ])

                # Set CUDA environment variables to help with GPU detection
                env["CUDA_VISIBLE_DEVICES"] = "0"  # Force using first GPU

                # Ensure comprehensive library paths for CUDA
                cuda_lib_paths = [
                    "/usr/local/cuda/lib64",
                    "/usr/lib/cuda/lib64",
                    "/usr/local/lib",
                    "/usr/lib/x86_64-linux-gnu",
                    "/usr/lib/wsl/lib"  # For Windows WSL environments
                ]

                # Build a comprehensive LD_LIBRARY_PATH
                current_ld_path = env.get("LD_LIBRARY_PATH", "")
                for path in cuda_lib_paths:
                    if os.path.exists(path) and path not in current_ld_path:
                        current_ld_path = f"{path}:{current_ld_path}" if current_ld_path else path

                env["LD_LIBRARY_PATH"] = current_ld_path
                logger.info(f"Setting LD_LIBRARY_PATH to: {current_ld_path}")

                # If this is Windows, use different approach for CUDA libraries
                if os.name == 'nt':
                    # Windows typically has CUDA in PATH already if installed
                    logger.info("Windows system detected, using system CUDA libraries")
                else:
                    # On Linux, try to find CUDA libraries in common locations
                    for cuda_path in [
                        # Common CUDA paths
                        "/usr/local/cuda/lib64",
                        "/usr/lib/cuda/lib64",
                        "/usr/local/lib/python3.12/site-packages/nvidia/cuda_runtime/lib",
                        "/usr/local/lib/python3.10/site-packages/nvidia/cuda_runtime/lib",
                    ]:
                        if os.path.exists(cuda_path):
                            # Add CUDA path to library path
                            env["LD_LIBRARY_PATH"] = f"{cuda_path}:{env.get('LD_LIBRARY_PATH', '')}"
                            env["CUDA_HOME"] = os.path.dirname(cuda_path)
                            logger.info(f"Found CUDA at {cuda_path}, setting environment variables")
                            break

                # NOTE: CUDA support and rebuild should be handled at build/setup time (e.g., Docker build or setup script).
                # The runtime check and rebuild logic has been removed for efficiency and reliability.
                # Ensure llama.cpp is built with CUDA support before running the server if GPU is required.

                # Pre-heat GPU to ensure faster initial response
                if torch.cuda.is_available():
                    logger.info("Pre-warming GPU to reduce initial latency...")
                    dummy_tensor = torch.zeros(1, 1).cuda()
                    del dummy_tensor
                    torch.cuda.synchronize()
                    torch.cuda.empty_cache()
                    logger.info("GPU warm-up complete")

                logger.info("Using GPU acceleration for inference with optimized settings")
            else:
                # If GPU isn't available or supported, optimize for CPU
                cmd.extend([
                    "--threads", str(max(1, os.cpu_count() - 1)),  # Use all CPU cores except one
                ])
                logger.info(f"Using CPU-only mode with {max(1, os.cpu_count() - 1)} threads")

            logger.info(f"Starting llama-server with command: {' '.join(cmd)}")

            process = subprocess.Popen(
                cmd,
                stdout=subprocess.PIPE,
                stderr=subprocess.PIPE,
                universal_newlines=True
                universal_newlines=True,
                env=env
            )

            # Wait for server to start
            time.sleep(2)
            # Wait for server to start (longer wait for GPU initialization)
            wait_time = 5 if cuda_available and use_gpu else 3
            logger.info(f"Waiting {wait_time} seconds for server to start...")
            time.sleep(wait_time)

            # Check if process started successfully
            # Check if process is still running
            if process.poll() is None:
                logger.info("LLama server started successfully")
                # Log initialization success
                if cuda_available and use_gpu:
                    logger.info(f"✅ LLama server started successfully with GPU acceleration{gpu_info}")
                else:
                    logger.info("✅ LLama server started successfully in CPU-only mode")
                return True
            else:
                stdout, stderr = process.communicate()
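
Once the process is up, a lightweight readiness probe can replace the fixed sleep. A sketch only, assuming llama-server is reachable on the port passed above and exposes its usual /health endpoint:

import time
import requests

def wait_for_llama_server(port: int = 8080, timeout: float = 60.0) -> bool:
    # Poll the server instead of sleeping a fixed number of seconds.
    deadline = time.time() + timeout
    while time.time() < deadline:
        try:
            if requests.get(f"http://127.0.0.1:{port}/health", timeout=2).status_code == 200:
                return True
        except requests.RequestException:
            pass
        time.sleep(1)
    return False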

@@ -151,10 +295,15 @@ class LocalLLMService:
        Returns: ServerStatus object
        """
        try:
            base_dir = os.getcwd()
            server_path = os.path.join(base_dir, "llama.cpp", "build", "bin", "llama-server")
            server_exec_name = os.path.basename(server_path)

            for proc in psutil.process_iter(["pid", "name", "cmdline"]):
                try:
                    cmdline = proc.cmdline()
                    if any("llama-server" in cmd for cmd in cmdline):
                    # Check both for the executable name and the full path
                    if any(server_exec_name in cmd for cmd in cmdline) or any("llama-server" in cmd for cmd in cmdline):
                        with proc.oneshot():
                            process_info = ProcessInfo(
                                pid=proc.pid,
@@ -6,6 +6,7 @@ from .api.file_server.handler import FileServerHandler
from .database.migration_manager import MigrationManager
import os
import atexit
import subprocess


def create_app():
@@ -7,6 +7,13 @@ from lpm_kernel.configs.logging import get_train_process_logger
logger = get_train_process_logger()
import lpm_kernel.common.strategy.classification as classification
from sentence_transformers import SentenceTransformer
import json


class EmbeddingError(Exception):
    """Custom exception class for embedding-related errors"""
    def __init__(self, message, original_error=None):
        super().__init__(message)
        self.original_error = original_error


class LLMClient:
    """LLM client utility class"""

@@ -55,10 +62,8 @@ class LLMClient:

        user_llm_config = self.user_llm_config_service.get_available_llm()
        if not user_llm_config:

            raise Exception("No LLM configuration found")
        # Prepare request data

            raise EmbeddingError("No LLM configuration found")

        try:
            # Send request to embedding endpoint
            embeddings_array = classification.strategy_classification(user_llm_config, chunked_texts)

@@ -81,7 +86,25 @@ class LLMClient:
            return embeddings_array

        except requests.exceptions.RequestException as e:
            raise Exception(f"Failed to get embeddings: {str(e)}")
            # Handle request errors
            error_msg = f"Request error getting embeddings: {str(e)}"
            logger.error(error_msg)
            raise EmbeddingError(error_msg, e)
        except json.JSONDecodeError as e:
            # Handle JSON parsing errors
            error_msg = f"Invalid JSON response from embedding API: {str(e)}"
            logger.error(error_msg)
            raise EmbeddingError(error_msg, e)
        except (KeyError, IndexError, ValueError) as e:
            # Handle response structure errors
            error_msg = f"Invalid response structure from embedding API: {str(e)}"
            logger.error(error_msg)
            raise EmbeddingError(error_msg, e)
        except Exception as e:
            # Fallback for any other errors
            error_msg = f"Unexpected error getting embeddings: {str(e)}"
            logger.error(error_msg, exc_info=True)
            raise EmbeddingError(error_msg, e)
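
EmbeddingError keeps the original exception around, which matters because the Ollama embedding failure was raised with keyword arguments that the built-in Exception cannot accept. A minimal consumption sketch (the get_embeddings call is a hypothetical caller):

try:
    embeddings = client.get_embeddings(chunked_texts)  # hypothetical caller
except EmbeddingError as err:
    logger.error(f"Embedding failed: {err}")
    if err.original_error is not None:
        logger.debug(f"Underlying error: {err.original_error!r}")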

    @property
    def chat_credentials(self):
@@ -148,6 +148,37 @@ class DocumentService:
            self._update_analyze_status_failed(doc.id)
            raise

    def analyze_document(self, document_id: int) -> DocumentDTO:
        """
        Analyze a single document by ID

        Args:
            document_id (int): ID of document to analyze

        Returns:
            DocumentDTO: The analyzed document

        Raises:
            ValueError: If document not found
            Exception: If analysis fails
        """
        try:
            # Get document
            document = self._repository.find_one(document_id)
            if not document:
                raise ValueError(f"Document not found with id: {document_id}")

            # Perform analysis
            return self._analyze_document(document)

        except ValueError as e:
            logger.error(f"Document {document_id} not found: {str(e)}")
            raise
        except Exception as e:
            logger.error(f"Error analyzing document {document_id}: {str(e)}", exc_info=True)
            self._update_analyze_status_failed(document_id)
            raise

    def _update_analyze_status_failed(self, doc_id: int) -> None:
        """update status as failed"""
        try:
@@ -593,6 +624,120 @@ class DocumentService:
            logger.error(f"Error deleting file: {str(e)}", exc_info=True)
            raise

    def fix_missing_document_analysis(self) -> int:
        """Fix documents with missing insights or summaries

        Returns:
            int: Number of documents fixed
        """
        try:
            # Find all documents that have analysis issues
            docs = self._repository.list()
            fixed_count = 0

            for doc in docs:
                needs_fixing = False

                # Check if document needs analysis
                if not doc.analyze_status or doc.analyze_status != ProcessStatus.SUCCESS:
                    needs_fixing = True
                    logger.info(f"Document {doc.id} needs analysis (status: {doc.analyze_status})")

                # Check if document has missing insights or summaries
                elif not doc.insight or not doc.summary:
                    needs_fixing = True
                    logger.info(f"Document {doc.id} has missing insight or summary")

                # Process documents that need fixing
                if needs_fixing:
                    try:
                        # Process document analysis
                        self.analyze_document(doc.id)
                        fixed_count += 1
                        logger.info(f"Fixed document {doc.id} analysis")
                    except Exception as e:
                        logger.error(f"Error fixing document {doc.id} analysis: {str(e)}")

            logger.info(f"Fixed {fixed_count} documents with missing analysis")
            return fixed_count

        except Exception as e:
            logger.error(f"Error in fix_missing_document_analysis: {str(e)}")
            raise FileProcessingError(f"Failed to fix document analysis: {str(e)}")

    def verify_document_embeddings(self, verbose=True) -> Dict:
        """
        Verify all document embeddings and return statistics

        Args:
            verbose (bool): Whether to log detailed information

        Returns:
            Dict: Statistics about document embeddings
        """
        try:
            docs = self._repository.list()
            results = {
                "total_documents": len(docs),
                "documents_with_embedding": 0,
                "documents_without_embedding": 0,
                "documents_with_content": 0,
                "documents_without_content": 0,
                "documents_with_summary": 0,
                "documents_without_summary": 0,
                "documents_with_insight": 0,
                "documents_without_insight": 0,
                "documents_needing_repair": 0,
            }

            documents_needing_repair = []

            for doc in docs:
                # Check if document has content
                if doc.raw_content:
                    results["documents_with_content"] += 1
                else:
                    results["documents_without_content"] += 1

                # Check if document has summary
                if doc.summary:
                    results["documents_with_summary"] += 1
                else:
                    results["documents_without_summary"] += 1

                # Check if document has insight
                if doc.insight:
                    results["documents_with_insight"] += 1
                else:
                    results["documents_without_insight"] += 1

                # Check if embeddings exist in ChromaDB
                embedding = self.get_document_embedding(doc.id)
                if embedding is not None:
                    results["documents_with_embedding"] += 1
                    if verbose:
                        logger.info(f"Document {doc.id}: '{doc.name}' has embedding of dimension {len(embedding)}")
                else:
                    results["documents_without_embedding"] += 1
                    if verbose:
                        logger.warning(f"Document {doc.id}: '{doc.name}' missing embedding")

                # Check if document needs repair (has content but missing embedding or analysis)
                if doc.raw_content and (embedding is None or not doc.summary or not doc.insight):
                    documents_needing_repair.append(doc.id)
                    results["documents_needing_repair"] += 1

            # Log statistics
            logger.info(f"Document embedding verification results: {results}")
            if documents_needing_repair and verbose:
                logger.info(f"Documents needing repair: {documents_needing_repair}")

            return results

        except Exception as e:
            logger.error(f"Error verifying document embeddings: {str(e)}", exc_info=True)
            raise


# create service
document_service = DocumentService()
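
A sketch of driving the two new service methods directly, mirroring what the /documents/repair route does without the HTTP layer:

fixed = document_service.fix_missing_document_analysis()
stats = document_service.verify_document_embeddings(verbose=False)
print(f"Fixed analysis for {fixed} documents; "
      f"{stats['documents_without_embedding']} still missing embeddings")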
@@ -418,4 +418,4 @@ class EmbeddingService:
            raise
        except Exception as e:
            logger.error(f"Error searching similar chunks: {str(e)}")
            raise
            raise
@@ -23,6 +23,7 @@ class TrainingParamsManager:
            "number_of_epochs": 3,
            "concurrency_threads": 2,
            "data_synthesis_mode": "low",
            "use_cuda": False,  # Default to CPU; set to True to train with CUDA
            "is_cot": False
        }
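
For illustration only (the manager's merge logic is not shown in this diff), request parameters would typically override these defaults like so:

defaults = {
    "number_of_epochs": 3,
    "concurrency_threads": 2,
    "data_synthesis_mode": "low",
    "use_cuda": False,
    "is_cot": False,
}
request_params = {"use_cuda": True, "number_of_epochs": 5}  # example input
effective = {**defaults, **{k: v for k, v in request_params.items() if v is not None}}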
@@ -42,7 +42,7 @@ class TrainProcessService:

    _instance = None
    _initialized = False

    def __new__(cls, *args, **kwargs):
        if cls._instance is None:
            cls._instance = super().__new__(cls)

@@ -633,6 +633,7 @@ class TrainProcessService:
        num_train_epochs = training_params.get("number_of_epochs")
        concurrency_threads = training_params.get("concurrency_threads")
        data_synthesis_mode = training_params.get("data_synthesis_mode")
        use_cuda = training_params.get("use_cuda", False)
        is_cot = training_params.get("is_cot", False)

        # Log training parameters

@@ -641,6 +642,8 @@ class TrainProcessService:
        logger.info(f"  Number of epochs: {num_train_epochs}")
        logger.info(f"  Concurrency threads: {concurrency_threads}")
        logger.info(f"  Data synthesis mode: {data_synthesis_mode}")
        logger.info(f"  Use CUDA: {use_cuda}")
        logger.info(f"  Is CoT: {is_cot}")

        # Prepare arguments for the script
        # Build command line arguments, need to include script path as the first parameter

@@ -650,6 +653,7 @@ class TrainProcessService:
            "--epochs", str(num_train_epochs),
            "--threads", str(concurrency_threads),
            "--mode", str(data_synthesis_mode),
            "--cuda", str(use_cuda),
            "--is_cot", str(is_cot)
        ]
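
Note that str(use_cuda) hands the shell script the literal strings "True" or "False", so any consumer should normalize the text instead of treating a non-empty value as truthy. A small sketch of that convention:

def parse_bool_flag(value: str) -> bool:
    # Accept the exact strings Python's str(bool) produces, plus common variants.
    return str(value).strip().lower() in ("true", "1", "yes", "y")

assert parse_bool_flag("True") and not parse_bool_flag("False")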
@@ -0,0 +1,69 @@
@echo off
REM Script to prompt user for CUDA support preference

echo === CUDA Support Selection ===
echo.
echo Do you want to build with NVIDIA GPU (CUDA) support?
echo This requires an NVIDIA GPU and proper NVIDIA Docker runtime configuration.
echo.
set /p choice="Build with CUDA support? (y/n): "

if /i "%choice%"=="y" goto cuda
if /i "%choice%"=="yes" goto cuda
goto nocuda

:cuda
echo Selected: Build WITH CUDA support

REM Create or update .env file with the Dockerfile selection
if exist .env (
    REM Check if variable already exists in file
    findstr /c:"DOCKER_BACKEND_DOCKERFILE" .env >nul
    if %ERRORLEVEL% EQU 0 (
        REM Update existing variable
        powershell -Command "(Get-Content .env) -replace '^DOCKER_BACKEND_DOCKERFILE=.*', 'DOCKER_BACKEND_DOCKERFILE=Dockerfile.backend.cuda' | Set-Content .env"
    ) else (
        REM Append to file with newline before
        echo.>> .env
        echo DOCKER_BACKEND_DOCKERFILE=Dockerfile.backend.cuda>> .env
    )
) else (
    REM Create new file
    echo DOCKER_BACKEND_DOCKERFILE=Dockerfile.backend.cuda> .env
)

REM Create a flag file to indicate GPU use
echo GPU > .gpu_selected

echo Environment set to build with CUDA support
goto end

:nocuda
echo Selected: Build WITHOUT CUDA support (CPU only)

REM Create or update .env file with the Dockerfile selection
if exist .env (
    REM Check if variable already exists in file
    findstr /c:"DOCKER_BACKEND_DOCKERFILE" .env >nul
    if %ERRORLEVEL% EQU 0 (
        REM Update existing variable
        powershell -Command "(Get-Content .env) -replace '^DOCKER_BACKEND_DOCKERFILE=.*', 'DOCKER_BACKEND_DOCKERFILE=Dockerfile.backend' | Set-Content .env"
    ) else (
        REM Append to file with newline before
        echo.>> .env
        echo DOCKER_BACKEND_DOCKERFILE=Dockerfile.backend>> .env
    )
) else (
    REM Create new file
    echo DOCKER_BACKEND_DOCKERFILE=Dockerfile.backend> .env
)

REM Remove any GPU flag file if it exists
if exist .gpu_selected (
    del .gpu_selected
)

echo Environment set to build without CUDA support

:end
echo === CUDA Selection Complete ===
@@ -0,0 +1,62 @@
#!/bin/bash
# Script to prompt user for CUDA support preference and directly build with the appropriate Dockerfile

echo "=== CUDA Support Selection ==="
echo ""
echo "Do you want to build with NVIDIA GPU (CUDA) support?"
echo "This requires an NVIDIA GPU and proper NVIDIA Docker runtime configuration."
echo ""
read -p "Build with CUDA support? (y/n): " choice

case "$choice" in
    y|Y|yes|YES|Yes )
        echo "Selected: Build WITH CUDA support"

        # Create or update .env file with the Dockerfile selection
        if [ -f .env ]; then
            # Update existing file
            if grep -q "DOCKER_BACKEND_DOCKERFILE" .env; then
                sed -i 's/^DOCKER_BACKEND_DOCKERFILE=.*/DOCKER_BACKEND_DOCKERFILE=Dockerfile.backend.cuda/' .env
            else
                # Add a newline before appending new content
                echo "" >> .env
                echo "DOCKER_BACKEND_DOCKERFILE=Dockerfile.backend.cuda" >> .env
            fi
        else
            # Create new file
            echo "DOCKER_BACKEND_DOCKERFILE=Dockerfile.backend.cuda" > .env
        fi

        # Create a flag file to indicate GPU use
        echo "GPU" > .gpu_selected

        echo "Environment set to build with CUDA support"
        ;;
    * )
        echo "Selected: Build WITHOUT CUDA support (CPU only)"

        # Create or update .env file with the Dockerfile selection
        if [ -f .env ]; then
            # Update existing file
            if grep -q "DOCKER_BACKEND_DOCKERFILE" .env; then
                sed -i 's/^DOCKER_BACKEND_DOCKERFILE=.*/DOCKER_BACKEND_DOCKERFILE=Dockerfile.backend/' .env
            else
                # Add a newline before appending new content
                echo "" >> .env
                echo "DOCKER_BACKEND_DOCKERFILE=Dockerfile.backend" >> .env
            fi
        else
            # Create new file
            echo "DOCKER_BACKEND_DOCKERFILE=Dockerfile.backend" > .env
        fi

        # Remove any GPU flag file if it exists
        if [ -f .gpu_selected ]; then
            rm .gpu_selected
        fi

        echo "Environment set to build without CUDA support"
        ;;
esac

echo "=== CUDA Selection Complete ==="
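
The Makefile and build scripts consume the markers written above; a rough Python equivalent of that check, for illustration only (the file names are the ones used by these scripts):

import os

def cuda_selected(env_path: str = ".env", flag_path: str = ".gpu_selected") -> bool:
    # GPU builds are signalled either by the flag file or by the Dockerfile choice in .env.
    if os.path.exists(flag_path):
        return True
    if os.path.exists(env_path):
        with open(env_path) as f:
            return any(line.strip() == "DOCKER_BACKEND_DOCKERFILE=Dockerfile.backend.cuda" for line in f)
    return False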