diff --git a/.github/workflows/installation-validation.yml b/.github/workflows/installation-validation.yml
new file mode 100644
--- /dev/null
+++ b/.github/workflows/installation-validation.yml
@@ -0,0 +1,132 @@
+name: Installation Validation
+
+on:
+  push:
+    branches: [ none ]
+    paths:
+      - 'examples/env/scripts/setup-pip-deps.sh'
+      - 'docs/tutorial/installation.md'
+      - 'examples/env/validate_installation.py'
+      - 'setup.py'
+      - 'requirements*.txt'
+      - '.github/workflows/installation-validation.yml'
+  pull_request:
+    branches: [ none ]
+    paths:
+      - 'examples/env/scripts/setup-pip-deps.sh'
+      - 'docs/tutorial/installation.md'
+      - 'examples/env/validate_installation.py'
+      - 'setup.py'
+      - 'requirements*.txt'
+      - '.github/workflows/installation-validation.yml'
+  workflow_dispatch:
+
+jobs:
+  validate-installation:
+    runs-on: ubuntu-latest
+
+    steps:
+    - name: Checkout repository
+      uses: actions/checkout@v4
+      with:
+        lfs: true
+
+    - name: Set up SSH key
+      run: |
+        mkdir -p ~/.ssh
+        echo "${{ secrets.REMOTE_SSH_KEY }}" > ~/.ssh/id_rsa
+        chmod 600 ~/.ssh/id_rsa
+        ssh-keyscan -p 8107 101.6.96.205 >> ~/.ssh/known_hosts
+
+    - name: Synchronize repository to remote machine
+      run: |
+        # Use rsync to synchronize repository to remote machine
+        rsync -avz --delete \
+          --exclude='.git' \
+          --exclude='__pycache__' \
+          --exclude='*.pyc' \
+          --exclude='*.pyo' \
+          --exclude='*.egg-info' \
+          --exclude='build/' \
+          --exclude='dist/' \
+          --exclude='.pytest_cache' \
+          --exclude='.coverage' \
+          --exclude='*.so' \
+          --exclude='*.dylib' \
+          --exclude='node_modules/' \
+          --exclude='.env' \
+          --exclude='.venv' \
+          -e 'ssh -p 8107' . fuwei@101.6.96.205:/tmp/areal-validation/
+
+    - name: Run installation validation on remote machine
+      run: |
+        ssh -p 8107 fuwei@101.6.96.205 << 'EOF'
+        set -e
+
+        # Navigate to the synchronized repository
+        cd /tmp/areal-validation
+
+        # Create persistent pip cache directory
+        mkdir -p /tmp/pip-cache
+
+        # Generate a unique container name
+        CONTAINER_NAME="areal-validation-$(date +%s)"
+
+        # Always remove the container and the synced checkout on exit, so a
+        # step that fails under set -e does not leak a running GPU container.
+        cleanup() {
+          docker rm -f "$CONTAINER_NAME" >/dev/null 2>&1 || true
+          cd ~
+          rm -rf /tmp/areal-validation
+        }
+        trap cleanup EXIT
+
+        # Stop and remove any existing container with the same name
+        docker stop $CONTAINER_NAME 2>/dev/null || true
+        docker rm $CONTAINER_NAME 2>/dev/null || true
+
+        echo "=== Starting Docker container ==="
+        # Launch Docker container with NVIDIA PyTorch image
+        docker run -d \
+          --name $CONTAINER_NAME \
+          --gpus all \
+          --shm-size=8g \
+          -v $(pwd):/workspace \
+          -v /tmp/pip-cache:/root/.cache/pip \
+          -w /workspace \
+          nvcr.io/nvidia/pytorch:25.01-py3 \
+          sleep infinity
+
+        echo "=== Verifying CUDA environment in container ==="
+        docker exec $CONTAINER_NAME nvidia-smi
+        docker exec $CONTAINER_NAME nvcc --version
+
+        echo "=== Verifying workspace contents ==="
+        docker exec $CONTAINER_NAME pwd
+        docker exec $CONTAINER_NAME ls -la /workspace
+        docker exec $CONTAINER_NAME ls -la /workspace/examples/env/ || echo "examples/env directory not found"
+
+        echo "=== Checking pip cache before installation ==="
+        du -sh /tmp/pip-cache 2>/dev/null || echo "Cache directory empty"
+
+        echo "=== Installing dependencies ==="
+        docker exec $CONTAINER_NAME bash -c "
+          set -e
+          python -m pip install --upgrade pip
+          pip config set global.index-url https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple
+          pip config unset global.extra-index-url || true
+          # Run the installation script
+          bash examples/env/scripts/setup-pip-deps.sh
+          python examples/env/validate_installation.py
+        "
+
+        echo "=== Checking pip cache after installation ==="
+        du -sh /tmp/pip-cache 2>/dev/null || echo "Cache directory still empty"
+
+        echo "=== Installation validation completed successfully ==="
+        EOF
+
+    - name: Cleanup SSH key
+      if: always()
+      run: |
+        rm -f ~/.ssh/id_rsa
\ No newline at end of file
diff --git a/examples/env/scripts/setup-pip-deps.sh b/examples/env/scripts/setup-pip-deps.sh
index 65ee55b..8fa203b 100644
--- a/examples/env/scripts/setup-pip-deps.sh
+++ b/examples/env/scripts/setup-pip-deps.sh
@@ -1,11 +1,12 @@
 #!/bin/bash
 # basic dependencies
 pip install -U pip
-pip uninstall deepspeed flash-attn pynvml cugraph-dgl dask-cuda cugraph-service-server raft-dask cugraph cuml cugraph-pyg -y
+pip uninstall torch deepspeed flash-attn pynvml cugraph-dgl dask-cuda cugraph-service-server raft-dask cugraph cuml cugraph-pyg -y
+pip install torch==2.6.0 torchvision==0.21.0 torchaudio==2.6.0
 pip install "sglang[all]==0.4.6.post4"
 pip install megatron-core==0.11.0 nvidia-ml-py
 pip install git+https://github.com/garrett4wade/cugae --no-build-isolation --verbose
-pip install flash-attn --no-build-isolation
+pip install "flash-attn<=2.7.3" --no-build-isolation
 
 # Package used for calculating math reward
 pip install -e evaluation/latex2sympy
diff --git a/examples/env/validate_installation.py b/examples/env/validate_installation.py
new file mode 100644
--- /dev/null
+++ b/examples/env/validate_installation.py
@@ -0,0 +1,273 @@
+#!/usr/bin/env python3
+"""
+Installation Validation Script for AReaL
+
+This script validates that all critical dependencies are properly installed
+and can be imported successfully. It's designed to be run in CI to validate
+the installation procedure described in docs/tutorial/installation.md.
+"""
+
+import importlib
+import sys
+import traceback
+import warnings
+from importlib.metadata import version as get_version
+from typing import Any, Callable, Dict, List, Optional
+
+from packaging.version import Version
+
+
+class InstallationValidator:
+    """Import-check AReaL's dependencies and collect the outcomes."""
+
+    def __init__(self):
+        # module name -> {"status": ..., "error": ...} for each import test
+        self.results = {}
+        # Failed required checks; any entry makes the whole run fail
+        self.critical_failures = []
+        # Failed optional checks; reported in the summary but non-fatal
+        self.warnings = []
+
+    def test_import(self, module_name: str, required: bool = True,
+                    test_func: Optional[Callable[[Any], None]] = None) -> bool:
+        """Import ``module_name`` and optionally run ``test_func`` on it.
+
+        Args:
+            module_name: Importable module name (case-sensitive).
+            required: Record a failure as critical if True, else as a warning.
+            test_func: Optional extra check called with the imported module;
+                it signals failure by raising.
+
+        Returns:
+            True if the import and the extra check both succeeded.
+        """
+        try:
+            module = importlib.import_module(module_name)
+
+            # Run additional test if provided
+            if test_func:
+                test_func(module)
+
+            self.results[module_name] = {"status": "SUCCESS", "error": None}
+            print(f"✓ {module_name}")
+            return True
+
+        except Exception as e:
+            # ImportError means the package is missing or broken ("FAILED");
+            # anything else came from the import body or test_func ("ERROR").
+            is_import_error = isinstance(e, ImportError)
+            status = "FAILED" if is_import_error else "ERROR"
+            suffix = "" if is_import_error else " ERROR"
+            self.results[module_name] = {"status": status, "error": str(e)}
+            if required:
+                self.critical_failures.append(f"{module_name}: {e}")
+                print(f"✗ {module_name} (CRITICAL{suffix}): {e}")
+            else:
+                self.warnings.append(f"{module_name}: {e}")
+                print(f"⚠ {module_name} (OPTIONAL{suffix}): {e}")
+            return False
+
+    def _require_version(self, package: str, expected: str) -> None:
+        """Raise RuntimeError unless ``package`` is installed at ``expected``."""
+        installed = get_version(package)
+        if Version(installed) != Version(expected):
+            raise RuntimeError(
+                f"{package} version mismatch: expected {expected}, found {installed}"
+            )
+
+    def test_torch_cuda(self, torch_module):
+        """Test PyTorch CUDA availability."""
+        if not torch_module.cuda.is_available():
+            raise RuntimeError("CUDA is not available in PyTorch")
+        print(f" - CUDA devices: {torch_module.cuda.device_count()}")
+        print(f" - CUDA version: {torch_module.version.cuda}")
+
+    def test_flash_attn_functionality(self, flash_attn_module):
+        """Test flash attention functionality."""
+        # Try to import key functions
+        from flash_attn import flash_attn_func, flash_attn_varlen_func
+        print(" - Flash attention functions imported successfully")
+
+    def test_vllm_functionality(self, vllm_module):
+        """Test vLLM basic functionality."""
+        from vllm import LLM, SamplingParams
+        print(" - vLLM core classes imported successfully")
+
+    def test_sglang_functionality(self, sglang_module):
+        """Test SGLang basic functionality."""
+        # Basic import test is sufficient for CI
+        import sgl_kernel
+        from sglang import launch_server
+        # Explicit check instead of assert, which is stripped under -O
+        self._require_version("sglang", "0.4.6.post4")
+        print(" - SGLang imported successfully")
+
+    def test_transformers(self, transformers_module):
+        """Check that the pinned transformers release is installed."""
+        self._require_version("transformers", "4.51.1")
+        print(" - transformers imported successfully")
+
+    def validate_critical_dependencies(self):
+        """Validate critical dependencies that must be present."""
+        print("\n=== Testing Critical Dependencies ===")
+
+        # Core ML frameworks
+        self.test_import("torch", required=True, test_func=self.test_torch_cuda)
+        self.test_import("transformers", required=True, test_func=self.test_transformers)
+
+        # Flash attention - critical for performance
+        self.test_import("flash_attn", required=True, test_func=self.test_flash_attn_functionality)
+        self.test_import("cugae", required=True)
+        # Inference engines
+        self.test_import("sglang", required=True, test_func=self.test_sglang_functionality)
+
+        # Distributed computing
+        self.test_import("ray", required=True)
+
+        # Scientific computing
+        self.test_import("numpy", required=True)
+        self.test_import("scipy", required=True)
+
+        # Configuration management
+        self.test_import("hydra", required=True)
+        self.test_import("omegaconf", required=True)
+
+        # Data processing
+        self.test_import("datasets", required=True)
+        self.test_import("pandas", required=True)
+        self.test_import("einops", required=True)
+
+        # Monitoring and logging
+        self.test_import("wandb", required=True)
+        self.test_import("pynvml", required=True)
+
+        # Networking
+        self.test_import("aiohttp", required=True)
+        self.test_import("fastapi", required=True)
+        self.test_import("uvicorn", required=True)
+
+        # Math libraries (for evaluation)
+        self.test_import("sympy", required=True)
+        self.test_import("latex2sympy2", required=True)
+
+    def validate_optional_dependencies(self):
+        """Validate optional dependencies."""
+        print("\n=== Testing Optional Dependencies ===")
+
+        # CUDA extensions (may not be available in all environments)
+        self.test_import("vllm", required=False, test_func=self.test_vllm_functionality)
+        self.test_import("grouped_gemm", required=False)
+        self.test_import("flashattn_hopper", required=False)
+
+        # Optional utilities
+        # The module is spelled tensorboardX; import names are case-sensitive
+        self.test_import("tensorboardX", required=False)
+        self.test_import("swanlab", required=False)
+        self.test_import("matplotlib", required=False)
+        self.test_import("seaborn", required=False)
+        self.test_import("numba", required=False)
+        self.test_import("nltk", required=False)
+
+    def validate_cuda_extensions(self):
+        """Run CUDA smoke tests: a tensor add and one flash-attention call.
+
+        A failure of the basic test is recorded as critical; a failure of the
+        flash-attention call is recorded as a warning.
+        """
+        print("\n=== Testing CUDA Extensions ===")
+
+        try:
+            import torch
+            if torch.cuda.is_available():
+                # Test basic CUDA tensor operations
+                device = torch.device("cuda:0")
+                x = torch.randn(10, device=device)
+                y = torch.randn(10, device=device)
+                _ = x + y
+                # Kernels run asynchronously; synchronize so errors surface here
+                torch.cuda.synchronize()
+                print("✓ Basic CUDA operations working")
+
+                # Test flash attention if available
+                try:
+                    from flash_attn import flash_attn_func
+
+                    # Create small test tensors
+                    batch_size, seq_len, num_heads, head_dim = 1, 32, 4, 64
+                    q = torch.randn(batch_size, seq_len, num_heads, head_dim,
+                                    device=device, dtype=torch.float16)
+                    k = torch.randn(batch_size, seq_len, num_heads, head_dim,
+                                    device=device, dtype=torch.float16)
+                    v = torch.randn(batch_size, seq_len, num_heads, head_dim,
+                                    device=device, dtype=torch.float16)
+
+                    # Test flash attention call
+                    flash_attn_func(q, k, v)
+                    torch.cuda.synchronize()
+                    print("✓ Flash attention CUDA operations working")
+
+                except Exception as e:
+                    self.warnings.append(f"flash_attn CUDA test: {e}")
+                    print(f"⚠ Flash attention CUDA test failed: {e}")
+
+            else:
+                print("⚠ CUDA not available - skipping CUDA extension tests")
+
+        except Exception as e:
+            # Record the failure so it affects the exit code, not just the log
+            self.critical_failures.append(f"CUDA extensions: {e}")
+            print(f"✗ CUDA extension validation failed: {e}")
+
+    def run_validation(self):
+        """Run complete validation suite."""
+        print("AReaL Installation Validation")
+        print("=" * 50)
+
+        self.validate_critical_dependencies()
+        self.validate_optional_dependencies()
+        self.validate_cuda_extensions()
+
+        # Print summary
+        print("\n" + "=" * 50)
+        print("VALIDATION SUMMARY")
+        print("=" * 50)
+
+        total_tests = len(self.results)
+        successful_tests = sum(1 for r in self.results.values() if r["status"] == "SUCCESS")
+        failed_tests = total_tests - successful_tests
+
+        print(f"Total tests: {total_tests}")
+        print(f"Successful: {successful_tests}")
+        print(f"Failed: {failed_tests}")
+
+        if self.critical_failures:
+            print(f"\n🚨 CRITICAL FAILURES ({len(self.critical_failures)}):")
+            for failure in self.critical_failures:
+                print(f" - {failure}")
+
+        if self.warnings:
+            print(f"\n⚠️ WARNINGS ({len(self.warnings)}):")
+            for warning in self.warnings:
+                print(f" - {warning}")
+
+        # Determine overall result
+        if self.critical_failures:
+            print("\n❌ INSTALLATION VALIDATION FAILED")
+            print("Please check the critical failures above and ensure all required")
+            print("dependencies are properly installed according to the installation guide.")
+            return False
+        else:
+            print("\n✅ INSTALLATION VALIDATION PASSED")
+            if self.warnings:
+                print("Note: Some optional dependencies failed but this won't affect")
+                print("core functionality.")
+            return True
+
+def main():
+    """Main entry point."""
+    validator = InstallationValidator()
+    success = validator.run_validation()
+    sys.exit(0 if success else 1)
+
+if __name__ == "__main__":
+    main()
\ No newline at end of file