mirror of https://github.com/inclusionAI/AReaL
[FIX] Fix the flash-attn version to 2.7.3 (#114)
* . * downgrade to 2.7.3 * . * . * update ci * update ci * update ci * update CI * update to tsinghua mirror * update ci * . * . * .
This commit is contained in: parent 1ec1399f19, commit 3642cce2fc
.github/workflows/installation-validation.yml (new file)
@@ -0,0 +1,128 @@

name: Installation Validation

on:
  push:
    branches: [ none ]
    paths:
      - 'examples/env/scripts/setup-pip-deps.sh'
      - 'docs/tutorial/installation.md'
      - 'examples/env/validate_installation.py'
      - 'setup.py'
      - 'requirements*.txt'
      - '.github/workflows/installation-validation.yml'
  pull_request:
    branches: [ none ]
    paths:
      - 'examples/env/scripts/setup-pip-deps.sh'
      - 'docs/tutorial/installation.md'
      - 'examples/env/validate_installation.py'
      - 'setup.py'
      - 'requirements*.txt'
      - '.github/workflows/installation-validation.yml'
  workflow_dispatch:

jobs:
  validate-installation:
    runs-on: ubuntu-latest

    steps:
      - name: Checkout repository
        uses: actions/checkout@v4
        with:
          lfs: true

      - name: Set up SSH key
        run: |
          mkdir -p ~/.ssh
          echo "${{ secrets.REMOTE_SSH_KEY }}" > ~/.ssh/id_rsa
          chmod 600 ~/.ssh/id_rsa
          ssh-keyscan -p 8107 101.6.96.205 >> ~/.ssh/known_hosts

      - name: Synchronize repository to remote machine
        run: |
          # Use rsync to synchronize repository to remote machine
          rsync -avz --delete \
            --exclude='.git' \
            --exclude='__pycache__' \
            --exclude='*.pyc' \
            --exclude='*.pyo' \
            --exclude='*.egg-info' \
            --exclude='build/' \
            --exclude='dist/' \
            --exclude='.pytest_cache' \
            --exclude='.coverage' \
            --exclude='*.so' \
            --exclude='*.dylib' \
            --exclude='node_modules/' \
            --exclude='.env' \
            --exclude='.venv' \
            -e 'ssh -p 8107' . fuwei@101.6.96.205:/tmp/areal-validation/

      - name: Run installation validation on remote machine
        run: |
          ssh -p 8107 fuwei@101.6.96.205 << 'EOF'
          set -e

          # Navigate to the synchronized repository
          cd /tmp/areal-validation

          # Create persistent pip cache directory
          mkdir -p /tmp/pip-cache

          # Generate a unique container name
          CONTAINER_NAME="areal-validation-$(date +%s)"

          # Stop and remove any existing container with the same name
          docker stop $CONTAINER_NAME 2>/dev/null || true
          docker rm $CONTAINER_NAME 2>/dev/null || true

          echo "=== Starting Docker container ==="
          # Launch Docker container with NVIDIA PyTorch image
          docker run -d \
            --name $CONTAINER_NAME \
            --gpus all \
            --shm-size=8g \
            -v $(pwd):/workspace \
            -v /tmp/pip-cache:/root/.cache/pip \
            -w /workspace \
            nvcr.io/nvidia/pytorch:25.01-py3 \
            sleep infinity

          echo "=== Verifying CUDA environment in container ==="
          docker exec $CONTAINER_NAME nvidia-smi
          docker exec $CONTAINER_NAME nvcc --version

          echo "=== Verifying workspace contents ==="
          docker exec $CONTAINER_NAME pwd
          docker exec $CONTAINER_NAME ls -la /workspace
          docker exec $CONTAINER_NAME ls -la /workspace/examples/env/ || echo "examples/env directory not found"

          echo "=== Checking pip cache before installation ==="
          du -sh /tmp/pip-cache 2>/dev/null || echo "Cache directory empty"

          echo "=== Installing dependencies ==="
          docker exec $CONTAINER_NAME bash -c "
            python -m pip install --upgrade pip
            pip config set global.index-url https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple
            pip config unset global.extra-index-url
            # Run the installation script
            bash examples/env/scripts/setup-pip-deps.sh
            python examples/env/validate_installation.py
          "

          echo "=== Checking pip cache after installation ==="
          du -sh /tmp/pip-cache 2>/dev/null || echo "Cache directory still empty"

          echo "=== Installation validation completed successfully ==="

          # Cleanup
          docker stop $CONTAINER_NAME
          docker rm $CONTAINER_NAME
          cd ~
          rm -rf /tmp/areal-validation
          EOF

      - name: Cleanup SSH key
        if: always()
        run: |
          rm -f ~/.ssh/id_rsa
examples/env/scripts/setup-pip-deps.sh
@@ -1,11 +1,12 @@

 #!/bin/bash
 # basic dependencies
 pip install -U pip
-pip uninstall deepspeed flash-attn pynvml cugraph-dgl dask-cuda cugraph-service-server raft-dask cugraph cuml cugraph-pyg -y
+pip uninstall torch deepspeed flash-attn pynvml cugraph-dgl dask-cuda cugraph-service-server raft-dask cugraph cuml cugraph-pyg -y
+pip install torch==2.6.0 torchvision==0.21.0 torchaudio==2.6.0
 pip install "sglang[all]==0.4.6.post4"
 pip install megatron-core==0.11.0 nvidia-ml-py
 pip install git+https://github.com/garrett4wade/cugae --no-build-isolation --verbose
-pip install flash-attn --no-build-isolation
+pip install "flash-attn<=2.7.3" --no-build-isolation

 # Package used for calculating math reward
 pip install -e evaluation/latex2sympy
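
As a quick spot-check of the pin this hunk introduces, a minimal sketch along the following lines could be run in the installed environment; it only reads package metadata and assumes flash-attn and packaging are already installed.

# Minimal sketch: confirm the installed flash-attn respects the <=2.7.3 pin.
# Assumes flash-attn and packaging are installed in the current environment.
from importlib.metadata import version
from packaging.version import Version

installed = Version(version("flash-attn"))
assert installed <= Version("2.7.3"), f"flash-attn {installed} exceeds the 2.7.3 pin"
print(f"flash-attn {installed} satisfies the <=2.7.3 pin")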
examples/env/validate_installation.py (new file)
@@ -0,0 +1,242 @@

#!/usr/bin/env python3
"""
Installation Validation Script for AReaL

This script validates that all critical dependencies are properly installed
and can be imported successfully. It's designed to be run in CI to validate
the installation procedure described in docs/tutorial/installation.md.
"""

import importlib
import sys
import traceback
import warnings
from importlib.metadata import version as get_version
from typing import Any, Dict, List, Optional

from packaging.version import Version


class InstallationValidator:
    def __init__(self):
        self.results = {}
        self.critical_failures = []
        self.warnings = []

    def test_import(self, module_name: str, required: bool = True,
                    test_func: Optional[callable] = None) -> bool:
        """Test importing a module and optionally run additional tests."""
        try:
            module = importlib.import_module(module_name)

            # Run additional test if provided
            if test_func:
                test_func(module)

            self.results[module_name] = {"status": "SUCCESS", "error": None}
            print(f"✓ {module_name}")
            return True

        except ImportError as e:
            self.results[module_name] = {"status": "FAILED", "error": str(e)}
            if required:
                self.critical_failures.append(f"{module_name}: {str(e)}")
                print(f"✗ {module_name} (CRITICAL): {str(e)}")
            else:
                self.warnings.append(f"{module_name}: {str(e)}")
                print(f"⚠ {module_name} (OPTIONAL): {str(e)}")
            return False

        except Exception as e:
            self.results[module_name] = {"status": "ERROR", "error": str(e)}
            if required:
                self.critical_failures.append(f"{module_name}: {str(e)}")
                print(f"✗ {module_name} (CRITICAL ERROR): {str(e)}")
            else:
                self.warnings.append(f"{module_name}: {str(e)}")
                print(f"⚠ {module_name} (OPTIONAL ERROR): {str(e)}")
            return False

    def test_torch_cuda(self, torch_module):
        """Test PyTorch CUDA availability."""
        if not torch_module.cuda.is_available():
            raise RuntimeError("CUDA is not available in PyTorch")
        print(f" - CUDA devices: {torch_module.cuda.device_count()}")
        print(f" - CUDA version: {torch_module.version.cuda}")

    def test_flash_attn_functionality(self, flash_attn_module):
        """Test flash attention functionality."""
        # Try to import key functions
        from flash_attn import flash_attn_func, flash_attn_varlen_func
        print(" - Flash attention functions imported successfully")

    def test_vllm_functionality(self, vllm_module):
        """Test vLLM basic functionality."""
        from vllm import LLM, SamplingParams
        print(" - vLLM core classes imported successfully")

    def test_sglang_functionality(self, sglang_module):
        """Test SGLang basic functionality."""
        # Basic import test is sufficient for CI
        import sgl_kernel
        from sglang import launch_server
        assert Version(get_version("sglang")) == Version("0.4.6.post4")
        print(" - SGLang imported successfully")

    def test_transformers(self, transformers_module):
        assert Version(get_version("transformers")) == Version("4.51.1")
        print(" - transformers imported successfully")

    def validate_critical_dependencies(self):
        """Validate critical dependencies that must be present."""
        print("\n=== Testing Critical Dependencies ===")

        # Core ML frameworks
        self.test_import("torch", required=True, test_func=self.test_torch_cuda)
        self.test_import("transformers", required=True, test_func=self.test_transformers)

        # Flash attention - critical for performance
        self.test_import("flash_attn", required=True, test_func=self.test_flash_attn_functionality)
        self.test_import("cugae", required=True)

        # Inference engines
        self.test_import("sglang", required=True, test_func=self.test_sglang_functionality)

        # Distributed computing
        self.test_import("ray", required=True)

        # Scientific computing
        self.test_import("numpy", required=True)
        self.test_import("scipy", required=True)

        # Configuration management
        self.test_import("hydra", required=True)
        self.test_import("omegaconf", required=True)

        # Data processing
        self.test_import("datasets", required=True)
        self.test_import("pandas", required=True)
        self.test_import("einops", required=True)

        # Monitoring and logging
        self.test_import("wandb", required=True)
        self.test_import("pynvml", required=True)

        # Networking
        self.test_import("aiohttp", required=True)
        self.test_import("fastapi", required=True)
        self.test_import("uvicorn", required=True)

        # Math libraries (for evaluation)
        self.test_import("sympy", required=True)
        self.test_import("latex2sympy2", required=True)

    def validate_optional_dependencies(self):
        """Validate optional dependencies."""
        print("\n=== Testing Optional Dependencies ===")

        # CUDA extensions (may not be available in all environments)
        self.test_import("vllm", required=False, test_func=self.test_vllm_functionality)
        self.test_import("grouped_gemm", required=False)
        self.test_import("flashattn_hopper", required=False)

        # Optional utilities
        self.test_import("tensorboardx", required=False)
        self.test_import("swanlab", required=False)
        self.test_import("matplotlib", required=False)
        self.test_import("seaborn", required=False)
        self.test_import("numba", required=False)
        self.test_import("nltk", required=False)

    def validate_cuda_extensions(self):
        """Validate CUDA-specific functionality."""
        print("\n=== Testing CUDA Extensions ===")

        try:
            import torch
            if torch.cuda.is_available():
                # Test basic CUDA tensor operations
                device = torch.device("cuda:0")
                x = torch.randn(10, device=device)
                y = torch.randn(10, device=device)
                z = x + y
                print("✓ Basic CUDA operations working")

                # Test flash attention if available
                try:
                    from flash_attn import flash_attn_func

                    # Create small test tensors
                    batch_size, seq_len, num_heads, head_dim = 1, 32, 4, 64
                    q = torch.randn(batch_size, seq_len, num_heads, head_dim,
                                    device=device, dtype=torch.float16)
                    k = torch.randn(batch_size, seq_len, num_heads, head_dim,
                                    device=device, dtype=torch.float16)
                    v = torch.randn(batch_size, seq_len, num_heads, head_dim,
                                    device=device, dtype=torch.float16)

                    # Test flash attention call
                    out = flash_attn_func(q, k, v)
                    print("✓ Flash attention CUDA operations working")

                except Exception as e:
                    print(f"⚠ Flash attention CUDA test failed: {e}")

            else:
                print("⚠ CUDA not available - skipping CUDA extension tests")

        except Exception as e:
            print(f"✗ CUDA extension validation failed: {e}")

    def run_validation(self):
        """Run complete validation suite."""
        print("AReaL Installation Validation")
        print("=" * 50)

        self.validate_critical_dependencies()
        self.validate_optional_dependencies()
        self.validate_cuda_extensions()

        # Print summary
        print("\n" + "=" * 50)
        print("VALIDATION SUMMARY")
        print("=" * 50)

        total_tests = len(self.results)
        successful_tests = sum(1 for r in self.results.values() if r["status"] == "SUCCESS")
        failed_tests = total_tests - successful_tests

        print(f"Total tests: {total_tests}")
        print(f"Successful: {successful_tests}")
        print(f"Failed: {failed_tests}")

        if self.critical_failures:
            print(f"\n🚨 CRITICAL FAILURES ({len(self.critical_failures)}):")
            for failure in self.critical_failures:
                print(f"  - {failure}")

        if self.warnings:
            print(f"\n⚠️  WARNINGS ({len(self.warnings)}):")
            for warning in self.warnings:
                print(f"  - {warning}")

        # Determine overall result
        if self.critical_failures:
            print("\n❌ INSTALLATION VALIDATION FAILED")
            print("Please check the critical failures above and ensure all required")
            print("dependencies are properly installed according to the installation guide.")
            return False
        else:
            print("\n✅ INSTALLATION VALIDATION PASSED")
            if self.warnings:
                print("Note: Some optional dependencies failed but this won't affect")
                print("core functionality.")
            return True


def main():
    """Main entry point."""
    validator = InstallationValidator()
    success = validator.run_validation()
    sys.exit(0 if success else 1)


if __name__ == "__main__":
    main()
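
Besides the full run invoked by the workflow, the validator class above can also be exercised piecemeal. A hypothetical reuse sketch (assuming examples/env/ is on sys.path so the module can be imported) checking only the packages this commit touches:

# Hypothetical sketch: focused check using the InstallationValidator defined above.
# Assumes examples/env/ is on sys.path so validate_installation imports cleanly.
from validate_installation import InstallationValidator

validator = InstallationValidator()
validator.test_import("torch", required=True)
validator.test_import("flash_attn", required=True)
print("critical failures:", validator.critical_failures)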