[FIX] Fix the flash-attn version to 2.7.3 (#114)

* .

* downgrade to 2.7.3

* .

* .

* update ci

* update ci

* update ci

* update CI

* update to tsinghua mirror

* update ci

* .

* .

* .
This commit is contained in:
Wei Fu 2025-06-23 15:46:17 +08:00 committed by GitHub
parent 1ec1399f19
commit 3642cce2fc
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
3 changed files with 373 additions and 2 deletions

View File

@ -0,0 +1,128 @@
# Installation validation workflow.
#
# Syncs the checked-out repository to a remote machine over SSH, starts an
# NVIDIA PyTorch container there, runs examples/env/scripts/setup-pip-deps.sh
# inside it and then examples/env/validate_installation.py.
name: Installation Validation

on:
  push:
    # NOTE(review): push/pull_request are limited to a branch literally named
    # "none"; presumably this disables automatic runs so that only
    # workflow_dispatch triggers the job -- confirm.
    branches: [ none ]
    paths:
      - 'examples/env/scripts/setup-pip-deps.sh'
      - 'docs/tutorial/installation.md'
      - 'examples/env/validate_installation.py'
      - 'setup.py'
      - 'requirements*.txt'
      - '.github/workflows/installation-validation.yml'
  pull_request:
    branches: [ none ]
    paths:
      - 'examples/env/scripts/setup-pip-deps.sh'
      - 'docs/tutorial/installation.md'
      - 'examples/env/validate_installation.py'
      - 'setup.py'
      - 'requirements*.txt'
      - '.github/workflows/installation-validation.yml'
  workflow_dispatch:

jobs:
  validate-installation:
    runs-on: ubuntu-latest
    steps:
      - name: Checkout repository
        uses: actions/checkout@v4
        with:
          lfs: true

      - name: Set up SSH key
        run: |
          mkdir -p ~/.ssh
          echo "${{ secrets.REMOTE_SSH_KEY }}" > ~/.ssh/id_rsa
          chmod 600 ~/.ssh/id_rsa
          ssh-keyscan -p 8107 101.6.96.205 >> ~/.ssh/known_hosts

      - name: Synchronize repository to remote machine
        run: |
          # Use rsync to synchronize repository to remote machine
          rsync -avz --delete \
            --exclude='.git' \
            --exclude='__pycache__' \
            --exclude='*.pyc' \
            --exclude='*.pyo' \
            --exclude='*.egg-info' \
            --exclude='build/' \
            --exclude='dist/' \
            --exclude='.pytest_cache' \
            --exclude='.coverage' \
            --exclude='*.so' \
            --exclude='*.dylib' \
            --exclude='node_modules/' \
            --exclude='.env' \
            --exclude='.venv' \
            -e 'ssh -p 8107' . fuwei@101.6.96.205:/tmp/areal-validation/

      - name: Run installation validation on remote machine
        run: |
          # The heredoc delimiter is quoted, so nothing below is expanded on the
          # runner; every variable is evaluated by the remote shell.
          ssh -p 8107 fuwei@101.6.96.205 << 'EOF'
          set -e
          # Navigate to the synchronized repository
          cd /tmp/areal-validation
          # Create persistent pip cache directory
          mkdir -p /tmp/pip-cache
          # Generate a unique container name
          CONTAINER_NAME="areal-validation-$(date +%s)"
          # Tear down the container and the synced checkout on every exit path.
          # With `set -e` a failing step aborts the script immediately, so
          # cleanup placed at the end would be skipped and the `sleep infinity`
          # container would keep running.
          cleanup() {
            docker stop "$CONTAINER_NAME" >/dev/null 2>&1 || true
            docker rm "$CONTAINER_NAME" >/dev/null 2>&1 || true
            cd ~
            rm -rf /tmp/areal-validation
          }
          trap cleanup EXIT
          # Stop and remove any existing container with the same name
          docker stop "$CONTAINER_NAME" 2>/dev/null || true
          docker rm "$CONTAINER_NAME" 2>/dev/null || true
          echo "=== Starting Docker container ==="
          # Launch Docker container with NVIDIA PyTorch image
          docker run -d \
            --name "$CONTAINER_NAME" \
            --gpus all \
            --shm-size=8g \
            -v "$(pwd)":/workspace \
            -v /tmp/pip-cache:/root/.cache/pip \
            -w /workspace \
            nvcr.io/nvidia/pytorch:25.01-py3 \
            sleep infinity
          echo "=== Verifying CUDA environment in container ==="
          docker exec "$CONTAINER_NAME" nvidia-smi
          docker exec "$CONTAINER_NAME" nvcc --version
          echo "=== Verifying workspace contents ==="
          docker exec "$CONTAINER_NAME" pwd
          docker exec "$CONTAINER_NAME" ls -la /workspace
          docker exec "$CONTAINER_NAME" ls -la /workspace/examples/env/ || echo "examples/env directory not found"
          echo "=== Checking pip cache before installation ==="
          du -sh /tmp/pip-cache 2>/dev/null || echo "Cache directory empty"
          echo "=== Installing dependencies ==="
          # The inner script has no `set -e`, so the exit status of this step is
          # that of its last command, validate_installation.py.
          docker exec "$CONTAINER_NAME" bash -c "
          python -m pip install --upgrade pip
          pip config set global.index-url https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple
          pip config unset global.extra-index-url
          # Run the installation script
          bash examples/env/scripts/setup-pip-deps.sh
          python examples/env/validate_installation.py
          "
          echo "=== Checking pip cache after installation ==="
          du -sh /tmp/pip-cache 2>/dev/null || echo "Cache directory still empty"
          echo "=== Installation validation completed successfully ==="
          # Cleanup is performed by the EXIT trap registered above.
          EOF

      - name: Cleanup SSH key
        if: always()
        run: |
          rm -f ~/.ssh/id_rsa

View File

@ -1,11 +1,12 @@
#!/bin/bash
# Installs the pip packages listed below, one command after another.
# basic dependencies
pip install -U pip
# NOTE(review): the two `pip uninstall` lines below differ only by the added
# `torch`; they look like the old and new versions of one command shown
# together by the diff view -- confirm which one the real script contains.
pip uninstall deepspeed flash-attn pynvml cugraph-dgl dask-cuda cugraph-service-server raft-dask cugraph cuml cugraph-pyg -y
pip uninstall torch deepspeed flash-attn pynvml cugraph-dgl dask-cuda cugraph-service-server raft-dask cugraph cuml cugraph-pyg -y
# Install torch, torchvision and torchaudio at exact pinned versions.
pip install torch==2.6.0 torchvision==0.21.0 torchaudio==2.6.0
# Install SGLang with its "all" extra at an exact pinned version.
pip install "sglang[all]==0.4.6.post4"
pip install megatron-core==0.11.0 nvidia-ml-py
# Install cugae from its git repository; --no-build-isolation makes pip build
# in the current environment instead of an isolated one.
pip install git+https://github.com/garrett4wade/cugae --no-build-isolation --verbose
# NOTE(review): as above, these two flash-attn lines look like the old and new
# versions of one command. The second caps the version at 2.7.3 with `<=`, so
# older releases are accepted as well -- confirm whether an exact pin is meant.
pip install flash-attn --no-build-isolation
pip install "flash-attn<=2.7.3" --no-build-isolation
# Package used for calculating math reward
pip install -e evaluation/latex2sympy

242
examples/env/validate_installation.py vendored Normal file
View File

@ -0,0 +1,242 @@
#!/usr/bin/env python3
"""
Installation Validation Script for AReaL
This script validates that all critical dependencies are properly installed
and can be imported successfully. It's designed to be run in CI to validate
the installation procedure described in docs/tutorial/installation.md.
"""
import importlib
import sys
import traceback
import warnings
from importlib.metadata import version as get_version
from typing import Any, Callable, Dict, List, Optional

from packaging.version import Version
class InstallationValidator:
    """Import-check the project's dependencies and collect the outcomes.

    State filled in by the ``test_*`` / ``validate_*`` methods:

    * ``results`` maps a module name to ``{"status": ..., "error": ...}``
      where status is ``"SUCCESS"``, ``"FAILED"`` (import error) or
      ``"ERROR"`` (any other exception).
    * ``critical_failures`` holds ``"<module>: <error>"`` strings for
      required modules that failed; any entry makes ``run_validation``
      return ``False``.
    * ``warnings`` holds non-fatal problems: optional modules that failed
      and failures of the CUDA smoke tests.
    """

    def __init__(self):
        self.results = {}
        self.critical_failures = []
        self.warnings = []

    def _record_failure(self, module_name: str, error: Exception,
                        required: bool, *, is_import_error: bool) -> None:
        """Store and print a failed check for ``module_name``."""
        message = str(error)
        status = "FAILED" if is_import_error else "ERROR"
        suffix = "" if is_import_error else " ERROR"
        self.results[module_name] = {"status": status, "error": message}
        entry = f"{module_name}: {message}"
        if required:
            self.critical_failures.append(entry)
            print(f"{module_name} (CRITICAL{suffix}): {message}")
        else:
            self.warnings.append(entry)
            print(f"{module_name} (OPTIONAL{suffix}): {message}")

    @staticmethod
    def _require_version(package: str, expected: str) -> None:
        """Raise ``RuntimeError`` unless ``package`` is installed at ``expected``.

        An explicit raise is used instead of ``assert`` so the check still
        runs under ``python -O`` and the report contains a readable message.
        """
        installed = get_version(package)
        if Version(installed) != Version(expected):
            raise RuntimeError(
                f"{package} version mismatch: expected {expected}, "
                f"found {installed}"
            )

    def test_import(self, module_name: str, required: bool = True,
                    test_func: Optional[Callable[[Any], None]] = None) -> bool:
        """Import ``module_name`` and optionally run an extra check on it.

        Args:
            module_name: Dotted name passed to ``importlib.import_module``.
            required: When true a failure is recorded in
                ``critical_failures``, otherwise in ``warnings``.
            test_func: Optional callable invoked with the imported module;
                any exception it raises counts as a failure of this check.

        Returns:
            ``True`` if the import (and ``test_func``, if given) succeeded,
            ``False`` otherwise. Exceptions are recorded, never propagated.
        """
        try:
            module = importlib.import_module(module_name)
            # Run additional test if provided
            if test_func:
                test_func(module)
        except ImportError as e:
            self._record_failure(module_name, e, required, is_import_error=True)
            return False
        except Exception as e:
            self._record_failure(module_name, e, required, is_import_error=False)
            return False
        self.results[module_name] = {"status": "SUCCESS", "error": None}
        print(module_name)
        return True

    def test_torch_cuda(self, torch_module):
        """Raise ``RuntimeError`` if PyTorch reports that CUDA is unavailable."""
        if not torch_module.cuda.is_available():
            raise RuntimeError("CUDA is not available in PyTorch")
        print(f" - CUDA devices: {torch_module.cuda.device_count()}")
        print(f" - CUDA version: {torch_module.version.cuda}")

    def test_flash_attn_functionality(self, flash_attn_module):
        """Check that the key flash-attn entry points can be imported."""
        # Imported only to prove the names exist; they are not called here.
        from flash_attn import flash_attn_func, flash_attn_varlen_func  # noqa: F401
        print(" - Flash attention functions imported successfully")

    def test_vllm_functionality(self, vllm_module):
        """Check that vLLM's ``LLM`` and ``SamplingParams`` can be imported."""
        from vllm import LLM, SamplingParams  # noqa: F401
        print(" - vLLM core classes imported successfully")

    def test_sglang_functionality(self, sglang_module):
        """Check SGLang's kernel package, server entry point and version.

        Raises:
            RuntimeError: If the installed sglang version is not 0.4.6.post4.
        """
        # Basic import test is sufficient for CI
        import sgl_kernel  # noqa: F401
        from sglang import launch_server  # noqa: F401
        self._require_version("sglang", "0.4.6.post4")
        print(" - SGLang imported successfully")

    def test_transformers(self, transformers_module):
        """Check that the installed transformers version is 4.51.1.

        Raises:
            RuntimeError: If a different version is installed.
        """
        self._require_version("transformers", "4.51.1")
        print(" - transformers imported successfully")

    def validate_critical_dependencies(self):
        """Validate critical dependencies that must be present."""
        print("\n=== Testing Critical Dependencies ===")
        # Core ML frameworks
        self.test_import("torch", required=True, test_func=self.test_torch_cuda)
        self.test_import("transformers", required=True, test_func=self.test_transformers)
        # Flash attention - critical for performance
        self.test_import("flash_attn", required=True, test_func=self.test_flash_attn_functionality)
        self.test_import("cugae", required=True)
        # Inference engines
        self.test_import("sglang", required=True, test_func=self.test_sglang_functionality)
        # Distributed computing
        self.test_import("ray", required=True)
        # Scientific computing
        self.test_import("numpy", required=True)
        self.test_import("scipy", required=True)
        # Configuration management
        self.test_import("hydra", required=True)
        self.test_import("omegaconf", required=True)
        # Data processing
        self.test_import("datasets", required=True)
        self.test_import("pandas", required=True)
        self.test_import("einops", required=True)
        # Monitoring and logging
        self.test_import("wandb", required=True)
        self.test_import("pynvml", required=True)
        # Networking
        self.test_import("aiohttp", required=True)
        self.test_import("fastapi", required=True)
        self.test_import("uvicorn", required=True)
        # Math libraries (for evaluation)
        self.test_import("sympy", required=True)
        self.test_import("latex2sympy2", required=True)

    def validate_optional_dependencies(self):
        """Validate optional dependencies."""
        print("\n=== Testing Optional Dependencies ===")
        # CUDA extensions (may not be available in all environments)
        self.test_import("vllm", required=False, test_func=self.test_vllm_functionality)
        self.test_import("grouped_gemm", required=False)
        self.test_import("flashattn_hopper", required=False)
        # Optional utilities
        self.test_import("tensorboardx", required=False)
        self.test_import("swanlab", required=False)
        self.test_import("matplotlib", required=False)
        self.test_import("seaborn", required=False)
        self.test_import("numba", required=False)
        self.test_import("nltk", required=False)

    def validate_cuda_extensions(self):
        """Run small CUDA smoke tests (tensor add, flash attention forward).

        Failures are printed and appended to ``warnings`` so that they show
        up in the summary; they do not make the validation fail.
        """
        print("\n=== Testing CUDA Extensions ===")
        try:
            import torch

            if not torch.cuda.is_available():
                print("⚠ CUDA not available - skipping CUDA extension tests")
                return

            # Test basic CUDA tensor operations
            device = torch.device("cuda:0")
            x = torch.randn(10, device=device)
            y = torch.randn(10, device=device)
            _ = x + y
            # CUDA kernels run asynchronously; synchronize so that a kernel
            # error is raised here rather than after the success message.
            torch.cuda.synchronize()
            print("✓ Basic CUDA operations working")

            # Test flash attention if available
            try:
                from flash_attn import flash_attn_func

                # Create small test tensors
                batch_size, seq_len, num_heads, head_dim = 1, 32, 4, 64
                q, k, v = (
                    torch.randn(batch_size, seq_len, num_heads, head_dim,
                                device=device, dtype=torch.float16)
                    for _ in range(3)
                )
                # Test flash attention call
                flash_attn_func(q, k, v)
                torch.cuda.synchronize()
                print("✓ Flash attention CUDA operations working")
            except Exception as e:
                self.warnings.append(f"flash_attn CUDA test: {e}")
                print(f"⚠ Flash attention CUDA test failed: {e}")
        except Exception as e:
            self.warnings.append(f"CUDA extension validation: {e}")
            print(f"✗ CUDA extension validation failed: {e}")

    def run_validation(self):
        """Run complete validation suite.

        Returns:
            ``True`` when no critical failure was recorded, else ``False``.
        """
        print("AReaL Installation Validation")
        print("=" * 50)
        self.validate_critical_dependencies()
        self.validate_optional_dependencies()
        self.validate_cuda_extensions()

        # Print summary
        print("\n" + "=" * 50)
        print("VALIDATION SUMMARY")
        print("=" * 50)
        total_tests = len(self.results)
        successful_tests = sum(
            1 for r in self.results.values() if r["status"] == "SUCCESS"
        )
        failed_tests = total_tests - successful_tests
        print(f"Total tests: {total_tests}")
        print(f"Successful: {successful_tests}")
        print(f"Failed: {failed_tests}")

        if self.critical_failures:
            print(f"\n🚨 CRITICAL FAILURES ({len(self.critical_failures)}):")
            for failure in self.critical_failures:
                print(f" - {failure}")
        if self.warnings:
            print(f"\n⚠️ WARNINGS ({len(self.warnings)}):")
            for warning in self.warnings:
                print(f" - {warning}")

        # Determine overall result
        if self.critical_failures:
            print("\n❌ INSTALLATION VALIDATION FAILED")
            print("Please check the critical failures above and ensure all required")
            print("dependencies are properly installed according to the installation guide.")
            return False

        print("\n✅ INSTALLATION VALIDATION PASSED")
        if self.warnings:
            print("Note: Some optional dependencies failed but this won't affect")
            print("core functionality.")
        return True
def main():
    """Run the validation suite and exit with status 0 on success, 1 otherwise."""
    passed = InstallationValidator().run_validation()
    exit_code = 0 if passed else 1
    sys.exit(exit_code)


if __name__ == "__main__":
    main()