mirror of https://github.com/inclusionAI/AReaL
[FIX] Fix the flash-attn version to 2.7.3 (#114)
* . * downgrade to 2.7.3 * . * . * update ci * update ci * update ci * update CI * update to tsinghua mirror * update ci * . * . * .
This commit is contained in: parent 1ec1399f19, commit 3642cce2fc
.github/workflows/installation-validation.yml (new file)
@@ -0,0 +1,128 @@

name: Installation Validation

on:
  push:
    branches: [ none ]
    paths:
      - 'examples/env/scripts/setup-pip-deps.sh'
      - 'docs/tutorial/installation.md'
      - 'examples/env/validate_installation.py'
      - 'setup.py'
      - 'requirements*.txt'
      - '.github/workflows/installation-validation.yml'
  pull_request:
    branches: [ none ]
    paths:
      - 'examples/env/scripts/setup-pip-deps.sh'
      - 'docs/tutorial/installation.md'
      - 'examples/env/validate_installation.py'
      - 'setup.py'
      - 'requirements*.txt'
      - '.github/workflows/installation-validation.yml'
  workflow_dispatch:

jobs:
  validate-installation:
    runs-on: ubuntu-latest

    steps:
      - name: Checkout repository
        uses: actions/checkout@v4
        with:
          lfs: true

      - name: Set up SSH key
        run: |
          mkdir -p ~/.ssh
          echo "${{ secrets.REMOTE_SSH_KEY }}" > ~/.ssh/id_rsa
          chmod 600 ~/.ssh/id_rsa
          ssh-keyscan -p 8107 101.6.96.205 >> ~/.ssh/known_hosts

      - name: Synchronize repository to remote machine
        run: |
          # Use rsync to synchronize repository to remote machine
          rsync -avz --delete \
            --exclude='.git' \
            --exclude='__pycache__' \
            --exclude='*.pyc' \
            --exclude='*.pyo' \
            --exclude='*.egg-info' \
            --exclude='build/' \
            --exclude='dist/' \
            --exclude='.pytest_cache' \
            --exclude='.coverage' \
            --exclude='*.so' \
            --exclude='*.dylib' \
            --exclude='node_modules/' \
            --exclude='.env' \
            --exclude='.venv' \
            -e 'ssh -p 8107' . fuwei@101.6.96.205:/tmp/areal-validation/

      - name: Run installation validation on remote machine
        run: |
          ssh -p 8107 fuwei@101.6.96.205 << 'EOF'
          set -e

          # Navigate to the synchronized repository
          cd /tmp/areal-validation

          # Create persistent pip cache directory
          mkdir -p /tmp/pip-cache

          # Generate a unique container name
          CONTAINER_NAME="areal-validation-$(date +%s)"

          # Stop and remove any existing container with the same name
          docker stop $CONTAINER_NAME 2>/dev/null || true
          docker rm $CONTAINER_NAME 2>/dev/null || true

          echo "=== Starting Docker container ==="
          # Launch Docker container with NVIDIA PyTorch image
          docker run -d \
            --name $CONTAINER_NAME \
            --gpus all \
            --shm-size=8g \
            -v $(pwd):/workspace \
            -v /tmp/pip-cache:/root/.cache/pip \
            -w /workspace \
            nvcr.io/nvidia/pytorch:25.01-py3 \
            sleep infinity

          echo "=== Verifying CUDA environment in container ==="
          docker exec $CONTAINER_NAME nvidia-smi
          docker exec $CONTAINER_NAME nvcc --version

          echo "=== Verifying workspace contents ==="
          docker exec $CONTAINER_NAME pwd
          docker exec $CONTAINER_NAME ls -la /workspace
          docker exec $CONTAINER_NAME ls -la /workspace/examples/env/ || echo "examples/env directory not found"

          echo "=== Checking pip cache before installation ==="
          du -sh /tmp/pip-cache 2>/dev/null || echo "Cache directory empty"

          echo "=== Installing dependencies ==="
          docker exec $CONTAINER_NAME bash -c "
            python -m pip install --upgrade pip
            pip config set global.index-url https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple
            pip config unset global.extra-index-url
            # Run the installation script
            bash examples/env/scripts/setup-pip-deps.sh
            python examples/env/validate_installation.py
          "

          echo "=== Checking pip cache after installation ==="
          du -sh /tmp/pip-cache 2>/dev/null || echo "Cache directory still empty"

          echo "=== Installation validation completed successfully ==="

          # Cleanup
          docker stop $CONTAINER_NAME
          docker rm $CONTAINER_NAME
          cd ~
          rm -rf /tmp/areal-validation
          EOF

      - name: Cleanup SSH key
        if: always()
        run: |
          rm -f ~/.ssh/id_rsa
examples/env/scripts/setup-pip-deps.sh
@@ -1,11 +1,12 @@

 #!/bin/bash
 # basic dependencies
 pip install -U pip
-pip uninstall deepspeed flash-attn pynvml cugraph-dgl dask-cuda cugraph-service-server raft-dask cugraph cuml cugraph-pyg -y
+pip uninstall torch deepspeed flash-attn pynvml cugraph-dgl dask-cuda cugraph-service-server raft-dask cugraph cuml cugraph-pyg -y
+pip install torch==2.6.0 torchvision==0.21.0 torchaudio==2.6.0
 pip install "sglang[all]==0.4.6.post4"
 pip install megatron-core==0.11.0 nvidia-ml-py
 pip install git+https://github.com/garrett4wade/cugae --no-build-isolation --verbose
-pip install flash-attn --no-build-isolation
+pip install "flash-attn<=2.7.3" --no-build-isolation

 # Package used for calculating math reward
 pip install -e evaluation/latex2sympy
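
As a quick spot-check of the pin this hunk introduces, a minimal sketch along the following lines could be run in the installed environment; it only reads package metadata and assumes flash-attn and packaging are already installed.

# Minimal sketch: confirm the installed flash-attn respects the <=2.7.3 pin.
# Assumes flash-attn and packaging are installed in the current environment.
from importlib.metadata import version
from packaging.version import Version

installed = Version(version("flash-attn"))
assert installed <= Version("2.7.3"), f"flash-attn {installed} exceeds the 2.7.3 pin"
print(f"flash-attn {installed} satisfies the <=2.7.3 pin")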
examples/env/validate_installation.py (new file)
@@ -0,0 +1,242 @@

#!/usr/bin/env python3
"""
Installation Validation Script for AReaL

This script validates that all critical dependencies are properly installed
and can be imported successfully. It's designed to be run in CI to validate
the installation procedure described in docs/tutorial/installation.md.
"""

import importlib
import sys
import traceback
import warnings
from importlib.metadata import version as get_version
from typing import Any, Dict, List, Optional

from packaging.version import Version


class InstallationValidator:
    def __init__(self):
        self.results = {}
        self.critical_failures = []
        self.warnings = []

    def test_import(self, module_name: str, required: bool = True,
                    test_func: Optional[callable] = None) -> bool:
        """Test importing a module and optionally run additional tests."""
        try:
            module = importlib.import_module(module_name)

            # Run additional test if provided
            if test_func:
                test_func(module)

            self.results[module_name] = {"status": "SUCCESS", "error": None}
            print(f"✓ {module_name}")
            return True

        except ImportError as e:
            self.results[module_name] = {"status": "FAILED", "error": str(e)}
            if required:
                self.critical_failures.append(f"{module_name}: {str(e)}")
                print(f"✗ {module_name} (CRITICAL): {str(e)}")
            else:
                self.warnings.append(f"{module_name}: {str(e)}")
                print(f"⚠ {module_name} (OPTIONAL): {str(e)}")
            return False

        except Exception as e:
            self.results[module_name] = {"status": "ERROR", "error": str(e)}
            if required:
                self.critical_failures.append(f"{module_name}: {str(e)}")
                print(f"✗ {module_name} (CRITICAL ERROR): {str(e)}")
            else:
                self.warnings.append(f"{module_name}: {str(e)}")
                print(f"⚠ {module_name} (OPTIONAL ERROR): {str(e)}")
            return False

    def test_torch_cuda(self, torch_module):
        """Test PyTorch CUDA availability."""
        if not torch_module.cuda.is_available():
            raise RuntimeError("CUDA is not available in PyTorch")
        print(f" - CUDA devices: {torch_module.cuda.device_count()}")
        print(f" - CUDA version: {torch_module.version.cuda}")

    def test_flash_attn_functionality(self, flash_attn_module):
        """Test flash attention functionality."""
        # Try to import key functions
        from flash_attn import flash_attn_func, flash_attn_varlen_func
        print(" - Flash attention functions imported successfully")

    def test_vllm_functionality(self, vllm_module):
        """Test vLLM basic functionality."""
        from vllm import LLM, SamplingParams
        print(" - vLLM core classes imported successfully")

    def test_sglang_functionality(self, sglang_module):
        """Test SGLang basic functionality."""
        # Basic import test is sufficient for CI
        import sgl_kernel
        from sglang import launch_server
        assert Version(get_version("sglang")) == Version("0.4.6.post4")
        print(" - SGLang imported successfully")

    def test_transformers(self, transformers_module):
        assert Version(get_version("transformers")) == Version("4.51.1")
        print(" - transformers imported successfully")

    def validate_critical_dependencies(self):
        """Validate critical dependencies that must be present."""
        print("\n=== Testing Critical Dependencies ===")

        # Core ML frameworks
        self.test_import("torch", required=True, test_func=self.test_torch_cuda)
        self.test_import("transformers", required=True, test_func=self.test_transformers)

        # Flash attention - critical for performance
        self.test_import("flash_attn", required=True, test_func=self.test_flash_attn_functionality)
        self.test_import("cugae", required=True)

        # Inference engines
        self.test_import("sglang", required=True, test_func=self.test_sglang_functionality)

        # Distributed computing
        self.test_import("ray", required=True)

        # Scientific computing
        self.test_import("numpy", required=True)
        self.test_import("scipy", required=True)

        # Configuration management
        self.test_import("hydra", required=True)
        self.test_import("omegaconf", required=True)

        # Data processing
        self.test_import("datasets", required=True)
        self.test_import("pandas", required=True)
        self.test_import("einops", required=True)

        # Monitoring and logging
        self.test_import("wandb", required=True)
        self.test_import("pynvml", required=True)

        # Networking
        self.test_import("aiohttp", required=True)
        self.test_import("fastapi", required=True)
        self.test_import("uvicorn", required=True)

        # Math libraries (for evaluation)
        self.test_import("sympy", required=True)
        self.test_import("latex2sympy2", required=True)

    def validate_optional_dependencies(self):
        """Validate optional dependencies."""
        print("\n=== Testing Optional Dependencies ===")

        # CUDA extensions (may not be available in all environments)
        self.test_import("vllm", required=False, test_func=self.test_vllm_functionality)
        self.test_import("grouped_gemm", required=False)
        self.test_import("flashattn_hopper", required=False)

        # Optional utilities
        self.test_import("tensorboardx", required=False)
        self.test_import("swanlab", required=False)
        self.test_import("matplotlib", required=False)
        self.test_import("seaborn", required=False)
        self.test_import("numba", required=False)
        self.test_import("nltk", required=False)

    def validate_cuda_extensions(self):
        """Validate CUDA-specific functionality."""
        print("\n=== Testing CUDA Extensions ===")

        try:
            import torch
            if torch.cuda.is_available():
                # Test basic CUDA tensor operations
                device = torch.device("cuda:0")
                x = torch.randn(10, device=device)
                y = torch.randn(10, device=device)
                z = x + y
                print("✓ Basic CUDA operations working")

                # Test flash attention if available
                try:
                    from flash_attn import flash_attn_func

                    # Create small test tensors
                    batch_size, seq_len, num_heads, head_dim = 1, 32, 4, 64
                    q = torch.randn(batch_size, seq_len, num_heads, head_dim,
                                    device=device, dtype=torch.float16)
                    k = torch.randn(batch_size, seq_len, num_heads, head_dim,
                                    device=device, dtype=torch.float16)
                    v = torch.randn(batch_size, seq_len, num_heads, head_dim,
                                    device=device, dtype=torch.float16)

                    # Test flash attention call
                    out = flash_attn_func(q, k, v)
                    print("✓ Flash attention CUDA operations working")

                except Exception as e:
                    print(f"⚠ Flash attention CUDA test failed: {e}")

            else:
                print("⚠ CUDA not available - skipping CUDA extension tests")

        except Exception as e:
            print(f"✗ CUDA extension validation failed: {e}")

    def run_validation(self):
        """Run complete validation suite."""
        print("AReaL Installation Validation")
        print("=" * 50)

        self.validate_critical_dependencies()
        self.validate_optional_dependencies()
        self.validate_cuda_extensions()

        # Print summary
        print("\n" + "=" * 50)
        print("VALIDATION SUMMARY")
        print("=" * 50)

        total_tests = len(self.results)
        successful_tests = sum(1 for r in self.results.values() if r["status"] == "SUCCESS")
        failed_tests = total_tests - successful_tests

        print(f"Total tests: {total_tests}")
        print(f"Successful: {successful_tests}")
        print(f"Failed: {failed_tests}")

        if self.critical_failures:
            print(f"\n🚨 CRITICAL FAILURES ({len(self.critical_failures)}):")
            for failure in self.critical_failures:
                print(f"  - {failure}")

        if self.warnings:
            print(f"\n⚠️  WARNINGS ({len(self.warnings)}):")
            for warning in self.warnings:
                print(f"  - {warning}")

        # Determine overall result
        if self.critical_failures:
            print("\n❌ INSTALLATION VALIDATION FAILED")
            print("Please check the critical failures above and ensure all required")
            print("dependencies are properly installed according to the installation guide.")
            return False
        else:
            print("\n✅ INSTALLATION VALIDATION PASSED")
            if self.warnings:
                print("Note: Some optional dependencies failed but this won't affect")
                print("core functionality.")
            return True


def main():
    """Main entry point."""
    validator = InstallationValidator()
    success = validator.run_validation()
    sys.exit(0 if success else 1)


if __name__ == "__main__":
    main()
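
Besides the full run invoked by the workflow, the validator class above can also be exercised piecemeal. A hypothetical reuse sketch (assuming examples/env/ is on sys.path so the module can be imported) checking only the packages this commit touches:

# Hypothetical sketch: focused check using the InstallationValidator defined above.
# Assumes examples/env/ is on sys.path so validate_installation imports cleanly.
from validate_installation import InstallationValidator

validator = InstallationValidator()
validator.test_import("torch", required=True)
validator.test_import("flash_attn", required=True)
print("critical failures:", validator.critical_failures)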