Add CI for testing AReaLite (#150)

* ci: add test-arealite

* ci: add checkout before running test-arealite

* ci: add USERNAME

* ci: add test script

* ci: add GitHub mirror

* ci: fix typo

* ci: clone one commit

* ci: fix condition

* ci: set command timeout to 60m

* ci: enable pip cache

* ci: optimize container lifecycle

* ci: split into many stages

* ci(test-arealite): fix typo

* ci: fix wrong env

* ci: fix pytest

* ci: uninstall transformer-engine

* ci: uninstall transformer-engine

* ci: fix model paths

* ci: show stdout/stderr

* ci: fix not clean up

* ci: backup sglang

* ci: remove tmp repo dir when run

* ci: fix docker run exit 1 condition

* ci(test-arealite): limit the concurrency and extend command timeout
This commit is contained in:
Zijian Zhang 2025-07-07 09:36:12 +08:00 committed by GitHub
parent 89a8d8c46a
commit 078d3e1a44
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
10 changed files with 142 additions and 6 deletions

50
.github/workflows/test-arealite.yml vendored Normal file
View File

@ -0,0 +1,50 @@
# CI workflow for AReaLite.
#
# The GitHub-hosted runner does no testing itself: each step after checkout
# opens an SSH session to a remote CI node and runs one of the scripts under
# ci/ there (clone -> build environment image -> run tests).
name: Test AReaLite

on:
  push:
    paths:
      - .github/workflows/test-arealite.yml
      - arealite/**
      - ci/**
  workflow_dispatch:

jobs:
  test-arealite:
    runs-on: ubuntu-latest
    # Every run joins the same concurrency group, so runs of this job do not
    # execute on the shared CI node at the same time.
    concurrency:
      group: test-arealite
    steps:
      # The ci/*.sh files referenced by `script_path` below are taken from
      # this checkout (presumably read on the runner and sent over SSH --
      # confirm against the ssh-action documentation).
      - uses: actions/checkout@v4

      # 1. Fetch the commit under test into a per-commit directory on the node.
      - uses: appleboy/ssh-action@v1
        env:
          GIT_REPO_URL: https://github.bibk.top/${{ github.repository }}
          GIT_COMMIT_SHA: ${{ github.sha }}
        with:
          host: ${{ secrets.CI_NODE_ADDR }}
          username: ${{ secrets.CI_NODE_USER }}
          key: ${{ secrets.REMOTE_SSH_KEY }}
          envs: GIT_REPO_URL,GIT_COMMIT_SHA
          script_path: ci/clone_repo.sh

      # 2. Build the `areal-env` Docker image (the script skips the build
      #    when the image already exists). Allowed up to two hours.
      - uses: appleboy/ssh-action@v1
        env:
          GIT_COMMIT_SHA: ${{ github.sha }}
        with:
          host: ${{ secrets.CI_NODE_ADDR }}
          username: ${{ secrets.CI_NODE_USER }}
          key: ${{ secrets.REMOTE_SSH_KEY }}
          command_timeout: 2h
          envs: GIT_COMMIT_SHA
          script_path: ci/build_env_image.sh

      # 3. Run the AReaLite test suite inside that image. Allowed up to
      #    one hour.
      - uses: appleboy/ssh-action@v1
        env:
          GIT_COMMIT_SHA: ${{ github.sha }}
        with:
          host: ${{ secrets.CI_NODE_ADDR }}
          username: ${{ secrets.CI_NODE_USER }}
          key: ${{ secrets.REMOTE_SSH_KEY }}
          command_timeout: 1h
          envs: GIT_COMMIT_SHA
          script_path: ci/test_arealite.sh

View File

@ -23,7 +23,7 @@ from arealite.utils import compute_varlen_position_indices
from realhf.impl.model.utils.padding import unpad_input
VOCAB_SIZE = 100
MODEL_PATH = "/storage/openpsi/models/Qwen__Qwen3-1.7B/"
MODEL_PATH = "Qwen/Qwen2-0.5B"
@pytest.fixture(scope="module")

View File

@ -22,7 +22,7 @@ from realhf.base import constants, name_resolve, seeding
EXPR_NAME = "test_grpo"
TRIAL_NAME = "test_grpo"
MODEL_PATH = "/storage/openpsi/models/Qwen__Qwen3-1.7B/"
MODEL_PATH = "Qwen/Qwen2-0.5B"
@pytest.fixture(scope="module")

View File

@ -27,7 +27,7 @@ from realhf.base import name_resolve, seeding
EXPR_NAME = "test_rollout"
TRIAL_NAME = "test_rollout"
MODEL_PATH = "/storage/openpsi/models/Qwen__Qwen3-1.7B/"
MODEL_PATH = "Qwen/Qwen2-0.5B"
@pytest.fixture(scope="module")

View File

@ -22,7 +22,7 @@ from realhf.base import name_resolve, names, seeding
EXPR_NAME = "test_rollout_controller"
TRIAL_NAME = "test_rollout_controller"
MODEL_PATH = "/storage/openpsi/models/Qwen__Qwen3-1.7B/"
MODEL_PATH = "Qwen/Qwen2-0.5B"
@pytest.fixture(scope="module")

View File

@ -60,7 +60,7 @@ def test_sft():
)
engine_config = EngineConfig(
path="/storage/openpsi/models/Qwen__Qwen3-1.7B/",
path="Qwen/Qwen2-0.5B",
gradient_checkpointing=False,
optimizer=OptimizerConfig(),
backend=EngineBackendConfig(type="hf"),

View File

@ -23,7 +23,7 @@ from realhf.base import name_resolve, seeding
EXPR_NAME = "test_sglang_client"
TRIAL_NAME = "test_sglang_client"
MODEL_PATH = "/storage/openpsi/models/Qwen__Qwen3-1.7B/"
MODEL_PATH = "Qwen/Qwen2-0.5B"
@pytest.fixture(scope="module")

39
ci/build_env_image.sh Normal file
View File

@ -0,0 +1,39 @@
#!/usr/bin/env bash
# Build the reusable `areal-env:latest` Docker image on the CI node.
#
# Requires:
#   GIT_COMMIT_SHA  commit under test; the checkout is expected at
#                   /tmp/areal-$GIT_COMMIT_SHA (created by ci/clone_repo.sh).
#
# The image is produced by installing dependencies inside a container started
# from the NVIDIA PyTorch base image and committing that container. If the
# image already exists the script exits 0 without doing anything.
set -e

GIT_COMMIT_SHA=${GIT_COMMIT_SHA:?"GIT_COMMIT_SHA is not set"}

echo "GIT_COMMIT_SHA: $GIT_COMMIT_SHA"

# If there is already an image named areal-env, skip.
# `docker image inspect` matches the exact reference, unlike a substring grep
# over `docker images`, which would also accept e.g. `my-areal-env:latest`.
if docker image inspect areal-env:latest >/dev/null 2>&1; then
    echo "Image areal-env already exists, skipping build."
    exit 0
fi

RUN_ID="areal-$GIT_COMMIT_SHA"
cd "/tmp/$RUN_ID"

# Remove the build container if it exists; a missing container is not an error.
cleanup() {
    docker rm -f "$RUN_ID" >/dev/null 2>&1 || true
}

# Clear any leftover container from an earlier run of the same commit, and
# make sure the container is removed on every exit path (including a failed
# `docker commit`).
cleanup
trap cleanup EXIT

# `set -e` inside the container is essential: without it only the exit status
# of the final `mv` is reported, so a failed dependency install would still be
# committed as areal-env:latest and then reused by every later run.
# `pip config unset` is kept best-effort because it fails when the key is not
# configured.
# NOTE(review): ./sglang is presumably created by setup-pip-deps.sh; it is
# moved out of the bind-mounted workspace so it becomes part of the committed
# image -- confirm.
docker run \
    --name "$RUN_ID" \
    --gpus all \
    --shm-size=8g \
    -v "$(pwd)":/workspace \
    -w /workspace \
    nvcr.io/nvidia/pytorch:25.01-py3 \
    bash -c "
        set -e
        python -m pip install --upgrade pip
        pip config set global.index-url https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple
        pip config unset global.extra-index-url || true
        bash examples/env/scripts/setup-pip-deps.sh
        pip uninstall -y transformer-engine
        mv ./sglang /sglang
    " || exit 1

docker commit "$RUN_ID" areal-env:latest

19
ci/clone_repo.sh Normal file
View File

@ -0,0 +1,19 @@
#!/usr/bin/env bash
# Create a fresh, shallow checkout of a single commit on the CI node.
#
# Requires:
#   GIT_REPO_URL    URL of the repository to fetch from.
#   GIT_COMMIT_SHA  commit to check out.
#
# The checkout lands in /tmp/areal-$GIT_COMMIT_SHA; any existing directory at
# that path is deleted first.
set -e

# Abort with a message when a required variable is unset or empty.
: "${GIT_REPO_URL:?GIT_REPO_URL is not set}"
: "${GIT_COMMIT_SHA:?GIT_COMMIT_SHA is not set}"

echo "GIT_REPO_URL: $GIT_REPO_URL"
echo "GIT_COMMIT_SHA: $GIT_COMMIT_SHA"

RUN_ID="areal-$GIT_COMMIT_SHA"
WORK_DIR="/tmp/$RUN_ID"

# Start from an empty directory.
rm -rf "$WORK_DIR"
mkdir -p "$WORK_DIR"
cd "$WORK_DIR"

# Fetch only the requested commit (depth 1) instead of cloning full history.
git init
git remote add origin "$GIT_REPO_URL"
git fetch --depth 1 origin "$GIT_COMMIT_SHA"
git checkout FETCH_HEAD

28
ci/test_arealite.sh Normal file
View File

@ -0,0 +1,28 @@
#!/usr/bin/env bash
# Run the AReaLite test suite inside the `areal-env:latest` image.
#
# Requires:
#   GIT_COMMIT_SHA  commit under test; the checkout is expected at
#                   /tmp/areal-$GIT_COMMIT_SHA (created by ci/clone_repo.sh).
#
# Exits non-zero when the container command (and therefore pytest) fails.
set -e

GIT_COMMIT_SHA=${GIT_COMMIT_SHA:?"GIT_COMMIT_SHA is not set"}

echo "GIT_COMMIT_SHA: $GIT_COMMIT_SHA"

RUN_ID="areal-$GIT_COMMIT_SHA"
cd "/tmp/$RUN_ID"

# Remove the test container if it exists; a missing container is not an error.
cleanup() {
    docker rm -f "$RUN_ID" >/dev/null 2>&1 || true
}

# Clear any leftover container with this exact name (a substring grep over
# `docker ps` could match unrelated containers), and guarantee removal on
# every exit path.
cleanup
trap cleanup EXIT

# NOTE(review): the checkout in /tmp/$RUN_ID is left on disk after the run;
# only a later clone of the same commit removes it. Consider pruning old
# directories on the CI node.

# `set -e` inside the container makes a failed restore of /sglang abort the
# run immediately instead of surfacing later as unrelated test failures.
docker run \
    --name "$RUN_ID" \
    --gpus all \
    --shm-size=8g \
    -v "$(pwd)":/workspace \
    -w /workspace \
    areal-env:latest \
    bash -c "
        set -e
        mv /sglang ./sglang
        HF_ENDPOINT=https://hf-mirror.com python -m pytest -s arealite/
    " || exit 1