[Doc] Fix documentation for using Docker containers and customized agents (#64)

* test env setup

* .

* fix a missing cherry-pick

* .

* .

* .

* update docker instruction

* fix
Wei Fu 2025-06-01 16:33:29 +08:00 committed by GitHub
parent afe5a2c880
commit ce4d7354bf
9 changed files with 38 additions and 38 deletions


@@ -9,8 +9,10 @@ parts:
   - file: tutorial/installation
   - file: tutorial/quickstart
   - file: tutorial/eval
-  - file: tutorial/agent
   - file: tutorial/troubleshooting
+- caption: Customization
+  chapters:
+  - file: customization/agent
 - caption: Developer Manual
   chapters:
   - file: developer/exp_launch


@@ -8,11 +8,6 @@ Create a new file under `realhf/impl/agent/`, for example, `math_multi_turn_agent.py`
```python
class MathMultiTurnAgent(Agent):
-    def __init__(
-        self,
-        ... # Any required configurations here
-    ):
-        ...
    async def collect_trajectory(
        self,
@@ -20,47 +15,40 @@ class MathMultiTurnAgent(Agent):
        env: EnvironmentService,
        obs_queue: asyncio.Queue,
        act_queue: asyncio.Queue,
-    ) -> List[SequenceSample]:
+    ):
        ...
```
## Implement the `collect_trajectory` Logic
-The `collect_trajectory` function takes a task prompt, an environment, and two queues as input, then produces several trajectories for the RL trainer. Within this function, you can create arbitrary data processing logic to produce the input prompt for the inference engine and extract the action from the generated tokens.
+The `collect_trajectory` function takes a task prompt, an environment, and two queues as input, then produces several trajectories for the RL trainer. Within this function, you can create arbitrary data processing logic to produce the input for the inference engine (i.e., via `obs_queue`) and extract the action (i.e., via `act_queue`) from the generated tokens.
-In this example, the initial observation is the math problem itself, which is already included in the `prompt` parameter. We put the token IDs and generation config into `obs_queue` and wait for the action produced by the inference engine from `act_queue`. After the inference engine returns, we extract the generated answers and send them to the environment.
+In this example, the initial observation is the math problem itself. We put the token IDs and generation config into `obs_queue` and wait for the action produced by the inference engine from `act_queue`. After the inference engine returns, we extract the generated answers and send them to the environment.
```python
for turn in range(self.num_turns):
    await obs_queue.put((qid, token_ids, self.gconfig))
    act: BundledGenerationOutputs = await act_queue.get()
    ...
    _, success, *_ = await env.step((qid, answers))
    ...
```
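To make the loop above concrete, here is a hedged, self-contained sketch of how a multi-turn agent might drive the two queues. The `GenerationOutputs` stand-in, its `seqs` field, and the context-extension policy are illustrative assumptions, not the exact AReaL types used by `MathMultiTurnAgent`.

```python
from dataclasses import dataclass
from typing import List


@dataclass
class GenerationOutputs:
    """Stand-in for the engine's output bundle (assumed field names)."""
    seqs: List[List[int]]  # one generated token-id sequence per sample


class ToyMultiTurnAgent:
    def __init__(self, num_turns: int, gconfig=None, tokenizer=None):
        self.num_turns = num_turns
        self.gconfig = gconfig
        self.tokenizer = tokenizer

    async def collect_trajectory(self, qid, token_ids, env, obs_queue, act_queue):
        for turn in range(self.num_turns):
            # Observation: the current token context plus the generation config.
            await obs_queue.put((qid, token_ids, self.gconfig))
            # Action: whatever the inference engine generated for that observation.
            act: GenerationOutputs = await act_queue.get()
            answers = [self.tokenizer.decode(seq) for seq in act.seqs]
            _, success, *_ = await env.step((qid, answers))
            if all(success):
                break
            # Assumed policy: extend the context with the last generation before retrying.
            token_ids = token_ids + act.seqs[-1]
        # A real agent would now pack tokens and rewards into SequenceSample objects.
```

The key design point is that the agent only talks to the inference engine through `obs_queue` and `act_queue`, so many agent instances can run concurrently without blocking one another.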
The environment is similar to a [gym environment](https://github.com/Farama-Foundation/Gymnasium), which defines two methods: `reset` and `step`. However, to maintain efficiency, we use an asynchronous implementation to avoid mutual blocking across different environment instances.
-Although the environment can be quite complex (e.g., for a SWE-agent), the implementation in this example is straightforward. The math environment is single-step and essentially serves as a wrapper around the reward function:
+The math environment is stateless and essentially serves as a wrapper around the reward function:
```python
class MathCodeSingleStepEnv(EnvironmentService):
    async def reset(self, seed=None, options=None):
        return None, {}

    async def step(self, action: Tuple[str, List[str]]):
        qid, answers = action
        group_size = len(answers)
        qid = qid.split("@")[0]
        cur_task = self.id2info[qid]["task"]
        ...
        # Make `math_verify_call` async
        format_rewards = await asyncio.to_thread(
            math_verify_call,
            self.id2info,
            answers,
            [qid for _ in range(group_size)],
            ...
        )
        return None, format_rewards, True, False, {}
```
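For orientation, the reset/step protocol above can be exercised directly, much like a gym loop. In the sketch below, the constructor argument and the `"qid@sample"`/answer format are assumptions for illustration only.

```python
import asyncio

async def main():
    # Hypothetical setup: the real environment is constructed by the experiment config.
    env = MathCodeSingleStepEnv(id2info=my_id2info)  # assumed constructor argument
    await env.reset()
    # One step: a query id plus a group of sampled answers; rewards come back per answer.
    _, rewards, terminated, truncated, _ = await env.step(("q0@0", ["\\boxed{42}"]))
    print(rewards, terminated)  # e.g. [1.0], True

asyncio.run(main())
```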
@@ -86,7 +74,7 @@ for turn in range(self.num_turns):
## Modify the Configuration
-Finally, arrange all the data in the proper format and return it. You're now close to running the end-to-end RL loop. The final step is to register and import your implementation, then modify the experiment configuration.
+You're now close to running the end-to-end RL loop. The final step is to register and import your implementation, then modify the experiment configuration.
```python
# in realhf/impl/agent/math_multi_turn_agent.py
@@ -98,9 +86,14 @@ register_agent("math-multi-turn", MathMultiTurnAgent)
import realhf.impl.agent.math_multi_turn_agent
```
In `realhf/experiments/async_exp/async_math_ppo.py`:
```diff
@dataclasses.dataclass
class AsyncPPOMATHConfig(AsyncRLExperimentConfig, PPOMATHConfig):
+ # New CLI arguments are defined here
+ my_param: float = 1.0
# in realhf/experiments/async_exp/async_ppo_math_exp.py
@property
def agent(self) -> AgentAbstraction:
@@ -108,14 +101,9 @@ class AsyncPPOMATHConfig(AsyncRLExperimentConfig, PPOMATHConfig):
- "math-single-step",
+ "math-multi-turn", # Your registered name
args=dict(
- gconfig=self.generation_config,
- tokenizer_path=self.actor.path,
- success_rate_lb=self.success_rate_lb,
- success_rate_ub=self.success_rate_ub,
- reward_scaling=self.ppo.reward_output_scaling,
- reward_bias=self.ppo.reward_output_bias,
+ # Any configurations for your agent
+ ...
- ...
+ # Any configurations for your __init__ method
+ my_param=my_param,
),
)
@@ -125,7 +113,6 @@ class AsyncPPOMATHConfig(AsyncRLExperimentConfig, PPOMATHConfig):
- "math-code-single-step", args=dict(dataset_path=self.dataset.path)
- )
+ # Change to your customized environment if necessary
+ # The same registration and importing mechanism as Agents
+ return EnvServiceAbstraction(
+ "my-env", args=dict(...)
+ )
@@ -133,10 +120,10 @@ class AsyncPPOMATHConfig(AsyncRLExperimentConfig, PPOMATHConfig):
## Run Training
-Please follow the guide in [training.md](training.md). Generally, create a YAML configuration called `math-multi-turn.yaml` under `training/configs/async-ppo` and run:
+Please follow the guide in [quickstart](../tutorial/quickstart.md). Generally, start your experiments by running:
```bash
-python3 training/main_async_ppo.py --config-name=math-multi-turn
+python3 training/main_async_ppo.py my_param=5.0 # and any additional CLI arguments
```
Happy coding!


@@ -43,6 +43,9 @@ docker run -it --name areal-node1 \
--shm-size 700g -v /path/to/mount:/path/to/mount \
ghcr.io/inclusionai/areal-runtime:v0.3.0 \
/bin/bash
+git clone https://github.com/inclusionAI/AReaL
+cd AReaL
+bash examples/env/scripts/setup-container-deps.sh
```
### Option 2: Custom Environment Installation


@@ -1,6 +1,6 @@
# Quickstart
-This guide walks through a simple example of training an LLM to solve math problems.
+This guide walks through a simple example of training an LLM to solve math problems. Please make sure you have properly [installed dependencies and set up the runtime environment](installation.md).
## Dataset


@@ -0,0 +1,8 @@
+#!/bin/sh
+AREAL_PATH=$PWD
+cd /sglang
+git apply $AREAL_PATH/patch/sglang/v0.4.6.post4.patch
+cd $AREAL_PATH
+# Install AReaL
+pip install -e .


@@ -183,7 +183,7 @@ class GserverManager(AsyncWorker):
        success = res["success"]
        if success:
            if "num_paused_requests" in res:
-                logger.debug(
+                logger.info(
                    f"{res['num_paused_requests']} requests are interrupted "
                    f"during updating weights for server {server_index}: {server_url}"
                )


@@ -65,7 +65,7 @@ if __name__ == "__main__":
    parser = argparse.ArgumentParser(add_help=False)
    parser.add_argument("--help", action="store_true")
-    args = parser.parse_args()
+    args = parser.parse_known_args()[0]
    if args.help:
        from realhf.api.cli_args import print_config_help
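For context on this change, here is a small self-contained illustration of the difference (the `my_param=5.0` override is the hypothetical CLI argument from the agent documentation above): `parse_args` aborts on arguments it does not recognize, whereas `parse_known_args` keeps them so the launcher can still forward them to the experiment configuration.

```python
import argparse

parser = argparse.ArgumentParser(add_help=False)
parser.add_argument("--help", action="store_true")

# parser.parse_args(["--help", "my_param=5.0"]) would exit with
# "error: unrecognized arguments: my_param=5.0".
args, extras = parser.parse_known_args(["--help", "my_param=5.0"])
print(args.help)  # True
print(extras)     # ['my_param=5.0']
```

The same one-line change is applied to the other launcher entry points in this commit.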


@@ -65,7 +65,7 @@ if __name__ == "__main__":
    parser = argparse.ArgumentParser(add_help=False)
    parser.add_argument("--help", action="store_true")
-    args = parser.parse_args()
+    args = parser.parse_known_args()[0]
    if args.help:
        from realhf.api.cli_args import print_config_help


@@ -65,7 +65,7 @@ if __name__ == "__main__":
    parser = argparse.ArgumentParser(add_help=False)
    parser.add_argument("--help", action="store_true")
-    args = parser.parse_args()
+    args = parser.parse_known_args()[0]
    if args.help:
        from realhf.api.cli_args import print_config_help