Merge branch 'main' of https://code.alipay.com/inclusionAI/AReaL into fw/v0.2.0-readme

This commit is contained in:
博惟 2025-03-30 20:23:52 +08:00
commit bbf61d72c6
34 changed files with 869 additions and 1348 deletions

View File

@ -16,10 +16,31 @@ pip install prettytable timeout_decorator
Run evaluation:
```bash
python eval_and_aggregate.py \
--model_path {MODEL_PATH} \
--output_path {OUTPUT_PATH} \
--data_names math_500,aime24,amc23 \
--model_path ${MODEL_PATH} \
--output_path ${OUTPUT_PATH} \
--data_names aime24 \
--max_gen_tokens 32768 # maximum number of tokens to generate; defaults to 32768
```
The results are saved in `${OUTPUT_PATH}/math_eval_32768`.
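For reference, the placeholders above can be set as shell variables before running the commands in this section; the values below are examples only:
```bash
# Example values only; point these at your own checkpoint and output directory
export MODEL_PATH=/path/to/AReaL-boba-RL-7B
export OUTPUT_PATH=/path/to/eval_outputs
```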
Evaluate AReaL-boba-RL-7B:
```bash
python eval_and_aggregate.py \
--model_path ${MODEL_PATH} \
--output_path ${OUTPUT_PATH} \
--data_names aime24,aime25 \
--prompt_type AReaL-boba \
--temperature 1.0
```
Evaluate AReaL-boba-SFT-32B:
```bash
python eval_and_aggregate.py \
--model_path ${MODEL_PATH} \
--output_path ${OUTPUT_PATH} \
--data_names aime24,aime25 \
--prompt_type AReaL-boba-SFT \
--samples_per_node 2 --num_sample_nodes 16 \
--temperature 0.6
```

View File

@ -0,0 +1,30 @@
{"problem": "Find the sum of all integer bases $b>9$ for which $17_b$ is a divisor of $97_b.$", "answer": 70, "id": "0"}
{"problem": "In $\\triangle ABC$ points $D$ and $E$ lie on $\\overline{AB}$ so that $AD < AE < AB$, while points $F$ and $G$ lie on $\\overline{AC}$ so that $AF < AG < AC$. Suppose $AD = 4$, $DE = 16$, $EB = 8$, $AF = 13$, $FG = 52$, and $GC = 26$. Let $M$ be the reflection of $D$ through $F$, and let $N$ be the reflection of $G$ through $E$. The area of quadrilateral $DEGF$ is $288$. Find the area of heptagon $AFNBCEM$.", "answer": 588, "id": "1"}
{"problem": "The $9$ members of a baseball team went to an ice-cream parlor after their game. Each player had a single scoop cone of chocolate, vanilla, or strawberry ice cream. At least one player chose each flavor, and the number of players who chose chocolate was greater than the number of players who chose vanilla, which was greater than the number of players who chose strawberry. Let $N$ be the number of different assignments of flavors to players that meet these conditions. Find the remainder when $N$ is divided by $1000.$", "answer": 16, "id": "2"}
{"problem": "Find the number of ordered pairs $(x,y)$, where both $x$ and $y$ are integers between $-100$ and $100$ inclusive, such that $12x^2-xy-6y^2=0$.", "answer": 117, "id": "3"}
{"problem": "There are $8!= 40320$ eight-digit positive integers that use each of the digits $1, 2, 3, 4, 5, 6, 7, 8$ exactly once. Let $N$ be the number of these integers that are divisible by $22$. Find the difference between $N$ and $2025$.$", "answer": 279, "id": "4"}
{"problem": "An isosceles trapezoid has an inscribed circle tangent to each of its four sides. The radius of the circle is $3$, and the area of the trapezoid is $72$. Let the parallel sides of the trapezoid have lengths $r$ and $s$, with $r \\neq s$. Find $r^2+s^2$", "answer": 504, "id": "5"}
{"problem": "The twelve letters $A$,$B$,$C$,$D$,$E$,$F$,$G$,$H$,$I$,$J$,$K$, and $L$ are randomly grouped into six pairs of letters. The two letters in each pair are placed next to each other in alphabetical order to form six two-letter words, and then those six words are listed alphabetically. For example, a possible result is $AB$, $CJ$, $DG$, $EK$, $FL$, $HI$. The probability that the last word listed contains $G$ is $\\frac mn$, where $m$ and $n$ are relatively prime positive integers. Find $m+n$.", "answer": 821, "id": "6"}
{"problem": "Let $k$ be a real number such that the system \\begin{align*} &|25 + 20i - z| = 5 \\ &|z - 4 - k| = |z - 3i - k| \\end{align*} has exactly one complex solution $z$. The sum of all possible values of $k$ can be written as $\\frac{m}{n}$, where $m$ and $n$ are relatively prime positive integers. Find $m + n$. Here $i = \\sqrt{-1}$.$", "answer": 77, "id": "7"}
{"problem": "The parabola with equation $y = x^2 - 4$ is rotated $60^\\circ$ counterclockwise around the origin. The unique point in the fourth quadrant where the original parabola and its image intersect has $y$-coordinate $\\frac{a - \\sqrt{b}}{c}$, where $a$, $b$, and $c$ are positive integers, and $a$ and $c$ are relatively prime. Find $a + b + c$.", "answer": 62, "id": "8"}
{"problem": "The $27$ cells of a $3 \\times 9$ grid are filled in using the numbers $1$ through $9$ so that each row contains $9$ different numbers, and each of the three $3 \\times 3$ blocks heavily outlined in the example below contains $9$ different numbers, as in the first three rows of a Sudoku puzzle. [asy] unitsize(20); add(grid(9,3)); draw((0,0)--(9,0)--(9,3)--(0,3)--cycle, linewidth(2)); draw((3,0)--(3,3), linewidth(2)); draw((6,0)--(6,3), linewidth(2)); real a = 0.5; label(\"5\",(a,a)); label(\"6\",(1+a,a)); label(\"1\",(2+a,a)); label(\"8\",(3+a,a)); label(\"4\",(4+a,a)); label(\"7\",(5+a,a)); label(\"9\",(6+a,a)); label(\"2\",(7+a,a)); label(\"3\",(8+a,a)); label(\"3\",(a,1+a)); label(\"7\",(1+a,1+a)); label(\"9\",(2+a,1+a)); label(\"5\",(3+a,1+a)); label(\"2\",(4+a,1+a)); label(\"1\",(5+a,1+a)); label(\"6\",(6+a,1+a)); label(\"8\",(7+a,1+a)); label(\"4\",(8+a,1+a)); label(\"4\",(a,2+a)); label(\"2\",(1+a,2+a)); label(\"8\",(2+a,2+a)); label(\"9\",(3+a,2+a)); label(\"6\",(4+a,2+a)); label(\"3\",(5+a,2+a)); label(\"1\",(6+a,2+a)); label(\"7\",(7+a,2+a)); label(\"5\",(8+a,2+a)); [/asy] The number of different ways to fill such a grid can be written as $p^a \\cdot q^b \\cdot r^c \\cdot s^d$ where $p$, $q$, $r$, and $s$ are distinct prime numbers and $a$, $b$, $c$, $d$ are positive integers. Find $p \\cdot a + q \\cdot b + r \\cdot c + s \\cdot d$.", "answer": 81, "id": "9"}
{"problem": "A piecewise linear function is defined by\\[f(x) = \\begin{cases} x & \\operatorname{if} ~ -1 \\leq x < 1 \\ 2 - x & \\operatorname{if} ~ 1 \\leq x < 3\\end{cases}\\]and $f(x + 4) = f(x)$ for all real numbers $x$. The graph of $f(x)$ has the sawtooth pattern depicted below. The parabola $x = 34y^{2}$ intersects the graph of $f(x)$ at finitely many points. The sum of the $y$-coordinates of all these intersection points can be expressed in the form $\\tfrac{a + b\\sqrt{c}}{d}$, where $a$, $b$, $c$, and $d$ are positive integers such that $a$, $b$, $d$ have greatest common divisor equal to $1$, and $c$ is not divisible by the square of any prime. Find $a + b + c + d$. Graph [asy] import graph; size(300); Label f; f.p=fontsize(6); yaxis(-2,2,Ticks(f, 2.0)); xaxis(-6.5,6.5,Ticks(f, 2.0)); draw((0, 0)..(1/4,sqrt(1/136))..(1/2,sqrt(1/68))..(0.75,sqrt(0.75/34))..(1, sqrt(1/34))..(2, sqrt(2/34))..(3, sqrt(3/34))..(4, sqrt(4/34))..(5, sqrt(5/34))..(6, sqrt(6/34))..(7, sqrt(7/34))..(8, sqrt(8/34)), red); draw((0, 0)..(1/4,-sqrt(1/136))..(0.5,-sqrt(1/68))..(0.75,-sqrt(0.75/34))..(1, -sqrt(1/34))..(2, -sqrt(2/34))..(3, -sqrt(3/34))..(4, -sqrt(4/34))..(5, -sqrt(5/34))..(6, -sqrt(6/34))..(7, -sqrt(7/34))..(8, -sqrt(8/34)), red); draw((-7,0)--(7,0), black+0.8bp); draw((0,-2.2)--(0,2.2), black+0.8bp); draw((-6,-0.1)--(-6,0.1), black); draw((-4,-0.1)--(-4,0.1), black); draw((-2,-0.1)--(-2,0.1), black); draw((0,-0.1)--(0,0.1), black); draw((2,-0.1)--(2,0.1), black); draw((4,-0.1)--(4,0.1), black); draw((6,-0.1)--(6,0.1), black); draw((-7,1)..(-5,-1), blue); draw((-5,-1)--(-3,1), blue); draw((-3,1)--(-1,-1), blue); draw((-1,-1)--(1,1), blue); draw((1,1)--(3,-1), blue); draw((3,-1)--(5,1), blue); draw((5,1)--(7,-1), blue); [/asy]", "answer": 259, "id": "10"}
{"problem": "The set of points in $3$-dimensional coordinate space that lie in the plane $x+y+z=75$ whose coordinates satisfy the inequalities\\[x-yz<y-zx<z-xy\\]forms three disjoint convex regions. Exactly one of those regions has finite area. The area of this finite region can be expressed in the form $a\\sqrt{b},$ where $a$ and $b$ are positive integers and $b$ is not divisible by the square of any prime. Find $a+b.$", "answer": 510, "id": "11"}
{"problem": "Alex divides a disk into four quadrants with two perpendicular diameters intersecting at the center of the disk. He draws $25$ more lines segments through the disk, drawing each segment by selecting two points at random on the perimeter of the disk in different quadrants and connecting these two points. Find the expected number of regions into which these $27$ line segments divide the disk.", "answer": 204, "id": "12"}
{"problem": "Let $ABCDE$ be a convex pentagon with $AB=14,$ $BC=7,$ $CD=24,$ $DE=13,$ $EA=26,$ and $\\angle B=\\angle E=60^{\\circ}.$ For each point $X$ in the plane, define $f(X)=AX+BX+CX+DX+EX.$ The least possible value of $f(X)$ can be expressed as $m+n\\sqrt{p},$ where $m$ and $n$ are positive integers and $p$ is not divisible by the square of any prime. Find $m+n+p.$", "answer": 60, "id": "13"}
{"problem": "Let $N$ denote the number of ordered triples of positive integers $(a, b, c)$ such that $a, b, c \\leq 3^6$ and $a^3 + b^3 + c^3$ is a multiple of $3^7$. Find the remainder when $N$ is divided by $1000$.", "answer": 735, "id": "14"}
{"problem": "Six points $A, B, C, D, E,$ and $F$ lie in a straight line in that order. Suppose that $G$ is a point not on the line and that $AC=26, BD=22, CE=31, DF=33, AF=73, CG=40,$ and $DG=30.$ Find the area of $\\triangle BGE.$", "answer": 468, "id": "15"}
{"problem": "Find the sum of all positive integers $n$ such that $n + 2$ divides the product $3(n + 3)(n^2 + 9)$.", "answer": 49, "id": "16"}
{"problem": "Four unit squares form a $2 \\times 2$ grid. Each of the $12$ unit line segments forming the sides of the squares is colored either red or blue in such a say that each unit square has $2$ red sides and $2$ blue sides. One example is shown below (red is solid, blue is dashed). Find the number of such colorings. [asy] size(4cm); defaultpen(linewidth(1.2)); draw((0, 0) -- (2, 0) -- (2, 1)); draw((0, 1) -- (1, 1) -- (1, 2) -- (2,2)); draw((0, 0) -- (0, 1), dotted); draw((1, 0) -- (1, 1) -- (2, 1) -- (2, 2), dotted); draw((0, 1) -- (0, 2) -- (1, 2), dotted); [/asy]", "answer": 82, "id": "17"}
{"problem": "The product\\[\\prod^{63}_{k=4} \frac{\\log_k (5^{k^2 - 1})}{\\log_{k + 1} (5^{k^2 - 4})} = \frac{\\log_4 (5^{15})}{\\log_5 (5^{12})} \\cdot \frac{\\log_5 (5^{24})}{\\log_6 (5^{21})}\\cdot \frac{\\log_6 (5^{35})}{\\log_7 (5^{32})} \\cdots \frac{\\log_{63} (5^{3968})}{\\log_{64} (5^{3965})}\\]is equal to $\\tfrac mn,$ where $m$ and $n$ are relatively prime positive integers. Find $m + n.$", "answer": 106, "id": "18"}
{"problem": "Suppose $\\triangle ABC$ has angles $\\angle BAC = 84^\\circ, \\angle ABC=60^\\circ,$ and $\\angle ACB = 36^\\circ.$ Let $D, E,$ and $F$ be the midpoints of sides $\\overline{BC}, \\overline{AC},$ and $\\overline{AB},$ respectively. The circumcircle of $\triangle DEF$ intersects $\\overline{BD}, \\overline{AE},$ and $\\overline{AF}$ at points $G, H,$ and $J,$ respectively. The points $G, D, E, H, J,$ and $F$ divide the circumcircle of $\\triangle DEF$ into six minor arcs, as shown. Find $\\overarc{DE}+2\\cdot \\overarc{HJ} + 3\\cdot \\overarc{FG},$ where the arcs are measured in degrees.[asy] import olympiad; size(6cm); defaultpen(fontsize(10pt)); pair B = (0, 0), A = (Cos(60), Sin(60)), C = (Cos(60)+Sin(60)/Tan(36), 0), D = midpoint(B--C), E = midpoint(A--C), F = midpoint(A--B); guide circ = circumcircle(D, E, F); pair G = intersectionpoint(B--D, circ), J = intersectionpoints(A--F, circ)[0], H = intersectionpoints(A--E, circ)[0]; draw(B--A--C--cycle); draw(D--E--F--cycle); draw(circ); dot(A);dot(B);dot(C);dot(D);dot(E);dot(F);dot(G);dot(H);dot(J); label(\"$A$\", A, (0, .8)); label(\"$B$\", B, (-.8, -.8)); label(\"$C$\", C, (.8, -.8)); label(\"$D$\", D, (0, -.8)); label(\"$E$\", E, (.8, .2)); label(\"$F$\", F, (-.8, .2)); label(\"$G$\", G, (0, .8)); label(\"$H$\", H, (-.2, -1));label(\"$J$\", J, (.2, -.8)); [/asy]", "answer": 336, "id": "19"}
{"problem": "Circle $\\omega_1$ with radius $6$ centered at point $A$ is internally tangent at point $B$ to circle $\\omega_2$ with radius $15$. Points $C$ and $D$ lie on $\\omega_2$ such that $\\overline{BC}$ is a diameter of $\\omega_2$ and ${\\overline{BC} \\perp \\overline{AD}}$. The rectangle $EFGH$ is inscribed in $\\omega_1$ such that $\\overline{EF} \\perp \\overline{BC}$, $C$ is closer to $\\overline{GH}$ than to $\\overline{EF}$, and $D$ is closer to $\\overline{FG}$ than to $\\overline{EH}$, as shown. Triangles $\\triangle {DGF}$ and $\\triangle {CHG}$ have equal areas. The area of rectangle $EFGH$ is $\\frac{m}{n}$, where $m$ and $n$ are relatively prime positive integers. Find $m+n$. [asy] size(5cm); defaultpen(fontsize(10pt)); pair A = (9, 0), B = (15, 0), C = (-15, 0), D = (9, 12), E = (9+12/sqrt(5), -6/sqrt(5)), F = (9+12/sqrt(5), 6/sqrt(5)), G = (9-12/sqrt(5), 6/sqrt(5)), H = (9-12/sqrt(5), -6/sqrt(5)); filldraw(G--H--C--cycle, lightgray); filldraw(D--G--F--cycle, lightgray); draw(B--C); draw(A--D); draw(E--F--G--H--cycle); draw(circle((0,0), 15)); draw(circle(A, 6)); dot(A); dot(B); dot(C); dot(D);dot(E); dot(F); dot(G); dot(H); label(\"$A$\", A, (.8, -.8)); label(\"$B$\", B, (.8, 0)); label(\"$C$\", C, (-.8, 0)); label(\"$D$\", D, (.4, .8)); label(\"$E$\", E, (.8, -.8)); label(\"$F$\", F, (.8, .8)); label(\"$G$\", G, (-.8, .8)); label(\"$H$\", H, (-.8, -.8)); label(\"$\\omega_1$\", (9, -5)); label(\"$\\omega_2$\", (-1, -13.5)); [/asy]", "answer": 293, "id": "20"}
{"problem": "Let $A$ be the set of positive integer divisors of $2025$. Let $B$ be a randomly selected subset of $A$. The probability that $B$ is a nonempty set with the property that the least common multiple of its element is $2025$ is $\\frac{m}{n}$, where $m$ and $n$ are relatively prime positive integers. Find $m+n$.", "answer": 237, "id": "21"}
{"problem": "From an unlimited supply of 1-cent coins, 10-cent coins, and 25-cent coins, Silas wants to find a collection of coins that has a total value of $N$ cents, where $N$ is a positive integer. He uses the so-called greedy algorithm, successively choosing the coin of greatest value that does not cause the value of his collection to exceed $N.$ For example, to get 42 cents, Silas will choose a 25-cent coin, then a 10-cent coin, then 7 1-cent coins. However, this collection of 9 coins uses more coins than necessary to get a total of 42 cents; indeed, choosing 4 10-cent coins and 2 1-cent coins achieves the same total value with only 6 coins. In general, the greedy algorithm succeeds for a given $N$ if no other collection of 1-cent, 10-cent, and 25-cent coins gives a total value of $N$ cents using strictly fewer coins than the collection given by the greedy algorithm. Find the number of values of $N$ between $1$ and $1000$ inclusive for which the greedy algorithm succeeds.", "answer": 610, "id": "22"}
{"problem": "There are $n$ values of $x$ in the interval $0<x<2\\pi$ where $f(x)=\\sin(7\\pi\\cdot\\sin(5x))=0$. For $t$ of these $n$ values of $x$, the graph of $y=f(x)$ is tangent to the $x$-axis. Find $n+t$.", "answer": 149, "id": "23"}
{"problem": "Sixteen chairs are arranged in a row. Eight people each select a chair in which to sit so that no person sits next to two other people. Let $N$ be the number of subsets of $16$ chairs that could be selected. Find the remainder when $N$ is divided by $1000$.", "answer": 907, "id": "24"}
{"problem": "Let $S$ be the set of vertices of a regular $24$-gon. Find the number of ways to draw $12$ segments of equal lengths so that each vertex in $S$ is an endpoint of exactly one of the $12$ segments.", "answer": 113, "id": "25"}
{"problem": "Let $A_1A_2\\dots A_{11}$ be a non-convex $11$-gon such that The area of $A_iA_1A_{i+1}$ is $1$ for each $2 \\le i \\le 10$, $\\cos(\\angle A_iA_1A_{i+1})=\\frac{12}{13}$ for each $2 \\le i \\le 10$, The perimeter of $A_1A_2\\dots A_{11}$ is $20$. If $A_1A_2+A_1A_{11}$ can be expressed as $\\frac{m\\sqrt{n}-p}{q}$ for positive integers $m,n,p,q$ with $n$ squarefree and $\\gcd(m,p,q)=1$, find $m+n+p+q$.", "answer": 19, "id": "26"}
{"problem": "Let the sequence of rationals $x_1,x_2,\\dots$ be defined such that $x_1=\\frac{25}{11}$ and\\[x_{k+1}=\\frac{1}{3}\\left(x_k+\\frac{1}{x_k}-1\\right).\\]$x_{2025}$ can be expressed as $\frac{m}{n}$ for relatively prime positive integers $m$ and $n$. Find the remainder when $m+n$ is divided by $1000$.", "answer": 248, "id": "27"}
{"problem": "Let ${\\triangle ABC}$ be a right triangle with $\\angle A = 90^\\circ$ and $BC = 38.$ There exist points $K$ and $L$ inside the triangle such\\[AK = AL = BK = CL = KL = 14.\\]The area of the quadrilateral $BKLC$ can be expressed as $n\\sqrt3$ for some positive integer $n.$ Find $n.$", "answer": 104, "id": "28"}
{"problem": "Let\\[f(x)=\\frac{(x-18)(x-72)(x-98)(x-k)}{x}.\\]There exist exactly three positive real values of $k$ such that $f$ has a minimum at exactly two real values of $x$. Find the sum of these three values of $k$.", "answer": 240, "id": "29"}

File diff suppressed because one or more lines are too long

View File

@ -14,7 +14,7 @@ from utils import load_jsonl
def parse_args():
parser = argparse.ArgumentParser()
parser.add_argument(
"--data_names", default="math_500,aime24,amc23", type=lambda x: x.split(",")
"--data_names", default="aime24,aime25", type=lambda x: x.split(",")
)
parser.add_argument(
"--model_path",
@ -25,11 +25,13 @@ def parse_args():
parser.add_argument("--num_sample_nodes", default=8, type=int)
parser.add_argument("--samples_per_node", default=4, type=int)
parser.add_argument("--n_sampling", default=32, type=int)
parser.add_argument("--prompt_type", default="deepscaler", type=str)
parser.add_argument("--prompt_type", default="r1-distilled-qwen", type=str)
parser.add_argument("--overwrite", action="store_true")
parser.add_argument("--evaluate_train", action="store_true")
parser.add_argument("--max_gen_tokens", default=32768, type=int)
parser.add_argument("--temperature", default=0.6, type=float)
parser.add_argument("--top_p", default=0.95, type=float)
parser.add_argument("--top_k", default=-1, type=int)
args = parser.parse_args()
if args.output_path is None:
args.output_path = args.model_path
@ -119,10 +121,8 @@ def get_metrics(fname_pattern, tokenizer, is_greedy):
def process_single_data_name(args, data_name, base_dir, tokenizer):
cur_dir = os.path.join(base_dir, data_name)
greedy_prefix = f"test_{args.prompt_type}_-1_seed0_t0.0_s0_e-1_n1"
sampling_prefix = (
f"test_{args.prompt_type}_-1_seed*_t0.6_s0_e-1_n{args.samples_per_node}"
)
greedy_prefix = f"test_{args.prompt_type}_-1_seed0_t0.0_topp1.00_topk-1_s0_e-1_n1"
sampling_prefix = f"test_{args.prompt_type}_-1_seed*_t{args.temperature:.1f}_topp{args.top_p:.2f}_topk{args.top_k}_s0_e-1_n{args.samples_per_node}"
greedy_length_metrics = get_metrics(
os.path.join(cur_dir, greedy_prefix + ".jsonl"), tokenizer, True
@ -148,25 +148,15 @@ if __name__ == "__main__":
print(f"Evaluation output to {args.output_path}")
assert args.num_sample_nodes * args.samples_per_node >= args.n_sampling
eval_dir = (
"math_eval"
if args.max_gen_tokens == 4096
else f"math_eval_{args.max_gen_tokens}"
)
eval_dir = f"math_eval_{args.max_gen_tokens}"
base_dir = os.path.join(args.output_path, eval_dir)
os.makedirs(base_dir, exist_ok=True)
tokenizer = AutoTokenizer.from_pretrained(args.model_path)
result_path = os.path.join(base_dir, f"aggregate_parallel_{args.prompt_type}.json")
if (
args.prompt_type == "qwen-boxed"
and os.path.exists(os.path.join(base_dir, f"aggregate_parallel.json"))
and not os.path.exists(result_path)
):
os.system(
f'cp {os.path.join(base_dir, f"aggregate_parallel.json")} {result_path}'
)
result_path = os.path.join(
base_dir,
f"aggregate_parallel_{args.prompt_type}_{args.temperature:.1f}_{args.top_p:.2f}_{args.top_k}.json",
)
if not os.path.exists(result_path) or args.overwrite or args.evaluate_train:
log_path = os.path.join(base_dir, "logs")
@ -186,6 +176,7 @@ if __name__ == "__main__":
stdout=f,
stderr=f,
)
print(f"Evaluation: greedy finished!")
for i in range(args.num_sample_nodes):
with open(
@ -205,11 +196,15 @@ if __name__ == "__main__":
",".join(args.data_names),
args.prompt_type,
args.output_path,
str(args.temperature),
str(args.top_p),
str(args.top_k),
],
text=True,
stdout=f,
stderr=f,
)
print(f"Evaluation: seed {i + 1} finished!")
all_results = dict()
for data_name in args.data_names:
@ -235,5 +230,6 @@ if __name__ == "__main__":
table.add_row([k, *[round(v[x], 1) for x in field_names[1:]]])
print(table)
except:
except ModuleNotFoundError as e:
print(json.dumps(all_results, indent=2))

View File

@ -36,6 +36,7 @@ def parse_args():
parser.add_argument("--temperature", default=0, type=float)
parser.add_argument("--n_sampling", default=1, type=int)
parser.add_argument("--top_p", default=1, type=float)
parser.add_argument("--top_k", default=-1, type=int)
parser.add_argument("--max_tokens_per_call", default=4096, type=int)
parser.add_argument("--shuffle", action="store_true")
parser.add_argument("--use_vllm", action="store_true")
@ -43,13 +44,12 @@ def parse_args():
parser.add_argument("--overwrite", action="store_true")
parser.add_argument("--use_safetensors", action="store_true")
parser.add_argument("--num_shots", type=int, default=0)
parser.add_argument("--data_parallel_size", type=int, default=1)
parser.add_argument(
"--apply_chat_template",
action="store_true",
help="Apply chat template to prompt.",
)
parser.add_argument("--pipeline_parallel_size", type=int, default=1)
parser.add_argument("--tensor_parallel_size", type=int, default=1)
parser.add_argument(
"--adapt_few_shot",
action="store_true",
@ -59,6 +59,10 @@ def parse_args():
args.top_p = (
1 if args.temperature == 0 else args.top_p
) # top_p must be 1 when using greedy sampling (vllm)
args.top_k = -1 if args.temperature == 0 else args.top_k
available_gpus = os.environ["CUDA_VISIBLE_DEVICES"].split(",")
args.data_parallel_size = len(available_gpus) // args.tensor_parallel_size
return args
@ -160,41 +164,6 @@ def generate_in_parallel(requests, model_args, sampling_params, data_parallel_si
return undistribute(results)
from multiprocessing import Pool
import numpy as np
def run_inference_one_model_v2(
# model_args, sampling_params, requests, cuda_visisble_devices,
args,
):
model_args, sampling_params, requests, cuda_visisble_devices = args
os.environ["CUDA_VISIBLE_DEVICES"] = ",".join(
[str(x) for x in cuda_visisble_devices]
)
# print("OS.ENVIRON", json.dumps({x: os.environ[x] for x in sorted(dict(os.environ))}))
llm = LLM(**model_args)
return llm.generate(requests, sampling_params=sampling_params)
def generate_in_parallel_v2(requests, model_args, sampling_params, data_parallel_size):
# print("OUT_OS_ENVIRON", json.dumps({x: os.environ[x] for x in sorted(dict(os.environ))}))
all_cuda_visisble_devices = os.environ["CUDA_VISIBLE_DEVICES"].split(",")
requests = [list(x) for x in distribute(data_parallel_size, requests)]
inputs = (
(model_args, sampling_params, req, cuda_visisble_devices)
for req, cuda_visisble_devices in zip(
requests, np.array_split(all_cuda_visisble_devices, data_parallel_size)
)
)
with Pool(processes=128) as pool:
results = pool.map(run_inference_one_model_v2, inputs)
return undistribute(results)
# from more_itertools import distribute
from itertools import islice, tee
@ -295,16 +264,12 @@ def prepare_data(data_name, args):
# get out_file name
dt_string = datetime.now().strftime("%m-%d_%H-%M")
model_name = "/".join(args.model_name_or_path.split("/")[-2:])
out_file_prefix = f"{args.split}_{args.prompt_type}_{args.num_test_sample}_seed{args.seed}_t{args.temperature}"
out_file_prefix = f"{args.split}_{args.prompt_type}_{args.num_test_sample}_seed{args.seed}_t{args.temperature:.1f}_topp{args.top_p:.2f}_topk{args.top_k}"
output_dir = args.output_dir
if not os.path.exists(output_dir):
output_dir = f"outputs/{output_dir}"
eval_dir = (
"math_eval"
if args.max_tokens_per_call == 4096
else f"math_eval_{args.max_tokens_per_call}"
)
eval_dir = f"math_eval_{args.max_tokens_per_call}"
out_file = f"{output_dir}/{eval_dir}/{data_name}/{out_file_prefix}_s{args.start}_e{args.end}_n{args.n_sampling}.jsonl"
os.makedirs(f"{output_dir}/{eval_dir}/{data_name}", exist_ok=True)
@ -339,31 +304,34 @@ def setup(args):
if args.data_parallel_size <= 1:
llm = LLM(
model=args.model_name_or_path,
tensor_parallel_size=len(available_gpus) // args.pipeline_parallel_size,
# distributed_executor_backend="ray",
pipeline_parallel_size=args.pipeline_parallel_size,
enforce_eager=True,
max_model_len=32768,
tensor_parallel_size=args.tensor_parallel_size,
distributed_executor_backend="ray",
trust_remote_code=True,
swap_space=16,
enforce_eager=True,
# dtype="float16",
disable_custom_all_reduce=True,
disable_sliding_window=True,
max_model_len=32768,
enable_chunked_prefill=False,
swap_space=32,
)
else:
print(
f"TP = {len(available_gpus) // (args.pipeline_parallel_size * args.data_parallel_size)}\n",
f"PP = {args.pipeline_parallel_size}\n",
f"TP = {args.tensor_parallel_size}\n",
f"DP = {args.data_parallel_size}",
)
llm = dict(
model=args.model_name_or_path,
tensor_parallel_size=len(available_gpus)
// (args.pipeline_parallel_size * args.data_parallel_size),
# distributed_executor_backend="ray",
pipeline_parallel_size=args.pipeline_parallel_size,
tensor_parallel_size=args.tensor_parallel_size,
distributed_executor_backend="ray",
trust_remote_code=True,
disable_custom_all_reduce=True,
enforce_eager=True,
# dtype="float16",
disable_custom_all_reduce=True,
disable_sliding_window=True,
max_model_len=32768,
swap_space=16,
enable_chunked_prefill=False,
swap_space=32,
)
tokenizer = None
if args.apply_chat_template:
@ -513,6 +481,7 @@ def main(llm, tokenizer, data_name, args):
temperature=args.temperature,
seed=args.seed,
top_p=args.top_p,
top_k=args.top_k,
max_tokens=args.max_tokens_per_call,
n=args.n_sampling,
stop=stop_words,
@ -653,7 +622,8 @@ def main(llm, tokenizer, data_name, args):
result_json[f"pass@16"] = pass_at_k_v2(all_samples, k=16)
if args.n_sampling > 8:
result_json[f"pass@8"] = pass_at_k_v2(all_samples, k=8)
result_json[f"pass@1"] = pass_at_k_v2(all_samples, k=1)
result_json["pass@1"] = pass_at_k_v2(all_samples, k=1)
result_json["acc"] = result_json["pass@1"]
# save outputs
if len(processed_samples) < len(all_samples) and args.save_outputs:

View File

@ -632,6 +632,8 @@ def parse_ground_truth(example: Dict[str, Any], data_name):
gt_cot, gt_ans = None, example["final_answer"][0].strip("$")
elif data_name in [
"aime24",
"aime25",
"gpqa_diamond",
"amc23",
"cmath",
"gaokao2024_I",

View File

@ -7,7 +7,7 @@ export VLLM_LOGGING_LEVEL=DEBUG
MODEL_NAME_OR_PATH=$1
# OUTPUT_DIR=$1
MAX_GEN_TOKENS=${2:-4096}
DATA_NAME=${3:-"math_500,math,gsm8k,train_amc_aime,aime24,amc23"}
DATA_NAME=${3:-"aime24,aime25"}
PROMPT_TYPE=${4:-"qwen-boxed"}
SPLIT="test"
@ -33,11 +33,7 @@ python3 -u math_eval.py \
--end -1 \
--use_vllm \
--max_tokens_per_call=$MAX_GEN_TOKENS \
--data_parallel_size 8 \
--tensor_parallel_size 2 \
--save_outputs \
# --overwrite \
chown -R admin ${OUTPUT_DIR}/math_eval
ray stop
exit

View File

@ -17,6 +17,10 @@ MAX_GEN_TOKENS=${4:-4096}
DATA_NAME=${5:-"math_500,math,gsm8k,train_amc_aime,aime24,amc23"}
PROMPT_TYPE=${6:-"qwen-boxed"}
OUTPUT_DIR=${7:-$MODEL_NAME_OR_PATH}
temperature=${8:-"1.0"}
top_p=${9:-"1.0"}
top_k=${10:-"-1"}
# English open datasets
# DATA_NAME="math_500,math,gsm8k,train_amc_aime,aime24,amc23"
@ -30,14 +34,15 @@ python3 -u math_eval.py \
--prompt_type ${PROMPT_TYPE} \
--num_test_sample ${NUM_TEST_SAMPLE} \
--seed ${SEED} \
--temperature 0.6 \
--temperature $temperature \
--n_sampling $n_sampling \
--top_p 0.95 \
--top_p $top_p \
--top_k $top_k \
--start 0 \
--end -1 \
--use_vllm \
--max_tokens_per_call=$MAX_GEN_TOKENS \
--data_parallel_size 2 \
--tensor_parallel_size 4 \
--save_outputs \
# --overwrite \

View File

@ -143,11 +143,26 @@ PROMPT_TEMPLATES = {
"{output}",
"\n\n",
),
"deepscaler": (
"AReaL-boba-SFT": (
"<begin▁of▁sentence><User>{input}\nPlease reason step by step, and put your final answer within \\boxed{{}}.<Assistant><think>\n",
"{output}",
"\n\n",
),
"AReaL-boba": (
"<User>{input}\nPlease reason step by step, and put your final answer within \\boxed{{}}.<Assistant><think>\n",
"{output}",
"\n\n",
),
"r1-distilled-qwen": (
"<begin▁of▁sentence><User>{input}\nPlease reason step by step, and put your final answer within \\boxed{{}}.<Assistant><think>\n",
"{output}",
"\n\n",
),
"r1-distilled-qwen-gpqa": (
"<User>{input}\nPlease reason step-by-step and put your choice letter without any other text with \\boxed{{}} in the end.<Assistant><think>\n",
"{output}",
"\n\n",
),
"r1-zero": (
"A conversation between User and Assistant. The user asks a question, and the Assistant solves it."
"The assistant first thinks about the reasoning process in the mind and then provides the user with the answer."

View File

@ -34,7 +34,7 @@ This tutorial provides a Docker image. Below are the tested software versions:
| Git LFS | Refer to: https://docs.github.com/en/repositories/working-with-files/managing-large-files/installing-git-large-file-storage. Mainly used for downloading models, datasets, and AReaL project code. |
| Docker | 27.5.1 |
|NVIDIA Container Toolkit|[Installing the NVIDIA Container Toolkit](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/install-guide.html)|
| AReaL Image | `ghcr.io/inclusionai/areal-runtime:v0.1.0`. This image includes AReaL's runtime dependencies and Ray components. |
| AReaL Image | `ghcr.io/inclusionai/areal-runtime:v0.2.0`. This image includes AReaL's runtime dependencies and Ray components. |
Since the installation of NVIDIA Drivers and CUDA, as well as the mounting of shared storage, depends on node configurations and system versions, please complete these installations independently. This tutorial does not cover their setup.
@ -76,7 +76,7 @@ If the script in this section fails to execute or encounters errors due to envir
Since shared storage is used, downloading only needs to be done on one node.
## Code and Cluster Configuration
## Code
Clone the AReaL project code to `/storage/codes`:
@ -86,27 +86,6 @@ cd /storage/codes/
git clone https://github.com/inclusionAI/AReaL
```
Create the cluster configuration file `/storage/ray/cluster_config_on_ray.json`:
```bash
mkdir -p /storage/ray/
cd /storage/ray/
```
Write the following configuration to `/storage/ray/cluster_config_on_ray.json`:
```
{
"cluster_type": "ray",
"cluster_name": "ray_cluster",
"fileroot": "/storage/ray/experiments",
"default_mount": "/storage:/storage",
"n_gpus_per_node": 8
}
```
This configuration file describes the cluster where the AReaL training job runs. In particular, the `fileroot` path is where logs and checkpoints are stored during training.
## Dataset
We provide a dataset for training. Download the dataset and place it in `/storage/datasets/`:
@ -114,8 +93,8 @@ We provide a dataset for training. Download the dataset and place it in `/storag
```bash
mkdir -p /storage/datasets/
cd /storage/datasets/
wget https://huggingface.co/datasets/inclusionAI/AReaL-RL-Data/resolve/main/data/full_prompts_for_r1_distilled.jsonl?download=true
wget https://huggingface.co/datasets/inclusionAI/AReaL-RL-Data/resolve/main/data/full_orz_zero.jsonl?download=true
wget https://huggingface.co/datasets/inclusionAI/AReaL-RL-Data/resolve/main/data/boba_106k_0319.jsonl?download=true
wget https://huggingface.co/datasets/inclusionAI/AReaL-RL-Data/resolve/main/data/orz-zero_56k_0319.jsonl?download=true
```
## Model
@ -127,6 +106,7 @@ mkdir -p /storage/models
cd /storage/models
GIT_LFS_SKIP_SMUDGE=1 git clone https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B
GIT_LFS_SKIP_SMUDGE=1 git clone https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Qwen-7B
GIT_LFS_SKIP_SMUDGE=1 git clone https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Qwen-32B
```
You can also use the HuggingFace CLI to download the models after installing huggingface_hub from PyPI. Refer to the [official documentation](https://huggingface.co/docs/huggingface_hub/guides/cli) for details.
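For example, a minimal sketch using a recent `huggingface_hub` release (the target directory is illustrative):
```bash
pip install -U huggingface_hub
huggingface-cli download deepseek-ai/DeepSeek-R1-Distill-Qwen-7B \
    --local-dir /storage/models/DeepSeek-R1-Distill-Qwen-7B
```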
@ -138,7 +118,7 @@ Before proceeding, pull the AReaL environment image, which already includes Ray
On the first node, start the Ray Head with the following command:
```bash
docker run -d --name r1-ray-head --privileged --gpus all --network host --shm-size 700g -v /storage:/storage ghcr.io/inclusionai/areal-runtime:v0.1.0 /bin/bash -c "ray start --head --port=6379 && tail -f /dev/null"
docker run -d --name r1-ray-head --privileged --gpus all --network host --shm-size 700g -v /storage:/storage ghcr.io/inclusionai/areal-runtime:v0.2.0 /bin/bash -c "ray start --head --port=6379 && tail -f /dev/null"
```
On all other nodes, start the Ray Worker with the following command (skip this step if you only have one node):
@ -146,7 +126,7 @@ On all other nodes, start the Ray Worker with the following command (skip this s
```bash
# RAY_HEAD_IP is the IP of the first node
RAY_HEAD_IP=xxx.xxx.xxx.xxx
docker run -d --name r1-ray-worker --privileged --gpus all --network host --shm-size 700g -v /storage:/storage ghcr.io/inclusionai/areal-runtime:v0.1.0 /bin/bash -c "ray start --address=$RAY_HEAD_IP:6379 && tail -f /dev/null"
docker run -d --name r1-ray-worker --privileged --gpus all --network host --shm-size 700g -v /storage:/storage ghcr.io/inclusionai/areal-runtime:v0.2.0 /bin/bash -c "ray start --address=$RAY_HEAD_IP:6379 && tail -f /dev/null"
```
Once all nodes are up, check the Ray cluster status by entering the container on the first node:
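For example, a minimal sketch (`ray status` summarizes the nodes and resources registered in the cluster):
```bash
docker exec -it r1-ray-head bash
ray status
```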
@ -198,87 +178,45 @@ Demands:
# RL Training
## Single-Node Training
For a single node, execute the following command to start training:
```bash
docker exec -it r1-ray-head bash
cd /storage/codes/AReaL
mkdir /storage/ray/train_batch_logs/
nohup bash ./examples/train_batch_1.5B_n1.sh &> /storage/ray/train_batch_logs/n1.log &
```
After starting, check the training launch information in the log file `/storage/ray/train_batch_logs/n1.log`:
```
Log Dir: /storage/ray/train_batch_logs/ppo-zero-distill-1.5B-n1/20250222-104411
Task Count: 1
2025-02-22 10:44.11 Task 0 started: ppo-zero-distill-1.5B-n1 deepseek-ai__DeepSeek-R1-Distill-Qwen-1.5B prompts.jsonl 1024 8 1 actor_gen:d4p1m2,*:d4p2m1 16384 128 1 0.001
```
Based on the Log Dir, you can check the specific logs for the currently running training task. The log path is `{Log Dir}/{task_id}.log`. For example, `/storage/ray/train_batch_logs/ppo-zero-distill-1.5B-n1/20250222-104411/0.log`:
```
20250222-10:44:15.581 quickstart INFO: Running ppo-math experiment.
20250222-10:44:15.581 quickstart INFO: Logs will be dumped to /storage/ray/experiments/logs/root/ppo-zero-distill-1.5B-n1/1024x8-n1
20250222-10:44:15.581 quickstart INFO: Model checkpoints will be saved to /storage/ray/experiments/checkpoints/root/ppo-zero-distill-1.5B-n1/1024x8-n1
20250222-10:44:17.100 quickstart INFO: Launching experiments with RAY...
```
If errors occur during execution (e.g., keywords like "Error" appear), refer to the troubleshooting section.
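For example, a quick way to scan the task log from this section for errors (the path is the example log shown above):
```bash
grep -n "Error" /storage/ray/train_batch_logs/ppo-zero-distill-1.5B-n1/20250222-104411/0.log
```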
## Distributed Training
Before starting distributed training, ensure the Ray cluster is up and running properly.
Then, on the first node (where the Ray Head is located), enter the container:
```
docker exec -it r1-ray-head bash
cd /storage/codes/AReaL
mkdir /storage/ray/train_batch_logs/
```
Choose a task that matches your hardware environment and run it:
Choose a config file that matches your hardware environment and run it:
```bash
# For 1.5B model on 4 nodes, log file is n4.log
nohup bash ./examples/train_batch_1.5B_n4.sh &> /storage/ray/train_batch_logs/n4.log &
# For 1.5B model on 16 nodes, log file is n16.log
nohup bash ./examples/train_batch_1.5B_n16.sh &> /storage/ray/train_batch_logs/n16.log &
# For 7B model on 4 nodes, log file is 7n4.log
nohup bash ./examples/train_batch_7B_n4.sh &> /storage/ray/train_batch_logs/7n4.log &
# For 7B model on 16 nodes, log file is 7n16.log
nohup bash ./examples/train_batch_7B_n16.sh &> /storage/ray/train_batch_logs/7n16.log &
python3 -m realhf.apps.quickstart ppo-math --config ./examples/configs/7B-distill/ppo-7B-distill-gpus-128.yaml
```
After starting, check the training launch information in the log file `/storage/ray/train_batch_logs/{corresponding log file name}.log` (e.g., `7n16.log`):
After starting, check the training launch information:
```
Log Dir: /storage/ray/train_batch_logs/ppo-zero-distill-7B-n16/20250222-102631
Task Count: 1
2025-02-22 10:26.31 Task 0 started: ppo-zero-distill-7B-n16 deepseek-ai__DeepSeek-R1-Distill-Qwen-7B prompts_7b_progress_20k.jsonl 1024 16 16 vllm.d16p1m4+d32p2m1 16384 128 4 0.01
```
╭─────────────────────────────────────────────────╮
│ Setting PPOMATHConfig with the Following Values │
╰─────────────────────────────────────────────────╯
Based on the Log Dir, you can check the specific logs for the currently running training task. The log path is `{Log Dir}/{task_id}.log`. For example, `/storage/ray/train_batch_logs/ppo-zero-distill-7B-n16/20250222-102631/0.log`:
```
───────────────────────── Current Configuration Begin ──────────────────────────
actor (ModelTrainEvalConfig)
actor.type (ModelFamily)
actor.type._class (str) - qwen2
actor.type.size (int) - 7
actor.type.is_critic (bool) - False
...
────────────────────────── Current Configuration End ───────────────────────────
20250222-10:26:34.877 quickstart INFO: Running ppo-math experiment.
20250222-10:26:34.877 quickstart INFO: Logs will be dumped to /storage/ray/experiments/logs/root/ppo-zero-distill-7B-n16/1024x16-n16
20250222-10:26:34.877 quickstart INFO: Model checkpoints will be saved to /storage/ray/experiments/checkpoints/root/ppo-zero-distill-7B-n16/1024x16-n16
20250222-10:44:15.581 quickstart INFO: Logs will be dumped to /storage/ray/experiments/logs/root/ppo-7B-distill-gpus-128/512x16
20250222-10:44:15.581 quickstart INFO: Model checkpoints will be saved to /storage/ray/experiments/checkpoints/root/ppo-7B-distill-gpus-128/512x16
20250222-10:26:36.408 quickstart INFO: Launching experiments with RAY...
```
If errors occur during execution (e.g., keywords like "Error" appear), refer to the troubleshooting section.
## Commandline Options
The `./examples/train_batch_{1.5/7}B_n{1/4/16}.sh` scripts contain pre-configured training parameters, and all of these scripts ultimately launch the training using the following command:
```bash
python3 -m realhf.apps.quickstart ppo-math option1=arg1 option2=arg2 ...
```
The command-line arguments like `option1=arg1` are parsed by [hydra](https://hydra.cc/), and each configuration item is a `dataclasses.dataclass` in the Python code. You can use the following command to view all the command-line arguments that can be passed in the experiment:
```bash
python3 -m realhf.apps.quickstart ppo-math --help
@ -287,17 +225,15 @@ python3 -m realhf.apps.quickstart ppo-math --help
The descriptions of the important parameters are as follows (an example launch command is sketched after this list):
+ `MODE`: It is always `ray`, and do not change it to other values when referring to this tutorial for training.
+ `BASE_MODEL_PATH`: The path of the model.
+ `DATA_PATH`: The path of the dataset jsonl file
+ `CLUSTER_SPEC_PATH`: Set it to the path of cluster_config.json
+ `mode`: It is always `ray`; do not change it to other values when following this tutorial.
+ `{actor|critic|ref}.path`: The path of the model.
+ `dataset.path`: The path of the dataset jsonl file.
+ `external_configs.cluster_config`: Cluster-related configuration; for example, `fileroot` is the root path under which training outputs are saved.
+ `n_nodes`: The number of nodes
+ `n_gpus_per_node`: The number of GPUs per node
+ `allocation_mode`: The GPU allocation and 3D parallel strategy of the model in the experiment, mainly in the following two forms:
+ `d${DP}m${TP}p${PP}`: The three integers DP, TP, and PP respectively denote the degrees of data, tensor, and pipeline parallelism, and their product must equal the total number of GPUs (i.e. DPxTPxPP=#GPUs). In this configuration, generation and training use the entire GPU cluster with the same parallel strategy. If you want to use vLLM for generation, you also need to set `actor.vllm.hybrid_train=True` and `actor.vllm.enforce_eager=True`. Note that PP must be 1 (vLLM does not support PP for now).
+ `actor_gen:d${DP1}m${TP1}p${PP1},*:d${DP2}m${TP2}p${PP2}`: Sets the parallel strategies for generation and training separately. Generation and training use the entire GPU cluster but may use different parallel strategies. The configuration must satisfy DP1xTP1xPP1=DP2xTP2xPP2=#GPUs. If you want to use vLLM for generation, you also need to set `actor.vllm.hybrid_train=True` and `actor.vllm.enforce_eager=True`. Note that PP1 must be 1 (vLLM does not support PP for now).
+ `vllm.d${DP1}m${TP1}p${PP1}+d${DP2}m${TP2}p${PP2}`: Configures the parallel strategies for vLLM generation and training separately. Generation and training use disjoint sets of GPUs, and the numbers of GPUs used by the two must sum to the total number of GPUs, i.e. DP1xTP1xPP1+DP2xTP2xPP2=#GPUs. To use vLLM for generation here, you have to set `actor.vllm.hybrid_train=False` and PP1=1. It is recommended to set `actor.vllm.enforce_eager=False` to accelerate vLLM generation.
+ `allocation_mode`: The GPU allocation and 3D parallel strategy of the model in the experiment, mainly in the following form:
+ `sglang.d${DP1}m${TP1}p${PP1}+d${DP2}m${TP2}p${PP2}`: Configures the parallel strategies for SGLang generation and training separately. Generation and training use disjoint sets of GPUs, and the numbers of GPUs used by the two must sum to the total number of GPUs, i.e. DP1xTP1xPP1+DP2xTP2xPP2=#GPUs.
+ `exp_ctrl.total_train_epochs`: The number of training epochs (i.e., the number of times to iterate over the entire dataset)
+ `exp_ctrl.save_freq_{epochs|steps|secs}`: The frequency of saving the model parameters in persistent storage. If it is set to null, the model will not be saved.
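As a sketch, these options can be combined into a single override-style launch; the paths and parallelism values below are illustrative only and must match your own cluster (here 16 nodes x 8 GPUs = 128 GPUs, split into 64 GPUs for SGLang generation and 64 for training):
```bash
python3 -m realhf.apps.quickstart ppo-math \
    actor.path=/storage/models/DeepSeek-R1-Distill-Qwen-7B \
    dataset.path=/storage/datasets/boba_106k_0319.jsonl \
    n_nodes=16 \
    n_gpus_per_node=8 \
    allocation_mode=sglang.d64m1p1+d32m2p1 \
    exp_ctrl.total_train_epochs=1 \
    exp_ctrl.save_freq_steps=5
```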
@ -318,7 +254,6 @@ Here, we use the logs from a 16-node run (the same applies to 1-node and 4-node
Search for the keyword `Epoch` in the logs to see the total number of Epochs and Steps:
```bash
# grep "Epoch" /storage/ray/train_batch_logs/ppo-zero-distill-7B-n16/20250222-102631/0.log
(master_worker/0 pid=96390, ip=xxx.xxx.xxx.xxx) 20250222-11:11:56.997 master worker INFO: Epoch 1/1 step 1/19 (global step 1) finishes. Average #tokens per batch is 111847. #End to end# execution time: *2124.429*s. Total time consumption: 2283.862s.
(master_worker/0 pid=96390, ip=xxx.xxx.xxx.xxx) 20250222-11:52:02.719 master worker INFO: Epoch 1/1 step 2/19 (global step 2) finishes. Average #tokens per batch is 111847. #End to end# execution time: *2405.716*s. Total time consumption: 4689.584s.
(master_worker/0 pid=96390, ip=xxx.xxx.xxx.xxx) 20250222-12:27:25.084 master worker INFO: Epoch 1/1 step 3/19 (global step 3) finishes. Average #tokens per batch is 111847. #End to end# execution time: *2122.318*s. Total time consumption: 6811.949s. Estimated remaining time: 33957.093s.
@ -342,7 +277,6 @@ Search for the keyword `task_reward` in the logs.
```bash
# grep "task_reward" /storage/ray/train_batch_logs/ppo-zero-distill-7B-n16/20250222-102631/0.log
(master_worker/0 pid=96390, ip=xxx.xxx.xxx.xxx) 20250222-11:11:56.991 master worker INFO: RPC name actor_train returns {'ppo_approx_kl': -2.2640759198111482e-05, 'actor_loss': 1.1128166761409375e-06, 'actor_clip_ratio': 2.1122002635820536e-07, 'importance_weight': 1.0000014305114746, 'task_reward': -0.2996826171875, 'kl_reward': -2.27004832709099e-07, 'final_reward': -0.30145370960235596, 'advantage': 0.003593671601265669, 'avg_seq_len': 7907.8955078125, 'avg_prompt_len': 105.845703125, 'n_tokens': 127828786.0, 'n_valid_tokens': 127828786.0, 'n_seqs': 16384.0, 'no_eos_ratio': 0.122802734375, 'disable_value': 1.0, 'mask_no_eos_with_zero': 0.0}
(master_worker/0 pid=96390, ip=xxx.xxx.xxx.xxx) 20250222-11:52:02.712 master worker INFO: RPC name actor_train returns {'ppo_approx_kl': -2.493159263394773e-05, 'actor_loss': -3.846728588996484e-07, 'actor_clip_ratio': 3.16789424914532e-07, 'importance_weight': 0.9999996423721313, 'task_reward': -0.6793212890625, 'kl_reward': -2.536311853873485e-07, 'final_reward': -0.6813737154006958, 'advantage': 0.004844569601118565, 'avg_seq_len': 8203.9453125, 'avg_prompt_len': 111.892578125, 'n_tokens': 132580185.0, 'n_valid_tokens': 132580185.0, 'n_seqs': 16384.0, 'no_eos_ratio': 0.13812255859375, 'disable_value': 1.0, 'mask_no_eos_with_zero': 0.0}
(master_worker/0 pid=96390, ip=xxx.xxx.xxx.xxx) 20250222-12:27:25.077 master worker INFO: RPC name actor_train returns {'ppo_approx_kl': -2.572356243035756e-05, 'actor_loss': -5.036404786551429e-07, 'actor_clip_ratio': 1.8960582792715286e-07, 'importance_weight': 0.9999992251396179, 'task_reward': -0.6280517578125, 'kl_reward': -2.988609537624143e-07, 'final_reward': -0.6303607225418091, 'advantage': 0.004505862481892109, 'avg_seq_len': 7834.6328125, 'avg_prompt_len': 108.900390625, 'n_tokens': 126578395.0, 'n_valid_tokens': 126578395.0, 'n_seqs': 16384.0, 'no_eos_ratio': 0.11761474609375, 'disable_value': 1.0, 'mask_no_eos_with_zero': 0.0}
@ -367,7 +301,7 @@ The evaluation code is located in the `evaluation` folder of the repository. As
Start a new container to execute the evaluation script (note: evaluation requires updates to certain Python libraries; avoid using the training container for this task):
```
docker run -d --name r1-eval --privileged --gpus all --network host --shm-size 700g -v /storage:/storage ghcr.io/inclusionai/areal-runtime:v0.1.0 /bin/bash -c "tail -f /dev/null"
docker run -d --name r1-eval --privileged --gpus all --network host --shm-size 700g -v /storage:/storage ghcr.io/inclusionai/areal-runtime:v0.2.0 /bin/bash -c "tail -f /dev/null"
docker exec -it r1-eval bash
```
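Inside the `r1-eval` container, the evaluation can then be run from the repository's `evaluation` folder, for example (a sketch; `${MODEL_PATH}` and `${OUTPUT_PATH}` are placeholders as in the evaluation README):
```bash
cd /storage/codes/AReaL/evaluation
pip install prettytable timeout_decorator
python eval_and_aggregate.py \
    --model_path ${MODEL_PATH} \
    --output_path ${OUTPUT_PATH} \
    --data_names aime24,aime25 \
    --max_gen_tokens 32768
```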
@ -431,48 +365,26 @@ The runtime of the evaluation depends on factors such as the maximum generation
If the following content does not address your issue, feel free to raise a GitHub Issue.
## Automatic Recovery
### How it works
When `recover_mode=auto` is set and the experiment config remains the same, AReaL will try to discover previous recover checkpoints and resume the experiment from them.
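For example, a hypothetical relaunch with recovery enabled, assuming `recover_mode` can be passed as a hydra override on top of the same config (as in the quickstart commands above):
```bash
python3 -m realhf.apps.quickstart ppo-math \
    --config ./examples/configs/7B-distill/ppo-7B-distill-gpus-128.yaml \
    recover_mode=auto
```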
Training is initiated exclusively through the `./examples/train_batch_{1.5/7}B_n{1/4/16}.sh` scripts. These scripts contain parameter entries in the format below and exit automatically once all parameter sets have finished running:
If automatic recovery fails, please check the following possibilities:
```bash
ALL_PARAMS=(
"${EXP_NAME} ${MODEL_NAME} ${DATASET_NAME} 1024 16 ${NODES} ${ALLOCATION_MODE} 16384 128 4 0.01"
)
```
* The `experiment_name` and `trial_name` in the training script differ from the previous run.
Training may abort due to OOM errors or hardware failures. In such scenarios, manually rerunning the train_batch script will automatically resume from the latest recoverable checkpoint.
For persistent failures requiring frequent manual intervention, modify the train_batch script by duplicating identical parameter sets to enable automated retries. For instance, to implement 3 retry attempts, configure three identical parameter groups as demonstrated:
```bash
ALL_PARAMS=(
"${EXP_NAME} ${MODEL_NAME} ${DATASET_NAME} 1024 16 ${NODES} ${ALLOCATION_MODE} 16384 128 4 0.01"
"${EXP_NAME} ${MODEL_NAME} ${DATASET_NAME} 1024 16 ${NODES} ${ALLOCATION_MODE} 16384 128 4 0.01"
"${EXP_NAME} ${MODEL_NAME} ${DATASET_NAME} 1024 16 ${NODES} ${ALLOCATION_MODE} 16384 128 4 0.01"
)
```
### Why does the training task restart from the beginning instead of resuming from the last Step?
Check the following possibilities:
* The EXP_NAME and TRIAL_NAME in the training script differ from the previous run.
* Changes in Batch Size (1024 in the parameters), Group Size (16 in the parameters), or the number of nodes (${NODES} in the parameters).
* Changes in Batch Size (`dataset.train_bs_n_seqs` in the parameters), Group Size (`group_size` in the parameters), or the number of nodes (`n_nodes` in the parameters).
* No recover checkpoint was created in the previous run. By default, recover checkpoints are generated under two conditions:
* After the completion of the second Step.
* When a Step completes and more than 600 seconds have passed since the last recover checkpoint. This parameter is in the `examples/train_{tiny|small|large}_on_ray.sh` script, named `exp_ctrl.ckpt_freq_secs=600`.
* When a Step completes and more than 600 seconds have passed since the last recover checkpoint. This parameter is set in `./examples/configs/*/*.yaml` as `exp_ctrl.ckpt_freq_secs=600`.
You can confirm if a recover checkpoint was generated by searching in the log:
```bash
# grep "Dumped recover" /storage/ray/train_batch_logs/ppo-zero-distill-7B-n16/20250222-102631/0.log
(master_worker/0 pid=96390, ip=xxx.xxx.xxx.xxx) 20250222-11:52:02.760 master worker INFO: Dumped recover info to file.
(master_worker/0 pid=96390, ip=xxx.xxx.xxx.xxx) 20250222-12:27:25.105 master worker INFO: Dumped recover info to file.
(master_worker/0 pid=96390, ip=xxx.xxx.xxx.xxx) 20250222-13:05:58.264 master worker INFO: Dumped recover info to file.

View File

@ -42,7 +42,7 @@
|Git LFS|Refer to the [Git LFS installation guide](https://docs.github.com/en/repositories/working-with-files/managing-large-files/installing-git-large-file-storage). Mainly used for downloading models, datasets, and the AReaL project code.|
|Docker|Version 27.5.1|
|NVIDIA Container Toolkit|[NVIDIA Container Toolkit installation guide](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/install-guide.html)|
|Image|ghcr.io/inclusionai/areal-runtime:v0.1.0. This image includes the runtime dependencies and Ray components.|
|Image|ghcr.io/inclusionai/areal-runtime:v0.2.0. This image includes the runtime dependencies and Ray components.|
Since the installation of NVIDIA Drivers and CUDA, as well as the mounting of shared storage, depends on node configurations and system versions, please complete these installations on your own; this tutorial does not cover them.
@ -84,7 +84,7 @@ python ./examples/env/setup_env_and_start_train.py setup --private_key_file /pat
Since shared storage is used, the download only needs to be done on one node.
## Code and Cluster Configuration
## Code
Clone the AReaL project code into `/storage/codes`:
@ -94,34 +94,14 @@ cd /storage/codes/
git clone https://github.com/inclusionAI/AReaL.git
```
Create the cluster configuration file `/storage/ray/cluster_config_on_ray.json`:
```bash
mkdir -p /storage/ray/
cd /storage/ray/
```
Write the following configuration to `/storage/ray/cluster_config_on_ray.json`:
```
{
"cluster_type": "ray",
"cluster_name": "ray_cluster",
"fileroot": "/storage/ray/experiments",
"default_mount": "/storage:/storage",
"n_gpus_per_node": 8
}
```
The cluster configuration file describes the cluster where the AReaL training job runs. The path pointed to by `fileroot` is where logs and checkpoints are stored during training.
## Dataset
We provide a dataset for training. Download it and place it in `/storage/datasets/`:
```bash
mkdir -p /storage/datasets/
cd /storage/datasets/
wget https://huggingface.co/datasets/inclusionAI/AReaL-RL-Data/resolve/main/data/full_prompts_for_r1_distilled.jsonl?download=true
wget https://huggingface.co/datasets/inclusionAI/AReaL-RL-Data/resolve/main/data/full_orz_zero.jsonl?download=true
wget https://huggingface.co/datasets/inclusionAI/AReaL-RL-Data/resolve/main/data/boba_106k_0319.jsonl?download=true
wget https://huggingface.co/datasets/inclusionAI/AReaL-RL-Data/resolve/main/data/orz-zero_56k_0319.jsonl?download=true
```
## Model
@ -132,6 +112,7 @@ mkdir -p /storage/models
cd /storage/models
GIT_LFS_SKIP_SMUDGE=1 git clone https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B
GIT_LFS_SKIP_SMUDGE=1 git clone https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Qwen-7B
GIT_LFS_SKIP_SMUDGE=1 git clone https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Qwen-32B
```
You can also use the HuggingFace CLI to download the models after installing huggingface_hub from PyPI; refer to the [official documentation](https://huggingface.co/docs/huggingface_hub/guides/cli) for details.
@ -144,7 +125,7 @@ GIT_LFS_SKIP_SMUDGE=1 git clone https://huggingface.co/deepseek-ai/DeepSeek-R1-D
On the first node, run the following command to start the Ray Head:
```bash
docker run -d --name r1-ray-head --privileged --gpus all --network host --shm-size 700g -v /storage:/storage ghcr.io/inclusionai/areal-runtime:v0.1.0 /bin/bash -c "ray start --head --port=6379 && tail -f /dev/null"
docker run -d --name r1-ray-head --privileged --gpus all --network host --shm-size 700g -v /storage:/storage ghcr.io/inclusionai/areal-runtime:v0.2.0 /bin/bash -c "ray start --head --port=6379 && tail -f /dev/null"
```
On every node other than the first, run the following command to start a Ray Worker (skip this step if you only have one node):
@ -152,7 +133,7 @@ docker run -d --name r1-ray-head --privileged --gpus all --network host --shm-si
```bash
# RAY_HEAD_IP is the IP of the first node
RAY_HEAD_IP=xxx.xxx.xxx.xxx
docker run -d --name r1-ray-worker --privileged --gpus all --network host --shm-size 700g -v /storage:/storage ghcr.io/inclusionai/areal-runtime:v0.1.0 /bin/bash -c "ray start --address=$RAY_HEAD_IP:6379 && tail -f /dev/null"
docker run -d --name r1-ray-worker --privileged --gpus all --network host --shm-size 700g -v /storage:/storage ghcr.io/inclusionai/areal-runtime:v0.2.0 /bin/bash -c "ray start --address=$RAY_HEAD_IP:6379 && tail -f /dev/null"
```
Once everything is up, enter the container on the first node via `docker exec` and check the status of the Ray cluster:
@ -204,88 +185,44 @@ Demands:
# RL Training
## Single-Node Training
With only one node, execute the following command to start training:
```bash
docker exec -it r1-ray-head bash
cd /storage/codes/AReaL
mkdir /storage/ray/train_batch_logs/
nohup bash ./examples/train_batch_1.5B_n1.sh &> /storage/ray/train_batch_logs/n1.log &
```
After starting, check the training launch information in the log file `/storage/ray/train_batch_logs/n1.log`:
```
Log Dir: /storage/ray/train_batch_logs/ppo-zero-distill-1.5B-n1/20250222-104411
Task Count: 1
2025-02-22 10:44.11 Task 0 started: ppo-zero-distill-1.5B-n1 deepseek-ai__DeepSeek-R1-Distill-Qwen-1.5B prompts.jsonl 1024 8 1 actor_gen:d4p1m2,*:d4p2m1 16384 128 1 0.001
```
Based on the Log Dir, you can check the specific logs of the currently running training task. The log path is `{Log Dir}/{task_id}.log`, for example `/storage/ray/train_batch_logs/ppo-zero-distill-1.5B-n1/20250222-104411/0.log`:
```
20250222-10:44:15.581 quickstart INFO: Running ppo-math experiment.
20250222-10:44:15.581 quickstart INFO: Logs will be dumped to /storage/ray/experiments/logs/root/ppo-zero-distill-1.5B-n1/1024x8-n1
20250222-10:44:15.581 quickstart INFO: Model checkpoints will be saved to /storage/ray/experiments/checkpoints/root/ppo-zero-distill-1.5B-n1/1024x8-n1
20250222-10:44:17.100 quickstart INFO: Launching experiments with RAY...
```
If errors occur during execution (e.g., the keyword "Error" appears), refer to the Troubleshooting section.
## Distributed Training
Before starting distributed training, make sure the Ray cluster has been started and is in a healthy state.
Then, on the first node (where the Ray Head is located), enter the container:
```
docker exec -it r1-ray-head bash
cd /storage/codes/AReaL
mkdir /storage/ray/train_batch_logs/
```
Choose a task that matches your hardware environment and run it:
Choose a config that matches your hardware environment and run it:
```bash
# For the 1.5B model on 4 nodes; the log file is n4.log
nohup bash ./examples/train_batch_1.5B_n4.sh &> /storage/ray/train_batch_logs/n4.log &
# For the 1.5B model on 16 nodes; the log file is n16.log
nohup bash ./examples/train_batch_1.5B_n16.sh &> /storage/ray/train_batch_logs/n16.log &
# For the 7B model on 4 nodes; the log file is 7n4.log
nohup bash ./examples/train_batch_7B_n4.sh &> /storage/ray/train_batch_logs/7n4.log &
# For the 7B model on 16 nodes; the log file is 7n16.log
nohup bash ./examples/train_batch_7B_n16.sh &> /storage/ray/train_batch_logs/7n16.log &
python3 -m realhf.apps.quickstart ppo-math --config ./examples/configs/7B-distill/ppo-7B-distill-gpus-128.yaml
```
After starting, check the training launch information in the log file `/storage/ray/train_batch_logs/{corresponding log file name}.log` (using `7n16.log` as an example):
```
Log Dir: /storage/ray/train_batch_logs/ppo-zero-distill-7B-n16/20250222-102631
Task Count: 1
2025-02-22 10:26.31 Task 0 started: ppo-zero-distill-7B-n16 deepseek-ai__DeepSeek-R1-Distill-Qwen-7B prompts_7b_progress_20k.jsonl 1024 16 16 vllm.d16p1m4+d32p2m1 16384 128 4 0.01
After starting, the launch log can be seen in the terminal:
```
╭─────────────────────────────────────────────────╮
│ Setting PPOMATHConfig with the Following Values │
╰─────────────────────────────────────────────────╯
Based on the Log Dir, you can check the specific logs of the currently running training task. The log path is `{Log Dir}/{task_id}.log`, for example `/storage/ray/train_batch_logs/ppo-zero-distill-7B-n16/20250222-102631/0.log`:
```
───────────────────────── Current Configuration Begin ──────────────────────────
actor (ModelTrainEvalConfig)
actor.type (ModelFamily)
actor.type._class (str) - qwen2
actor.type.size (int) - 7
actor.type.is_critic (bool) - False
...
────────────────────────── Current Configuration End ───────────────────────────
20250222-10:26:34.877 quickstart INFO: Running ppo-math experiment.
20250222-10:26:34.877 quickstart INFO: Logs will be dumped to /storage/ray/experiments/logs/root/ppo-zero-distill-7B-n16/1024x16-n16
20250222-10:26:34.877 quickstart INFO: Model checkpoints will be saved to /storage/ray/experiments/checkpoints/root/ppo-zero-distill-7B-n16/1024x16-n16
20250222-10:44:15.581 quickstart INFO: Logs will be dumped to /storage/ray/experiments/logs/root/ppo-7B-distill-gpus-128/512x16
20250222-10:44:15.581 quickstart INFO: Model checkpoints will be saved to /storage/ray/experiments/checkpoints/root/ppo-7B-distill-gpus-128/512x16
20250222-10:26:36.408 quickstart INFO: Launching experiments with RAY...
```
If errors occur during execution (e.g., the keyword "Error" appears), refer to the Troubleshooting section.
## Commandline Options
The `./examples/train_batch_{1.5/7}B_n{1/4/16}.sh` scripts contain pre-configured training parameters; all of these scripts ultimately launch training with the following command:
```bash
python3 -m realhf.apps.quickstart ppo-math option1=arg1 option2=arg2 ...
```
Command-line arguments like `option1=arg1` are parsed by [hydra](https://hydra.cc/), and each configuration item is a `dataclasses.dataclass` in the Python code. Use the following command to view all command-line arguments that can be passed to the experiment:
```bash
python3 -m realhf.apps.quickstart ppo-math --help
@ -293,16 +230,15 @@ python3 -m realhf.apps.quickstart ppo-math --help
The important parameters are described below:
+ mode: always `ray`; do not change it when following this tutorial.
+ {actor|critic|ref}.path: path of the model.
+ dataset.path: path of the dataset jsonl file.
+ external_configs.cluster_config: cluster configuration, e.g. `fileroot` is the root directory where training outputs are stored.
+ n_nodes: number of nodes.
+ n_gpus_per_node: number of GPUs per node.
+ allocation_mode: the GPU allocation and 3D parallelism strategy of the models in the experiment; the recommended strategy has the following form (see the sketch after this list for the arithmetic):
  + `sglang.d${DP1}m${TP1}p${PP1}+d${DP2}m${TP2}p${PP2}`: configures the parallelism strategies of SGLang generation and of training separately. Generation and training are disaggregated onto two disjoint sets of GPUs, and the GPU counts of the two parts must add up to the total number of GPUs, i.e. DP1xTP1xPP1+DP2xTP2xPP2=#GPUs.
+ exp_ctrl.total_train_epochs: number of training epochs (i.e., the number of passes over the entire dataset).
+ exp_ctrl.save_freq_{epochs|steps|secs}: frequency of saving model parameters to persistent storage; if set to null, no checkpoints will be saved.
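As a quick sanity check for `allocation_mode`, the product of the parallel degrees of each part must match the GPUs assigned to it, and the two parts together must cover all GPUs. A minimal standalone sketch (not an AReaL utility; the regular expression simply follows the `sglang.d..p..m..+d..p..m..` spelling used above):
```python
import re

def check_allocation_mode(mode: str, n_nodes: int, n_gpus_per_node: int) -> None:
    """Verify DP1*TP1*PP1 + DP2*TP2*PP2 == n_nodes * n_gpus_per_node (illustrative only)."""
    backend, spec = mode.split(".", 1)          # e.g. ("sglang", "d64p1m1+d32p2m1")
    counts = []
    for part in spec.split("+"):
        dims = {k: int(v) for k, v in re.findall(r"([dmp])(\d+)", part)}
        counts.append(dims["d"] * dims["m"] * dims["p"])   # d=data, m=tensor/model, p=pipeline
    total = n_nodes * n_gpus_per_node
    assert sum(counts) == total, f"{counts} does not sum to {total} GPUs"
    print(f"{backend} generation GPUs: {counts[0]}, training GPUs: {counts[1]}, total: {total}")

# Example from ppo-7B-distill-gpus-128.yaml: 64*1*1 + 32*2*1 = 128 = 16 nodes * 8 GPUs
check_allocation_mode("sglang.d64p1m1+d32p2m1", n_nodes=16, n_gpus_per_node=8)
```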
@ -323,7 +259,6 @@ python3 -m realhf.apps.quickstart ppo-math --help
Search the log for the Epoch keyword to see the total number of epochs and steps:
```bash
# grep "Epoch" /storage/ray/train_batch_logs/ppo-zero-distill-7B-n16/20250222-102631/0.log
(master_worker/0 pid=96390, ip=xxx.xxx.xxx.xxx) 20250222-11:11:56.997 master worker INFO: Epoch 1/1 step 1/19 (global step 1) finishes. Average #tokens per batch is 111847. #End to end# execution time: *2124.429*s. Total time consumption: 2283.862s.
(master_worker/0 pid=96390, ip=xxx.xxx.xxx.xxx) 20250222-11:52:02.719 master worker INFO: Epoch 1/1 step 2/19 (global step 2) finishes. Average #tokens per batch is 111847. #End to end# execution time: *2405.716*s. Total time consumption: 4689.584s.
(master_worker/0 pid=96390, ip=xxx.xxx.xxx.xxx) 20250222-12:27:25.084 master worker INFO: Epoch 1/1 step 3/19 (global step 3) finishes. Average #tokens per batch is 111847. #End to end# execution time: *2122.318*s. Total time consumption: 6811.949s. Estimated remaining time: 33957.093s.
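```

To summarize training progress without scrolling through the raw log, the per-step lines above can be parsed. A minimal sketch, assuming the `Epoch x/y step a/b (global step g) ... execution time: *N*s` format shown above; `summarize_steps` is a hypothetical helper, not part of AReaL:
```python
import re

STEP_RE = re.compile(
    r"Epoch (\d+)/(\d+) step (\d+)/(\d+) \(global step (\d+)\).*?"
    r"#End to end# execution time: \*([\d.]+)\*s"
)

def summarize_steps(log_path: str) -> None:
    """Print the global step index and the end-to-end execution time of each step."""
    with open(log_path) as f:
        for line in f:
            m = STEP_RE.search(line)
            if m:
                print(f"global step {m.group(5)}: {float(m.group(6)):.1f}s")

# e.g. summarize_steps("/storage/ray/train_batch_logs/ppo-zero-distill-7B-n16/20250222-102631/0.log")
```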
@ -346,7 +281,6 @@ python3 -m realhf.apps.quickstart ppo-math --help
Search the log for the `task_reward` keyword:
```bash
# grep "task_reward" /storage/ray/train_batch_logs/ppo-zero-distill-7B-n16/20250222-102631/0.log
(master_worker/0 pid=96390, ip=xxx.xxx.xxx.xxx) 20250222-11:11:56.991 master worker INFO: RPC name actor_train returns {'ppo_approx_kl': -2.2640759198111482e-05, 'actor_loss': 1.1128166761409375e-06, 'actor_clip_ratio': 2.1122002635820536e-07, 'importance_weight': 1.0000014305114746, 'task_reward': -0.2996826171875, 'kl_reward': -2.27004832709099e-07, 'final_reward': -0.30145370960235596, 'advantage': 0.003593671601265669, 'avg_seq_len': 7907.8955078125, 'avg_prompt_len': 105.845703125, 'n_tokens': 127828786.0, 'n_valid_tokens': 127828786.0, 'n_seqs': 16384.0, 'no_eos_ratio': 0.122802734375, 'disable_value': 1.0, 'mask_no_eos_with_zero': 0.0}
(master_worker/0 pid=96390, ip=xxx.xxx.xxx.xxx) 20250222-11:52:02.712 master worker INFO: RPC name actor_train returns {'ppo_approx_kl': -2.493159263394773e-05, 'actor_loss': -3.846728588996484e-07, 'actor_clip_ratio': 3.16789424914532e-07, 'importance_weight': 0.9999996423721313, 'task_reward': -0.6793212890625, 'kl_reward': -2.536311853873485e-07, 'final_reward': -0.6813737154006958, 'advantage': 0.004844569601118565, 'avg_seq_len': 8203.9453125, 'avg_prompt_len': 111.892578125, 'n_tokens': 132580185.0, 'n_valid_tokens': 132580185.0, 'n_seqs': 16384.0, 'no_eos_ratio': 0.13812255859375, 'disable_value': 1.0, 'mask_no_eos_with_zero': 0.0}
(master_worker/0 pid=96390, ip=xxx.xxx.xxx.xxx) 20250222-12:27:25.077 master worker INFO: RPC name actor_train returns {'ppo_approx_kl': -2.572356243035756e-05, 'actor_loss': -5.036404786551429e-07, 'actor_clip_ratio': 1.8960582792715286e-07, 'importance_weight': 0.9999992251396179, 'task_reward': -0.6280517578125, 'kl_reward': -2.988609537624143e-07, 'final_reward': -0.6303607225418091, 'advantage': 0.004505862481892109, 'avg_seq_len': 7834.6328125, 'avg_prompt_len': 108.900390625, 'n_tokens': 126578395.0, 'n_valid_tokens': 126578395.0, 'n_seqs': 16384.0, 'no_eos_ratio': 0.11761474609375, 'disable_value': 1.0, 'mask_no_eos_with_zero': 0.0}
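```

The reward curve can be pulled out of the same log in a similar way. This is an illustrative sketch (not an AReaL tool); it relies on the `RPC name actor_train returns {...}` dictionary format shown above and parses it with `ast.literal_eval`:
```python
import ast
import re

RETURN_RE = re.compile(r"RPC name actor_train returns (\{.*\})")

def extract_task_rewards(log_path: str) -> list[float]:
    """Collect the 'task_reward' value reported after every training step."""
    rewards = []
    with open(log_path) as f:
        for line in f:
            m = RETURN_RE.search(line)
            if m:
                stats = ast.literal_eval(m.group(1))
                rewards.append(stats["task_reward"])
    return rewards

# e.g. print(extract_task_rewards(".../0.log"))  # [-0.2997, -0.6793, -0.6281, ...]
```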
@ -371,7 +305,7 @@ python3 -m realhf.apps.quickstart ppo-math --help
Start a new container to run the evaluation script (evaluation needs to update some Python packages, so do not run it inside the training container):
```
docker run -d --name r1-eval --privileged --gpus all --network host --shm-size 700g -v /storage:/storage ghcr.io/inclusionai/areal-runtime:v0.2.0 /bin/bash -c "tail -f /dev/null"
docker exec -it r1-eval bash
```
@ -434,43 +368,22 @@ nohup python eval_and_aggregate.py \
If the content below does not answer your question, feel free to ask in a GitHub Issue.
## Automatic Recovery
### How to
When `recover_mode=auto` is set and the training configuration is identical to the previous run, AReaL will try to locate previously generated checkpoints and resume training from them.
If training is interrupted by an OOM error or a hardware failure, simply relaunch the same training command; with `recover_mode=auto` it will resume from the latest recover checkpoint.
If automatic recovery fails (that is, the restarted run trains from scratch instead of continuing from the last step), check the following possibilities:
+ The `experiment_name` or `trial_name` in the training configuration differs from the previous run.
+ The batch size (`dataset.train_bs_n_seqs`), group size (`group_size`), or number of nodes (`n_nodes`) has changed.
+ The previous run never created a recover checkpoint. By default, recover checkpoints follow two rules:
  + A recover checkpoint is only created after the 2nd step has completed.
  + A new recover checkpoint is created when a step finishes and more than 600 s have passed since the last one; this is controlled by `exp_ctrl.ckpt_freq_secs: 600` in the `./examples/configs/*/*.yaml` files.

You can confirm whether a recover checkpoint was ever created by searching for `Dumped recover`:
```bash
# grep "Dumped recover" /storage/ray/train_batch_logs/ppo-zero-distill-7B-n16/20250222-102631/0.log
(master_worker/0 pid=96390, ip=xxx.xxx.xxx.xxx) 20250222-11:52:02.760 master worker INFO: Dumped recover info to file.
(master_worker/0 pid=96390, ip=xxx.xxx.xxx.xxx) 20250222-12:27:25.105 master worker INFO: Dumped recover info to file.
(master_worker/0 pid=96390, ip=xxx.xxx.xxx.xxx) 20250222-13:05:58.264 master worker INFO: Dumped recover info to file.
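```

To verify that recover checkpoints are indeed produced at roughly the configured `exp_ctrl.ckpt_freq_secs` interval, you can compare the timestamps of the `Dumped recover` lines. A minimal sketch, assuming the `YYYYMMDD-HH:MM:SS.mmm` timestamp format shown above:
```python
import re
from datetime import datetime

TS_RE = re.compile(r"(\d{8}-\d{2}:\d{2}:\d{2}\.\d{3}).*Dumped recover info to file")

def recover_checkpoint_intervals(log_path: str) -> list[float]:
    """Return the number of seconds between consecutive recover checkpoints."""
    stamps = []
    with open(log_path) as f:
        for line in f:
            m = TS_RE.search(line)
            if m:
                stamps.append(datetime.strptime(m.group(1), "%Y%m%d-%H:%M:%S.%f"))
    return [(b - a).total_seconds() for a, b in zip(stamps, stamps[1:])]

# e.g. recover_checkpoint_intervals(".../0.log") -> [2122.3, 2313.2]
```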

View File

@ -1,15 +0,0 @@
{
"cluster_type": "slurm",
"cluster_name": "my_cluster",
"fileroot": "/storage/openpsi/experiments",
"default_mount": "/storage:/storage",
"node_type_from_node_name": {
"slurmd-\\d+$": ""
},
"gpu_type_from_node_name": {
"slurmd-\\d+$": "tesla"
},
"cpu_image": "/storage/images/real-gpu.sif",
"gpu_image": "/storage/images/real-gpu.sif",
"node_name_prefix": "slurmd-"
}

View File

@ -1,7 +0,0 @@
{
"cluster_type": "ray",
"cluster_name": "ray_cluster",
"fileroot": "/storage/ray/experiments",
"default_mount": "/storage:/storage",
"n_gpus_per_node": 8
}

View File

@ -0,0 +1,86 @@
experiment_name: ppo-1.5B-distill-gpus-128
trial_name: 512x16
mode: ray
wandb:
mode: disabled
recover_mode: auto
recover_retries: 10
allocation_mode: 'sglang.d64p1m1+d32p2m1'
n_nodes: 16
n_gpus_per_node: 8
cache_clear_freq: 1
exp_ctrl:
total_train_epochs: 5
save_freq_epochs: 1
ckpt_freq_secs: 600
torch_cache_mysophobia: true
actor:
type:
_class: qwen2
path: '/storage/models/DeepSeek-R1-Distill-Qwen-1.5B'
optimizer:
lr: 2e-05
lr_scheduler_type: constant
eps: 1e-5
warmup_steps_proportion: 0.001
hysteresis: 2
sglang:
mem_fraction_static: 0.8
triton_attention_num_kv_splits: 16
critic:
type:
_class: qwen2
is_critic: true
path: '/storage/models/DeepSeek-R1-Distill-Qwen-1.5B'
init_critic_from_actor: true
ref:
type:
_class: qwen2
path: '/storage/models/DeepSeek-R1-Distill-Qwen-1.5B'
actor_train:
mb_spec:
max_tokens_per_mb: 30720
critic_train:
mb_spec:
max_tokens_per_mb: 30720
actor_gen:
mb_spec:
max_tokens_per_mb: 30720
critic_inf:
mb_spec:
max_tokens_per_mb: 30720
actor_inf:
mb_spec:
max_tokens_per_mb: 30720
ref_inf:
mb_spec:
max_tokens_per_mb: 30720
dataset:
path: '/storage/datasets/boba_106k_0319.jsonl'
max_prompt_len: 1024
train_bs_n_seqs: 512
ppo:
gen:
max_new_tokens: 27648
min_new_tokens: 0
top_p: 1.0
top_k: 1000000
temperature: 1.0
force_no_logits_mask: True
use_cuda_graph: True
ppo_n_minibatches: 4
kl_ctl: 0.0
discount: 1.0
value_eps_clip: 0.2
disable_value: true
reward_output_scaling: 5
reward_output_bias: 0.0
adv_norm: true
value_norm: true
group_size: 16
group_adv_norm: false
external_configs:
cluster_config:
fileroot: "/storage/ray/experiments"
envs:
REAL_GPU_MEMORY_KILL_THRESHOLD: "1"

View File

@ -0,0 +1,86 @@
experiment_name: ppo-1.5B-distill-gpus-32
trial_name: 512x16
mode: ray
wandb:
mode: disabled
recover_mode: auto
recover_retries: 10
allocation_mode: 'sglang.d16p1m1+d8p2m1'
n_nodes: 4
n_gpus_per_node: 8
cache_clear_freq: 1
exp_ctrl:
total_train_epochs: 5
save_freq_epochs: 1
ckpt_freq_secs: 600
torch_cache_mysophobia: true
actor:
type:
_class: qwen2
path: '/storage/models/DeepSeek-R1-Distill-Qwen-1.5B'
optimizer:
lr: 2e-05
lr_scheduler_type: constant
eps: 1e-5
warmup_steps_proportion: 0.001
hysteresis: 2
sglang:
mem_fraction_static: 0.8
triton_attention_num_kv_splits: 16
critic:
type:
_class: qwen2
is_critic: true
path: '/storage/models/DeepSeek-R1-Distill-Qwen-1.5B'
init_critic_from_actor: true
ref:
type:
_class: qwen2
path: '/storage/models/DeepSeek-R1-Distill-Qwen-1.5B'
actor_train:
mb_spec:
max_tokens_per_mb: 30720
critic_train:
mb_spec:
max_tokens_per_mb: 30720
actor_gen:
mb_spec:
max_tokens_per_mb: 30720
critic_inf:
mb_spec:
max_tokens_per_mb: 30720
actor_inf:
mb_spec:
max_tokens_per_mb: 30720
ref_inf:
mb_spec:
max_tokens_per_mb: 30720
dataset:
path: '/storage/datasets/boba_106k_0319.jsonl'
max_prompt_len: 1024
train_bs_n_seqs: 512
ppo:
gen:
max_new_tokens: 27648
min_new_tokens: 0
top_p: 1.0
top_k: 1000000
temperature: 1.0
force_no_logits_mask: True
use_cuda_graph: True
ppo_n_minibatches: 4
kl_ctl: 0.0
discount: 1.0
value_eps_clip: 0.2
disable_value: true
reward_output_scaling: 5
reward_output_bias: 0.0
adv_norm: true
value_norm: true
group_size: 16
group_adv_norm: false
external_configs:
cluster_config:
fileroot: "/storage/ray/experiments"
envs:
REAL_GPU_MEMORY_KILL_THRESHOLD: "1"

View File

@ -1,16 +1,16 @@
experiment_name: areal-1.5B-distill-gpus-8
trial_name: 1024x8
experiment_name: ppo-1.5B-distill-gpus-8
trial_name: 512x16
mode: ray
wandb:
mode: disabled
recover_mode: auto
recover_retries: 10
allocation_mode: 'sglang.d4p1m1+d4p1m1'
allocation_mode: 'sglang.d4p1m1+d2p2m1'
n_nodes: 1
n_gpus_per_node: 8
cache_clear_freq: 1
exp_ctrl:
total_train_epochs: 10
total_train_epochs: 5
save_freq_epochs: 1
ckpt_freq_secs: 600
torch_cache_mysophobia: true
@ -19,16 +19,14 @@ actor:
_class: qwen2
path: '/storage/models/DeepSeek-R1-Distill-Qwen-1.5B'
optimizer:
lr: 1.0e-05
lr: 2e-05
lr_scheduler_type: constant
initial_loss_scale: 262144.0
loss_scale_window: 5.0
eps: 1e-5
warmup_steps_proportion: 0.001
hysteresis: 2
sglang:
disable_radix_cache: true
context_length: 18432
mem_fraction_static: 0.8
max_running_requests: 128
triton_attention_num_kv_splits: 16
critic:
type:
_class: qwen2
@ -41,43 +39,45 @@ ref:
path: '/storage/models/DeepSeek-R1-Distill-Qwen-1.5B'
actor_train:
mb_spec:
max_tokens_per_mb: 19456
max_tokens_per_mb: 30720
critic_train:
mb_spec:
max_tokens_per_mb: 19456
max_tokens_per_mb: 30720
actor_gen:
mb_spec:
max_tokens_per_mb: 19456
max_tokens_per_mb: 30720
critic_inf:
mb_spec:
max_tokens_per_mb: 19456
max_tokens_per_mb: 30720
actor_inf:
mb_spec:
max_tokens_per_mb: 19456
max_tokens_per_mb: 30720
ref_inf:
mb_spec:
max_tokens_per_mb: 19456
max_tokens_per_mb: 30720
dataset:
path: '/storage/datasets/prompts_for_r1_distilled_0319.jsonl'
max_prompt_len: 2048
train_bs_n_seqs: 1024
path: '/storage/datasets/boba_106k_0319.jsonl'
max_prompt_len: 1024
train_bs_n_seqs: 512
ppo:
gen:
max_new_tokens: 16384
max_new_tokens: 27648
min_new_tokens: 0
top_p: 1.0
top_k: 1000000
temperature: 1.0
force_no_logits_mask: True
use_cuda_graph: True
ppo_n_minibatches: 4
kl_ctl: 0.001
kl_ctl: 0.0
discount: 1.0
value_eps_clip: 0.2
disable_value: true
reward_output_scaling: 5.0
reward_output_scaling: 5
reward_output_bias: 0.0
adv_norm: true
value_norm: true
group_size: 8
group_size: 16
group_adv_norm: false
external_configs:
cluster_config:

View File

@ -0,0 +1,86 @@
experiment_name: ppo-32B-distill-gpus-128
trial_name: 512x32
mode: ray
wandb:
mode: disabled
recover_mode: auto
recover_retries: 10
allocation_mode: 'sglang.d8m8p1+d4p4m4'
n_nodes: 16
n_gpus_per_node: 8
cache_clear_freq: 1
exp_ctrl:
total_train_epochs: 5
save_freq_epochs: 1
ckpt_freq_secs: 600
torch_cache_mysophobia: true
actor:
type:
_class: qwen2
path: '/storage/models/DeepSeek-R1-Distill-Qwen-32B'
optimizer:
lr: 2e-05
lr_scheduler_type: constant
eps: 1e-5
warmup_steps_proportion: 0.001
hysteresis: 2
sglang:
mem_fraction_static: 0.8
triton_attention_num_kv_splits: 16
critic:
type:
_class: qwen2
is_critic: true
path: '/storage/models/DeepSeek-R1-Distill-Qwen-32B'
init_critic_from_actor: true
ref:
type:
_class: qwen2
path: '/storage/models/DeepSeek-R1-Distill-Qwen-32B'
actor_train:
mb_spec:
max_tokens_per_mb: 30720
critic_train:
mb_spec:
max_tokens_per_mb: 30720
actor_gen:
mb_spec:
max_tokens_per_mb: 30720
critic_inf:
mb_spec:
max_tokens_per_mb: 30720
actor_inf:
mb_spec:
max_tokens_per_mb: 30720
ref_inf:
mb_spec:
max_tokens_per_mb: 30720
dataset:
path: '/storage/datasets/boba_106k_0319.jsonl'
max_prompt_len: 1024
train_bs_n_seqs: 512
ppo:
gen:
max_new_tokens: 27648
min_new_tokens: 0
top_p: 1.0
top_k: 1000000
temperature: 1.0
force_no_logits_mask: True
use_cuda_graph: True
ppo_n_minibatches: 4
kl_ctl: 0.0
discount: 1.0
value_eps_clip: 0.2
disable_value: true
reward_output_scaling: 5
reward_output_bias: 0.0
adv_norm: true
value_norm: true
group_size: 32
group_adv_norm: false
external_configs:
cluster_config:
fileroot: "/storage/ray/experiments"
envs:
REAL_GPU_MEMORY_KILL_THRESHOLD: "1"

View File

@ -0,0 +1,86 @@
experiment_name: ppo-7B-distill-gpus-128
trial_name: 512x16
mode: ray
wandb:
mode: disabled
recover_mode: auto
recover_retries: 10
allocation_mode: 'sglang.d64p1m1+d32p2m1'
n_nodes: 16
n_gpus_per_node: 8
cache_clear_freq: 1
exp_ctrl:
total_train_epochs: 5
save_freq_epochs: 1
ckpt_freq_secs: 600
torch_cache_mysophobia: true
actor:
type:
_class: qwen2
path: '/storage/models/DeepSeek-R1-Distill-Qwen-7B'
optimizer:
lr: 2e-05
lr_scheduler_type: constant
eps: 1e-5
warmup_steps_proportion: 0.001
hysteresis: 2
sglang:
mem_fraction_static: 0.8
triton_attention_num_kv_splits: 16
critic:
type:
_class: qwen2
is_critic: true
path: '/storage/models/DeepSeek-R1-Distill-Qwen-7B'
init_critic_from_actor: true
ref:
type:
_class: qwen2
path: '/storage/models/DeepSeek-R1-Distill-Qwen-7B'
actor_train:
mb_spec:
max_tokens_per_mb: 30720
critic_train:
mb_spec:
max_tokens_per_mb: 30720
actor_gen:
mb_spec:
max_tokens_per_mb: 30720
critic_inf:
mb_spec:
max_tokens_per_mb: 30720
actor_inf:
mb_spec:
max_tokens_per_mb: 30720
ref_inf:
mb_spec:
max_tokens_per_mb: 30720
dataset:
path: '/storage/datasets/boba_106k_0319.jsonl'
max_prompt_len: 1024
train_bs_n_seqs: 512
ppo:
gen:
max_new_tokens: 27648
min_new_tokens: 0
top_p: 1.0
top_k: 1000000
temperature: 1.0
force_no_logits_mask: True
use_cuda_graph: True
ppo_n_minibatches: 4
kl_ctl: 0.0
discount: 1.0
value_eps_clip: 0.2
disable_value: true
reward_output_scaling: 5
reward_output_bias: 0.0
adv_norm: true
value_norm: true
group_size: 16
group_adv_norm: false
external_configs:
cluster_config:
fileroot: "/storage/ray/experiments"
envs:
REAL_GPU_MEMORY_KILL_THRESHOLD: "1"

View File

@ -0,0 +1,86 @@
experiment_name: ppo-7B-distill-gpus-32
trial_name: 512x16
mode: ray
wandb:
mode: disabled
recover_mode: auto
recover_retries: 10
allocation_mode: 'sglang.d16p1m1+d8p2m1'
n_nodes: 4
n_gpus_per_node: 8
cache_clear_freq: 1
exp_ctrl:
total_train_epochs: 5
save_freq_epochs: 1
ckpt_freq_secs: 600
torch_cache_mysophobia: true
actor:
type:
_class: qwen2
path: '/storage/models/DeepSeek-R1-Distill-Qwen-7B'
optimizer:
lr: 2e-05
lr_scheduler_type: constant
eps: 1e-5
warmup_steps_proportion: 0.001
hysteresis: 2
sglang:
mem_fraction_static: 0.8
triton_attention_num_kv_splits: 16
critic:
type:
_class: qwen2
is_critic: true
path: '/storage/models/DeepSeek-R1-Distill-Qwen-7B'
init_critic_from_actor: true
ref:
type:
_class: qwen2
path: '/storage/models/DeepSeek-R1-Distill-Qwen-7B'
actor_train:
mb_spec:
max_tokens_per_mb: 30720
critic_train:
mb_spec:
max_tokens_per_mb: 30720
actor_gen:
mb_spec:
max_tokens_per_mb: 30720
critic_inf:
mb_spec:
max_tokens_per_mb: 30720
actor_inf:
mb_spec:
max_tokens_per_mb: 30720
ref_inf:
mb_spec:
max_tokens_per_mb: 30720
dataset:
path: '/storage/datasets/boba_106k_0319.jsonl'
max_prompt_len: 1024
train_bs_n_seqs: 512
ppo:
gen:
max_new_tokens: 27648
min_new_tokens: 0
top_p: 1.0
top_k: 1000000
temperature: 1.0
force_no_logits_mask: True
use_cuda_graph: True
ppo_n_minibatches: 4
kl_ctl: 0.0
discount: 1.0
value_eps_clip: 0.2
disable_value: true
reward_output_scaling: 5
reward_output_bias: 0.0
adv_norm: true
value_norm: true
group_size: 16
group_adv_norm: false
external_configs:
cluster_config:
fileroot: "/storage/ray/experiments"
envs:
REAL_GPU_MEMORY_KILL_THRESHOLD: "1"

View File

@ -1,48 +0,0 @@
# MODEL_FAMILY specifies how the pretrained checkpoint is loaded, e.g., as a LLaMA model or a GPT model.
MODEL_FAMILY=qwen2
# PRETRAINED_PATH is the HuggingFace checkpoint.
PRETRAINED_PATH=/storage/openpsi/models/Qwen__Qwen2.5-7B-Instruct
TRAIN_DATA_PATH=/storage/openpsi/data/ppu_test_data/examples_test_data/sft_pos-train.jsonl
VALID_DATA_PATH=/storage/openpsi/data/ppu_test_data/examples_test_data/sft_pos-train.jsonl
# Option 1: The experiment runs locally with subprocesses.
# MODE=local
# Option 2: The experiment runs in a Ray cluster
# MODE=ray
# Option 3: The experiment runs in a SLURM + pyxis cluster
# Using the slurm mode requires a cluster spec file
# and setting CLUSTER_SPEC_PATH to the path of it.
MODE=slurm
# `experiment_name` and `trial_name` can be arbitrary.
# Logs and saved checkpoints will be indexed by them.
EXP_NAME=quickstart-sft
TRIAL_NAME=$MODEL_FAMILY-$MODE-run1
# We use the "manual" allocation mode here to manually specify the parallelism strategy,
# which is pipeline=2, tensor-model=2, and data=2, using in total of 8 GPUs.
# The `sft` subcommand specifies that this is a supervised fine-tuning experiment.
export CLUSTER_SPEC_PATH="/storage/realhf/examples/cluster_config.json"
python3 -m realhf.apps.quickstart sft \
mode=$MODE \
experiment_name=$EXP_NAME \
trial_name=$TRIAL_NAME \
exp_ctrl.total_train_epochs=8 \
exp_ctrl.save_freq_steps=50 \
exp_ctrl.eval_freq_epochs=1 \
model.optimizer.type=adam \
model.optimizer.lr_scheduler_type=cosine \
model.optimizer.lr=1e-5 \
model.optimizer.warmup_steps_proportion=0.02 \
model.type._class=$MODEL_FAMILY \
model.path=$PRETRAINED_PATH \
dataset.train_path=${TRAIN_DATA_PATH} \
dataset.valid_path=${VALID_DATA_PATH} \
dataset.max_seqlen=1024 \
dataset.train_bs_n_seqs=512 \
dataset.valid_bs_n_seqs=512 \
allocation_mode=d4m4p2 \
n_nodes=4 n_gpus_per_node=8 \
allocation.mb_spec.n_mbs=2

View File

@ -1,122 +0,0 @@
#!/bin/sh
MODEL_FAMILY=qwen2
EXP_NAME="ds-r1-distill-qwen-1.5b-16nodes"
TRAIN_BATCH_SIZE="1024"
GROUP_SIZE="8"
NODES="16"
ALLOCATION_MODE="vllm.d64p1m1+d32p2m1"
MAX_NEW_TOKENS=$3
MAX_NUM_SEQS=128
PPO_MBS=4
KL_CTL=0.001
MAX_TOKEN_PER_MB=$(expr 2048 + ${MAX_NEW_TOKENS} + 1024)
MAX_SEQ_LEN_TO_CAPTURE=$(expr 2048 + ${MAX_NEW_TOKENS})
BASE_MODEL_PATH="$1"
# original data
DATA_PATH="$2"
# Option 1: The experiment runs locally with subprocesses.
# MODE=local
# Option 2: The experiment runs in a Ray cluster
# MODE=ray
# Option 3: The experiment runs in a SLURM + pyxis cluster
# Using the slurm mode requires a cluster spec file
# and setting CLUSTER_SPEC_PATH to the path of it.
MODE=ray
# `experiment_name` and `trial_name` can be arbitrary.
# Logs and saved checkpoints will be indexed by them.
#EXP_NAME=ppo-zero--${MODEL_NAME}--${DATASET_NAME}
#EXP_NAME=ppo-zero-distill-1.5B-default
TRIAL_NAME="${TRAIN_BATCH_SIZE}x${GROUP_SIZE}-n${NODES}"
# We use the "heuristic" allocation mode here to automatically determine the parallelism strategy
# for each model function call, i.e., actor generation, critic inference, actor train, etc.
# The number of GPUs is `n_nodes` * `n_gpus_per_node` (not set explictly here, defaults to 8).
# ReaL will make full use of these available GPUs to design allocations.
# This does not ensure the optimal throughput, but it is a good starting point.
# The `heuristic` allocation mode is not ensured to run with every model configurations.
# For example, if the vocabulary size is an odd number, the model parallelism may not work.
# In these cases, you can use the `ppo_manual.sh` to specify the parallelism strategy manually.
# The `ppo` subcommand specifies that this is a PPO experiment.
# The `save_freq_steps` is set to `null` to disable saving checkpoints.
# Enable it if you want to save checkpoints.
# The `ppo` option is used to control the generation and PPO algorithm hyperparameters.
# Note that the performance of PPO is sensitive to the the pre-trained model and hyperparameters.
# It's the user's responsibility to tune them appropriately.
unset CLUSTER_SPEC_PATH
CLUSTER_SPEC_PATH=/storage/ray/cluster_config_on_ray.json \
REAL_GPU_MEMORY_KILL_THRESHOLD=1 \
python3 -m realhf.apps.quickstart ppo-math \
mode=$MODE \
experiment_name=$EXP_NAME \
trial_name=$TRIAL_NAME \
wandb.mode=disabled \
exp_ctrl.total_train_epochs=10 \
exp_ctrl.save_freq_epochs=1 \
exp_ctrl.ckpt_freq_secs=600 \
group_size=${GROUP_SIZE} \
group_adv_norm=False \
use_dense_reward=False \
reward_delta=True \
rw_type=sparse \
check_xml_format=False \
actor.type._class=$MODEL_FAMILY \
actor.path=$BASE_MODEL_PATH \
actor.vllm.hybrid_train=False \
actor.vllm.enforce_eager=False \
actor.vllm.max_seq_len_to_capture=${MAX_SEQ_LEN_TO_CAPTURE} \
actor.vllm.max_num_seqs=${MAX_NUM_SEQS} \
actor.vllm.gpu_memory_utilization=1 \
actor.vllm.swap_space=64 \
critic.type._class=$MODEL_FAMILY \
critic.type.is_critic=True \
critic.init_critic_from_actor=True \
critic.path=$BASE_MODEL_PATH\
ref.type._class=$MODEL_FAMILY \
ref.path=$BASE_MODEL_PATH \
rew.type._class=$MODEL_FAMILY \
rew.type.is_critic=True \
rew.init_critic_from_actor=True \
rew.path=$BASE_MODEL_PATH \
dataset.path=$DATA_PATH \
dataset.max_prompt_len=2048 \
dataset.train_bs_n_seqs=${TRAIN_BATCH_SIZE} \
ppo.gen.max_new_tokens=${MAX_NEW_TOKENS} \
ppo.gen.min_new_tokens=0 \
ppo.disable_value=True \
ppo.gen.top_p=1 ppo.gen.top_k=1000000 \
ppo.gen.use_cuda_graph=True \
ppo.gen.force_no_logits_mask=True \
ppo.ppo_n_minibatches=${PPO_MBS} \
ppo.gen.temperature=0.6 \
ppo.kl_ctl=${KL_CTL} \
ppo.value_eps_clip=0.2 \
ppo.reward_output_scaling=5 \
ppo.reward_output_bias=0.0 \
ppo.adv_norm=True ppo.value_norm=True \
mask_too_long=False \
ppo.discount=1.0 \
actor.optimizer.lr=1e-5 \
actor.optimizer.lr_scheduler_type=constant \
actor.optimizer.initial_loss_scale=262144.0 \
actor.optimizer.loss_scale_window=5 \
actor.optimizer.hysteresis=2 \
actor_gen.mb_spec.max_tokens_per_mb=${MAX_TOKEN_PER_MB} \
ref_inf.mb_spec.max_tokens_per_mb=${MAX_TOKEN_PER_MB} \
rew_inf.mb_spec.max_tokens_per_mb=${MAX_TOKEN_PER_MB} \
critic_inf.mb_spec.max_tokens_per_mb=${MAX_TOKEN_PER_MB} \
actor_train.mb_spec.max_tokens_per_mb=${MAX_TOKEN_PER_MB} \
critic_train.mb_spec.max_tokens_per_mb=${MAX_TOKEN_PER_MB} \
cache_clear_freq=1 \
n_nodes=${NODES} \
allocation_mode="'${ALLOCATION_MODE}'" n_gpus_per_node=8 \
recover_mode=auto \
recover_retries=10 \
torch_cache_mysophobia=True

View File

@ -1,119 +0,0 @@
#!/bin/sh
EXP_NAME="ds-r1-distill-qwen-7b-zero-16nodes"
TRAIN_BATCH_SIZE="512"
GROUP_SIZE="64"
NODES="16"
ALLOCATION_MODE="vllm.d16p1m4+d32p2m1"
MAX_NEW_TOKENS=$3
MAX_NUM_SEQS=128
PPO_MBS=4
KL_CTL=0.0
MAX_TOKEN_PER_MB=$(expr 2048 + ${MAX_NEW_TOKENS} + 1024)
MAX_SEQ_LEN_TO_CAPTURE=$(expr 2048 + ${MAX_NEW_TOKENS})
BASE_MODEL_PATH="$1"
# original data
DATA_PATH="$2"
# Option 1: The experiment runs locally with subprocesses.
# MODE=local
# Option 2: The experiment runs in a Ray cluster
# MODE=ray
# Option 3: The experiment runs in a SLURM + pyxis cluster
# Using the slurm mode requires a cluster spec file
# and setting CLUSTER_SPEC_PATH to the path of it.
MODE=ray
# `experiment_name` and `trial_name` can be arbitrary.
# Logs and saved checkpoints will be indexed by them.
#EXP_NAME=ppo-zero--${MODEL_NAME}--${DATASET_NAME}
#EXP_NAME=ppo-zero-distill-1.5B-default
TRIAL_NAME="${TRAIN_BATCH_SIZE}x${GROUP_SIZE}-n${NODES}"
# We use the "heuristic" allocation mode here to automatically determine the parallelism strategy
# for each model function call, i.e., actor generation, critic inference, actor train, etc.
# The number of GPUs is `n_nodes` * `n_gpus_per_node` (not set explictly here, defaults to 8).
# ReaL will make full use of these available GPUs to design allocations.
# This does not ensure the optimal throughput, but it is a good starting point.
# The `heuristic` allocation mode is not ensured to run with every model configurations.
# For example, if the vocabulary size is an odd number, the model parallelism may not work.
# In these cases, you can use the `ppo_manual.sh` to specify the parallelism strategy manually.
# The `ppo` subcommand specifies that this is a PPO experiment.
# The `save_freq_steps` is set to `null` to disable saving checkpoints.
# Enable it if you want to save checkpoints.
# The `ppo` option is used to control the generation and PPO algorithm hyperparameters.
# Note that the performance of PPO is sensitive to the the pre-trained model and hyperparameters.
# It's the user's responsibility to tune them appropriately.
unset CLUSTER_SPEC_PATH
CLUSTER_SPEC_PATH=/storage/ray/cluster_config_on_ray.json \
REAL_GPU_MEMORY_KILL_THRESHOLD=1 \
python3 -m realhf.apps.quickstart ppo-math \
mode=$MODE \
experiment_name=$EXP_NAME \
trial_name=$TRIAL_NAME \
wandb.mode=disabled \
exp_ctrl.total_train_epochs=10 \
exp_ctrl.save_freq_epochs=1 \
exp_ctrl.ckpt_freq_secs=600 \
group_size=${GROUP_SIZE} \
group_adv_norm=False \
use_dense_reward=False \
reward_delta=True \
rw_type=sparse \
check_xml_format=False \
actor.type._class=$MODEL_FAMILY \
actor.path=$BASE_MODEL_PATH \
actor.vllm.hybrid_train=False \
actor.vllm.enforce_eager=False \
actor.vllm.max_seq_len_to_capture=${MAX_SEQ_LEN_TO_CAPTURE} \
actor.vllm.max_num_seqs=${MAX_NUM_SEQS} \
actor.vllm.gpu_memory_utilization=1 \
actor.vllm.swap_space=64 \
critic.type._class=$MODEL_FAMILY \
critic.type.is_critic=True \
critic.init_critic_from_actor=True \
critic.path=$BASE_MODEL_PATH\
ref.type._class=$MODEL_FAMILY \
ref.path=$BASE_MODEL_PATH \
rew.type._class=$MODEL_FAMILY \
rew.type.is_critic=True \
rew.init_critic_from_actor=True \
rew.path=$BASE_MODEL_PATH \
dataset.path=$DATA_PATH \
dataset.max_prompt_len=2048 \
dataset.train_bs_n_seqs=${TRAIN_BATCH_SIZE} \
ppo.gen.max_new_tokens=${MAX_NEW_TOKENS} \
ppo.gen.min_new_tokens=0 \
ppo.disable_value=True \
ppo.gen.top_p=1 ppo.gen.top_k=1000000 \
ppo.gen.use_cuda_graph=True \
ppo.gen.force_no_logits_mask=True \
ppo.ppo_n_minibatches=${PPO_MBS} \
ppo.gen.temperature=1 \
ppo.kl_ctl=${KL_CTL} \
ppo.value_eps_clip=0.2 \
ppo.reward_output_scaling=0.5 \
ppo.reward_output_bias=-1.0 \
ppo.adv_norm=True ppo.value_norm=True \
mask_too_long=False \
ppo.discount=1.0 \
actor.optimizer.lr=1e-6 \
critic.optimizer.lr=5e-6 \
actor.optimizer.lr_scheduler_type=constant \
actor_gen.mb_spec.max_tokens_per_mb=${MAX_TOKEN_PER_MB} \
ref_inf.mb_spec.max_tokens_per_mb=${MAX_TOKEN_PER_MB} \
rew_inf.mb_spec.max_tokens_per_mb=${MAX_TOKEN_PER_MB} \
critic_inf.mb_spec.max_tokens_per_mb=${MAX_TOKEN_PER_MB} \
actor_train.mb_spec.max_tokens_per_mb=${MAX_TOKEN_PER_MB} \
critic_train.mb_spec.max_tokens_per_mb=${MAX_TOKEN_PER_MB} \
cache_clear_freq=1 \
n_nodes=${NODES} \
allocation_mode="'${ALLOCATION_MODE}'" n_gpus_per_node=8 \
recover_mode=auto \
recover_retries=10 \
torch_cache_mysophobia=True

View File

@ -1,52 +0,0 @@
#!/bin/bash
SCRIPT_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
EXP_NAME=ppo-zero-distill-1.5B-n1
MODEL_NAME="DeepSeek-R1-Distill-Qwen-1.5B"
DATASET_NAME="full_prompts_for_r1_distilled.jsonl"
NODES=1
ALLOCATION_MODE="actor_gen:d4p1m2,*:d4p2m1"
LOG_DIR="/storage/ray/train_batch_logs/${EXP_NAME}/$(date +'%Y%m%d-%H%M%S')"
mkdir -p ${LOG_DIR}
echo "Log Dir: ${LOG_DIR}"
MAX_WORKERS=$(expr 1 / ${NODES})
FIFO_NAME=$(mktemp -u)
mkfifo "$FIFO_NAME"
exec 3<>"$FIFO_NAME"
rm -f "$FIFO_NAME"
for ((i=0; i<MAX_WORKERS; i++)); do
echo >&3
done
ALL_PARAMS=(
"${EXP_NAME} ${MODEL_NAME} ${DATASET_NAME} 1024 8 ${NODES} ${ALLOCATION_MODE} 16384 128 1 0.001"
#"${EXP_NAME} ${MODEL_NAME} ${DATASET_NAME} 1024 8 ${NODES} ${ALLOCATION_MODE} 16384 128 1 0.001"
#"${EXP_NAME} ${MODEL_NAME} ${DATASET_NAME} 1024 8 ${NODES} ${ALLOCATION_MODE} 16384 128 1 0.001"
)
echo "Task Count: ${#ALL_PARAMS[@]}"
for ((i=0; i<${#ALL_PARAMS[@]}; i++)); do
read -u3
{
echo "$(date +"%Y-%m-%d %H:%M.%S") Task $i started: ${ALL_PARAMS[$i]}"
bash -c "bash ${SCRIPT_DIR}/train_tiny_on_ray.sh ${ALL_PARAMS[$i]} &> ${LOG_DIR}/${i}.log"
echo "$(date +"%Y-%m-%d %H:%M.%S") Task $i completed with exit code: $?, ${ALL_PARAMS[$i]}"
#sleep 120
echo >&3
} &
#sleep 120
done
wait
exec 3>&-
echo "All tasks completed"

View File

@ -1,52 +0,0 @@
#!/bin/bash
SCRIPT_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
EXP_NAME=ppo-zero-distill-1.5B-n16
MODEL_NAME="DeepSeek-R1-Distill-Qwen-1.5B"
DATASET_NAME="full_prompts_for_r1_distilled.jsonl"
NODES=16
ALLOCATION_MODE="vllm.d64p1m1+d32p2m1"
LOG_DIR="/storage/ray/train_batch_logs/${EXP_NAME}/$(date +'%Y%m%d-%H%M%S')"
mkdir -p ${LOG_DIR}
echo "Log Dir: ${LOG_DIR}"
MAX_WORKERS=$(expr 16 / ${NODES})
FIFO_NAME=$(mktemp -u)
mkfifo "$FIFO_NAME"
exec 3<>"$FIFO_NAME"
rm -f "$FIFO_NAME"
for ((i=0; i<MAX_WORKERS; i++)); do
echo >&3
done
ALL_PARAMS=(
"${EXP_NAME} ${MODEL_NAME} ${DATASET_NAME} 1024 8 ${NODES} ${ALLOCATION_MODE} 16384 128 1 0.001"
#"${EXP_NAME} ${MODEL_NAME} ${DATASET_NAME} 1024 8 ${NODES} ${ALLOCATION_MODE} 16384 128 1 0.001"
#"${EXP_NAME} ${MODEL_NAME} ${DATASET_NAME} 1024 8 ${NODES} ${ALLOCATION_MODE} 16384 128 1 0.001"
)
echo "Task Count: ${#ALL_PARAMS[@]}"
for ((i=0; i<${#ALL_PARAMS[@]}; i++)); do
read -u3
{
echo "$(date +"%Y-%m-%d %H:%M.%S") Task $i started: ${ALL_PARAMS[$i]}"
bash -c "bash ${SCRIPT_DIR}/train_small_on_ray.sh ${ALL_PARAMS[$i]} &> ${LOG_DIR}/${i}.log"
echo "$(date +"%Y-%m-%d %H:%M.%S") Task $i completed with exit code: $?, ${ALL_PARAMS[$i]}"
sleep 120
echo >&3
} &
sleep 120
done
wait
exec 3>&-
echo "All tasks completed"

View File

@ -1,52 +0,0 @@
#!/bin/bash
SCRIPT_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
EXP_NAME=ppo-zero-distill-1.5B-n4
MODEL_NAME="DeepSeek-R1-Distill-Qwen-1.5B"
DATASET_NAME="full_prompts_for_r1_distilled.jsonl"
NODES=4
ALLOCATION_MODE="vllm.d16p1m1+d8p2m1"
LOG_DIR="/storage/ray/train_batch_logs/${EXP_NAME}/$(date +'%Y%m%d-%H%M%S')"
mkdir -p ${LOG_DIR}
echo "Log Dir: ${LOG_DIR}"
MAX_WORKERS=$(expr 4 / ${NODES})
FIFO_NAME=$(mktemp -u)
mkfifo "$FIFO_NAME"
exec 3<>"$FIFO_NAME"
rm -f "$FIFO_NAME"
for ((i=0; i<MAX_WORKERS; i++)); do
echo >&3
done
ALL_PARAMS=(
"${EXP_NAME} ${MODEL_NAME} ${DATASET_NAME} 1024 8 ${NODES} ${ALLOCATION_MODE} 16384 128 1 0.001"
#"${EXP_NAME} ${MODEL_NAME} ${DATASET_NAME} 1024 8 ${NODES} ${ALLOCATION_MODE} 16384 128 1 0.001"
#"${EXP_NAME} ${MODEL_NAME} ${DATASET_NAME} 1024 8 ${NODES} ${ALLOCATION_MODE} 16384 128 1 0.001"
)
echo "Task Count: ${#ALL_PARAMS[@]}"
for ((i=0; i<${#ALL_PARAMS[@]}; i++)); do
read -u3
{
echo "$(date +"%Y-%m-%d %H:%M.%S") Task $i started: ${ALL_PARAMS[$i]}"
bash -c "bash ${SCRIPT_DIR}/train_small_on_ray.sh ${ALL_PARAMS[$i]} &> ${LOG_DIR}/${i}.log"
echo "$(date +"%Y-%m-%d %H:%M.%S") Task $i completed with exit code: $?, ${ALL_PARAMS[$i]}"
sleep 120
echo >&3
} &
sleep 120
done
wait
exec 3>&-
echo "All tasks completed"

View File

@ -1,52 +0,0 @@
#!/bin/bash
SCRIPT_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
EXP_NAME=ppo-zero-distill-7B-n16
MODEL_NAME="DeepSeek-R1-Distill-Qwen-7B"
DATASET_NAME="full_prompts_for_r1_distilled.jsonl"
NODES=16
ALLOCATION_MODE="vllm.d16p1m4+d32p2m1"
LOG_DIR="/storage/ray/train_batch_logs/${EXP_NAME}/$(date +'%Y%m%d-%H%M%S')"
mkdir -p ${LOG_DIR}
echo "Log Dir: ${LOG_DIR}"
MAX_WORKERS=$(expr 16 / ${NODES})
FIFO_NAME=$(mktemp -u)
mkfifo "$FIFO_NAME"
exec 3<>"$FIFO_NAME"
rm -f "$FIFO_NAME"
for ((i=0; i<MAX_WORKERS; i++)); do
echo >&3
done
ALL_PARAMS=(
"${EXP_NAME} ${MODEL_NAME} ${DATASET_NAME} 1024 16 ${NODES} ${ALLOCATION_MODE} 16384 128 4 0.01"
#"${EXP_NAME} ${MODEL_NAME} ${DATASET_NAME} 1024 16 ${NODES} ${ALLOCATION_MODE} 16384 128 4 0.01"
#"${EXP_NAME} ${MODEL_NAME} ${DATASET_NAME} 1024 16 ${NODES} ${ALLOCATION_MODE} 16384 128 4 0.01"
)
echo "Task Count: ${#ALL_PARAMS[@]}"
for ((i=0; i<${#ALL_PARAMS[@]}; i++)); do
read -u3
{
echo "$(date +"%Y-%m-%d %H:%M.%S") Task $i started: ${ALL_PARAMS[$i]}"
bash -c "bash ${SCRIPT_DIR}/train_small_on_ray.sh ${ALL_PARAMS[$i]} &> ${LOG_DIR}/${i}.log"
echo "$(date +"%Y-%m-%d %H:%M.%S") Task $i completed with exit code: $?, ${ALL_PARAMS[$i]}"
sleep 120
echo >&3
} &
sleep 120
done
wait
exec 3>&-
echo "All tasks completed"

View File

@ -1,52 +0,0 @@
#!/bin/bash
SCRIPT_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
EXP_NAME=ppo-zero-distill-7B-n4
MODEL_NAME="DeepSeek-R1-Distill-Qwen-7B"
DATASET_NAME="prompts_for_r1_distilled.jsonl"
NODES=4
ALLOCATION_MODE="vllm.d4p1m4+d8p2m1"
LOG_DIR="/storage/ray/train_batch_logs/${EXP_NAME}/$(date +'%Y%m%d-%H%M%S')"
mkdir -p ${LOG_DIR}
echo "Log Dir: ${LOG_DIR}"
MAX_WORKERS=$(expr 4 / ${NODES})
FIFO_NAME=$(mktemp -u)
mkfifo "$FIFO_NAME"
exec 3<>"$FIFO_NAME"
rm -f "$FIFO_NAME"
for ((i=0; i<MAX_WORKERS; i++)); do
echo >&3
done
ALL_PARAMS=(
"${EXP_NAME} ${MODEL_NAME} ${DATASET_NAME} 1024 16 ${NODES} ${ALLOCATION_MODE} 16384 128 4 0.01"
#"${EXP_NAME} ${MODEL_NAME} ${DATASET_NAME} 1024 16 ${NODES} ${ALLOCATION_MODE} 16384 128 4 0.01"
#"${EXP_NAME} ${MODEL_NAME} ${DATASET_NAME} 1024 16 ${NODES} ${ALLOCATION_MODE} 16384 128 4 0.01"
)
echo "Task Count: ${#ALL_PARAMS[@]}"
for ((i=0; i<${#ALL_PARAMS[@]}; i++)); do
read -u3
{
echo "$(date +"%Y-%m-%d %H:%M.%S") Task $i started: ${ALL_PARAMS[$i]}"
bash -c "bash ${SCRIPT_DIR}/train_small_on_ray.sh ${ALL_PARAMS[$i]} &> ${LOG_DIR}/${i}.log"
echo "$(date +"%Y-%m-%d %H:%M.%S") Task $i completed with exit code: $?, ${ALL_PARAMS[$i]}"
sleep 120
echo >&3
} &
sleep 120
done
wait
exec 3>&-
echo "All tasks completed"

View File

@ -1,52 +0,0 @@
#!/bin/bash
SCRIPT_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
EXP_NAME=ppo-zero-7B-zero-n16
MODEL_NAME="Qwen2.5-7B"
DATASET_NAME="full_orz_zero.jsonl"
NODES=16
ALLOCATION_MODE="vllm.d64p1m1+d32p2m1"
LOG_DIR="/storage/ray/train_batch_logs/${EXP_NAME}/$(date +'%Y%m%d-%H%M%S')"
mkdir -p ${LOG_DIR}
echo "Log Dir: ${LOG_DIR}"
MAX_WORKERS=$(expr 16 / ${NODES})
FIFO_NAME=$(mktemp -u)
mkfifo "$FIFO_NAME"
exec 3<>"$FIFO_NAME"
rm -f "$FIFO_NAME"
for ((i=0; i<MAX_WORKERS; i++)); do
echo >&3
done
ALL_PARAMS=(
"${EXP_NAME} ${MODEL_NAME} ${DATASET_NAME} 512 64 ${NODES} ${ALLOCATION_MODE} 16384 128 4 0.0"
#"${EXP_NAME} ${MODEL_NAME} ${DATASET_NAME} 512 64 ${NODES} ${ALLOCATION_MODE} 16384 128 4 0.0"
#"${EXP_NAME} ${MODEL_NAME} ${DATASET_NAME} 512 64 ${NODES} ${ALLOCATION_MODE} 16384 128 4 0.0"
)
echo "Task Count: ${#ALL_PARAMS[@]}"
for ((i=0; i<${#ALL_PARAMS[@]}; i++)); do
read -u3
{
echo "$(date +"%Y-%m-%d %H:%M.%S") Task $i started: ${ALL_PARAMS[$i]}"
bash -c "bash ${SCRIPT_DIR}/train_zero_on_ray.sh ${ALL_PARAMS[$i]} &> ${LOG_DIR}/${i}.log"
echo "$(date +"%Y-%m-%d %H:%M.%S") Task $i completed with exit code: $?, ${ALL_PARAMS[$i]}"
sleep 120
echo >&3
} &
sleep 120
done
wait
exec 3>&-
echo "All tasks completed"

View File

@ -1,115 +0,0 @@
#!/bin/sh
MODEL_FAMILY=qwen2
EXP_NAME="$1"
MODEL_NAME="$2"
DATASET_NAME="$3"
TRAIN_BATCH_SIZE="$4"
GROUP_SIZE="$5"
NODES="$6"
ALLOCATION_MODE="$7"
MAX_NEW_TOKENS=$8
MAX_NUM_SEQS=$9
PPO_MBS=${10}
KL_CTL=${11}
MAX_TOKEN_PER_MB=$(expr 2048 + ${MAX_NEW_TOKENS} + 1024)
MAX_SEQ_LEN_TO_CAPTURE=$(expr 2048 + ${MAX_NEW_TOKENS})
BASE_MODEL_PATH="/storage/models/${MODEL_NAME}"
# original data
DATA_PATH="/storage/datasets/${DATASET_NAME}"
# Option 1: The experiment runs locally with subprocesses.
# MODE=local
# Option 2: The experiment runs in a Ray cluster
# MODE=ray
# Option 3: The experiment runs in a SLURM + pyxis cluster
# Using the slurm mode requires a cluster spec file
# and setting CLUSTER_SPEC_PATH to the path of it.
MODE=ray
# `experiment_name` and `trial_name` can be arbitrary.
# Logs and saved checkpoints will be indexed by them.
#EXP_NAME=ppo-zero--${MODEL_NAME}--${DATASET_NAME}
#EXP_NAME=ppo-zero-distill-1.5B-default
TRIAL_NAME="${TRAIN_BATCH_SIZE}x${GROUP_SIZE}-n${NODES}"
# We use the "heuristic" allocation mode here to automatically determine the parallelism strategy
# for each model function call, i.e., actor generation, critic inference, actor train, etc.
# The number of GPUs is `n_nodes` * `n_gpus_per_node` (not set explictly here, defaults to 8).
# ReaL will make full use of these available GPUs to design allocations.
# This does not ensure the optimal throughput, but it is a good starting point.
# The `heuristic` allocation mode is not ensured to run with every model configurations.
# For example, if the vocabulary size is an odd number, the model parallelism may not work.
# In these cases, you can use the `ppo_manual.sh` to specify the parallelism strategy manually.
# The `ppo` subcommand specifies that this is a PPO experiment.
# The `save_freq_steps` is set to `null` to disable saving checkpoints.
# Enable it if you want to save checkpoints.
# The `ppo` option is used to control the generation and PPO algorithm hyperparameters.
# Note that the performance of PPO is sensitive to the the pre-trained model and hyperparameters.
# It's the user's responsibility to tune them appropriately.
unset CLUSTER_SPEC_PATH
CLUSTER_SPEC_PATH=/storage/ray/cluster_config_on_ray.json \
REAL_GPU_MEMORY_KILL_THRESHOLD=1 \
python3 -m realhf.apps.quickstart ppo-math \
mode=$MODE \
experiment_name=$EXP_NAME \
trial_name=$TRIAL_NAME \
wandb.mode=disabled \
exp_ctrl.total_train_epochs=10 \
exp_ctrl.save_freq_epochs=1 \
exp_ctrl.ckpt_freq_secs=600 \
group_size=${GROUP_SIZE} \
group_adv_norm=False \
rw_type=sparse \
check_xml_format=False \
actor.type._class=$MODEL_FAMILY \
actor.path=$BASE_MODEL_PATH \
actor.vllm.hybrid_train=False \
actor.vllm.enforce_eager=False \
actor.vllm.max_seq_len_to_capture=${MAX_SEQ_LEN_TO_CAPTURE} \
actor.vllm.max_num_seqs=${MAX_NUM_SEQS} \
actor.vllm.gpu_memory_utilization=0.85 \
actor.vllm.swap_space=64 \
critic.type._class=$MODEL_FAMILY \
critic.type.is_critic=True \
critic.init_critic_from_actor=True \
critic.path=$BASE_MODEL_PATH\
ref.type._class=$MODEL_FAMILY \
ref.path=$BASE_MODEL_PATH \
dataset.path=$DATA_PATH \
dataset.max_prompt_len=2048 \
dataset.train_bs_n_seqs=${TRAIN_BATCH_SIZE} \
ppo.gen.max_new_tokens=${MAX_NEW_TOKENS} \
ppo.gen.min_new_tokens=0 \
ppo.disable_value=True \
ppo.gen.top_p=1 ppo.gen.top_k=1000000 \
ppo.ppo_n_minibatches=${PPO_MBS} \
ppo.gen.temperature=0.6 \
ppo.kl_ctl=${KL_CTL} \
ppo.value_eps_clip=0.2 \
ppo.reward_output_scaling=5 \
ppo.reward_output_bias=0.0 \
ppo.adv_norm=True ppo.value_norm=True \
ppo.fuse_rew_ref=False \
mask_too_long=False \
ppo.discount=1.0 \
actor.optimizer.lr=1e-6 \
critic.optimizer.lr=5e-6 \
actor.optimizer.lr_scheduler_type=constant \
actor_gen.mb_spec.max_tokens_per_mb=${MAX_TOKEN_PER_MB} \
ref_inf.mb_spec.max_tokens_per_mb=${MAX_TOKEN_PER_MB} \
actor_inf.mb_spec.max_tokens_per_mb=${MAX_TOKEN_PER_MB} \
critic_inf.mb_spec.max_tokens_per_mb=${MAX_TOKEN_PER_MB} \
actor_train.mb_spec.max_tokens_per_mb=${MAX_TOKEN_PER_MB} \
critic_train.mb_spec.max_tokens_per_mb=${MAX_TOKEN_PER_MB} \
cache_clear_freq=1 \
n_nodes=${NODES} \
allocation_mode="'${ALLOCATION_MODE}'" n_gpus_per_node=8 \
recover_mode=auto \
recover_retries=10 \
torch_cache_mysophobia=True

View File

@ -1,115 +0,0 @@
#!/bin/sh
MODEL_FAMILY=qwen2
EXP_NAME="$1"
MODEL_NAME="$2"
DATASET_NAME="$3"
TRAIN_BATCH_SIZE="$4"
GROUP_SIZE="$5"
NODES="$6"
ALLOCATION_MODE="$7"
MAX_NEW_TOKENS=$8
MAX_NUM_SEQS=$9
PPO_MBS=${10}
KL_CTL=${11}
MAX_TOKEN_PER_MB=$(expr 2048 + ${MAX_NEW_TOKENS} + 1024)
MAX_SEQ_LEN_TO_CAPTURE=$(expr 2048 + ${MAX_NEW_TOKENS})
BASE_MODEL_PATH="/storage/models/${MODEL_NAME}"
# original data
DATA_PATH="/storage/datasets/${DATASET_NAME}"
# Option 1: The experiment runs locally with subprocesses.
# MODE=local
# Option 2: The experiment runs in a Ray cluster
# MODE=ray
# Option 3: The experiment runs in a SLURM + pyxis cluster
# Using the slurm mode requires a cluster spec file
# and setting CLUSTER_SPEC_PATH to the path of it.
MODE=ray
# `experiment_name` and `trial_name` can be arbitrary.
# Logs and saved checkpoints will be indexed by them.
#EXP_NAME=ppo-zero--${MODEL_NAME}--${DATASET_NAME}
#EXP_NAME=ppo-zero-distill-1.5B-default
TRIAL_NAME="${TRAIN_BATCH_SIZE}x${GROUP_SIZE}-n${NODES}"
# We use the "heuristic" allocation mode here to automatically determine the parallelism strategy
# for each model function call, i.e., actor generation, critic inference, actor train, etc.
# The number of GPUs is `n_nodes` * `n_gpus_per_node` (not set explictly here, defaults to 8).
# ReaL will make full use of these available GPUs to design allocations.
# This does not ensure the optimal throughput, but it is a good starting point.
# The `heuristic` allocation mode is not ensured to run with every model configurations.
# For example, if the vocabulary size is an odd number, the model parallelism may not work.
# In these cases, you can use the `ppo_manual.sh` to specify the parallelism strategy manually.
# The `ppo` subcommand specifies that this is a PPO experiment.
# The `save_freq_steps` is set to `null` to disable saving checkpoints.
# Enable it if you want to save checkpoints.
# The `ppo` option is used to control the generation and PPO algorithm hyperparameters.
# Note that the performance of PPO is sensitive to the the pre-trained model and hyperparameters.
# It's the user's responsibility to tune them appropriately.
unset CLUSTER_SPEC_PATH
CLUSTER_SPEC_PATH=/storage/ray/cluster_config_on_ray.json \
REAL_GPU_MEMORY_KILL_THRESHOLD=1 \
python3 -m realhf.apps.quickstart ppo-math \
mode=$MODE \
experiment_name=$EXP_NAME \
trial_name=$TRIAL_NAME \
wandb.mode=disabled \
exp_ctrl.total_train_epochs=10 \
exp_ctrl.save_freq_epochs=1 \
exp_ctrl.ckpt_freq_secs=600 \
group_size=${GROUP_SIZE} \
group_adv_norm=False \
rw_type=sparse \
check_xml_format=False \
actor.type._class=$MODEL_FAMILY \
actor.path=$BASE_MODEL_PATH \
actor.vllm.hybrid_train=True \
actor.vllm.enforce_eager=True \
actor.vllm.max_seq_len_to_capture=${MAX_SEQ_LEN_TO_CAPTURE} \
actor.vllm.max_num_seqs=${MAX_NUM_SEQS} \
actor.vllm.gpu_memory_utilization=0.8 \
actor.vllm.swap_space=64 \
critic.type._class=$MODEL_FAMILY \
critic.type.is_critic=True \
critic.init_critic_from_actor=True \
critic.path=$BASE_MODEL_PATH\
ref.type._class=$MODEL_FAMILY \
ref.path=$BASE_MODEL_PATH \
dataset.path=$DATA_PATH \
dataset.max_prompt_len=2048 \
dataset.train_bs_n_seqs=${TRAIN_BATCH_SIZE} \
ppo.gen.max_new_tokens=${MAX_NEW_TOKENS} \
ppo.gen.min_new_tokens=0 \
ppo.disable_value=True \
ppo.gen.top_p=1 ppo.gen.top_k=1000000 \
ppo.ppo_n_minibatches=${PPO_MBS} \
ppo.gen.temperature=0.6 \
ppo.kl_ctl=${KL_CTL} \
ppo.value_eps_clip=0.2 \
ppo.reward_output_scaling=5 \
ppo.reward_output_bias=0.0 \
ppo.adv_norm=True ppo.value_norm=True \
ppo.fuse_rew_ref=False \
mask_too_long=False \
ppo.discount=1.0 \
actor.optimizer.lr=1e-6 \
critic.optimizer.lr=5e-6 \
actor.optimizer.lr_scheduler_type=constant \
actor_gen.mb_spec.max_tokens_per_mb=${MAX_TOKEN_PER_MB} \
ref_inf.mb_spec.max_tokens_per_mb=${MAX_TOKEN_PER_MB} \
actor_inf.mb_spec.max_tokens_per_mb=${MAX_TOKEN_PER_MB} \
critic_inf.mb_spec.max_tokens_per_mb=${MAX_TOKEN_PER_MB} \
actor_train.mb_spec.max_tokens_per_mb=${MAX_TOKEN_PER_MB} \
critic_train.mb_spec.max_tokens_per_mb=${MAX_TOKEN_PER_MB} \
cache_clear_freq=1 \
n_nodes=${NODES} \
allocation_mode="'${ALLOCATION_MODE}'" n_gpus_per_node=8 \
recover_mode=auto \
recover_retries=10 \
torch_cache_mysophobia=True

View File

@ -1,115 +0,0 @@
#!/bin/sh
MODEL_FAMILY=qwen2
EXP_NAME="$1"
MODEL_NAME="$2"
DATASET_NAME="$3"
TRAIN_BATCH_SIZE="$4"
GROUP_SIZE="$5"
NODES="$6"
ALLOCATION_MODE="$7"
MAX_NEW_TOKENS=$8
MAX_NUM_SEQS=$9
PPO_MBS=${10}
KL_CTL=${11}
MAX_TOKEN_PER_MB=$(expr 2048 + ${MAX_NEW_TOKENS} + 1024)
MAX_SEQ_LEN_TO_CAPTURE=$(expr 2048 + ${MAX_NEW_TOKENS})
BASE_MODEL_PATH="/storage/models/${MODEL_NAME}"
# original data
DATA_PATH="/storage/datasets/${DATASET_NAME}"
# Option 1: The experiment runs locally with subprocesses.
# MODE=local
# Option 2: The experiment runs in a Ray cluster
# MODE=ray
# Option 3: The experiment runs in a SLURM + pyxis cluster
# Using the slurm mode requires a cluster spec file
# and setting CLUSTER_SPEC_PATH to the path of it.
MODE=ray
# `experiment_name` and `trial_name` can be arbitrary.
# Logs and saved checkpoints will be indexed by them.
#EXP_NAME=ppo-zero--${MODEL_NAME}--${DATASET_NAME}
#EXP_NAME=ppo-zero-distill-1.5B-default
TRIAL_NAME="${TRAIN_BATCH_SIZE}x${GROUP_SIZE}-n${NODES}"
# We use the "heuristic" allocation mode here to automatically determine the parallelism strategy
# for each model function call, i.e., actor generation, critic inference, actor train, etc.
# The number of GPUs is `n_nodes` * `n_gpus_per_node` (not set explictly here, defaults to 8).
# ReaL will make full use of these available GPUs to design allocations.
# This does not ensure the optimal throughput, but it is a good starting point.
# The `heuristic` allocation mode is not ensured to run with every model configurations.
# For example, if the vocabulary size is an odd number, the model parallelism may not work.
# In these cases, you can use the `ppo_manual.sh` to specify the parallelism strategy manually.
# The `ppo` subcommand specifies that this is a PPO experiment.
# The `save_freq_steps` is set to `null` to disable saving checkpoints.
# Enable it if you want to save checkpoints.
# The `ppo` option is used to control the generation and PPO algorithm hyperparameters.
# Note that the performance of PPO is sensitive to the the pre-trained model and hyperparameters.
# It's the user's responsibility to tune them appropriately.
unset CLUSTER_SPEC_PATH
CLUSTER_SPEC_PATH=/storage/ray/cluster_config_on_ray.json \
REAL_GPU_MEMORY_KILL_THRESHOLD=1 \
python3 -m realhf.apps.quickstart ppo-math \
mode=$MODE \
experiment_name=$EXP_NAME \
trial_name=$TRIAL_NAME \
wandb.mode=disabled \
exp_ctrl.total_train_epochs=10 \
exp_ctrl.save_freq_epochs=1 \
exp_ctrl.ckpt_freq_secs=600 \
group_size=${GROUP_SIZE} \
group_adv_norm=False \
rw_type=sparse \
check_xml_format=True \
actor.type._class=$MODEL_FAMILY \
actor.path=$BASE_MODEL_PATH \
actor.vllm.hybrid_train=False \
actor.vllm.enforce_eager=False \
actor.vllm.max_seq_len_to_capture=${MAX_SEQ_LEN_TO_CAPTURE} \
actor.vllm.max_num_seqs=${MAX_NUM_SEQS} \
actor.vllm.gpu_memory_utilization=0.9 \
actor.vllm.swap_space=64 \
critic.type._class=$MODEL_FAMILY \
critic.type.is_critic=True \
critic.init_critic_from_actor=True \
critic.path=$BASE_MODEL_PATH\
ref.type._class=$MODEL_FAMILY \
ref.path=$BASE_MODEL_PATH \
dataset.path=$DATA_PATH \
dataset.max_prompt_len=2048 \
dataset.train_bs_n_seqs=${TRAIN_BATCH_SIZE} \
ppo.gen.max_new_tokens=${MAX_NEW_TOKENS} \
ppo.gen.min_new_tokens=0 \
ppo.disable_value=True \
ppo.gen.top_p=1 ppo.gen.top_k=1000000 \
ppo.ppo_n_minibatches=${PPO_MBS} \
ppo.gen.temperature=1 \
ppo.kl_ctl=${KL_CTL} \
ppo.value_eps_clip=0.2 \
ppo.reward_output_scaling=0.5 \
ppo.reward_output_bias=-1.0 \
ppo.adv_norm=True ppo.value_norm=True \
ppo.fuse_rew_ref=False \
mask_too_long=False \
ppo.discount=1.0 \
actor.optimizer.lr=1e-6 \
critic.optimizer.lr=5e-6 \
actor.optimizer.lr_scheduler_type=constant \
actor_gen.mb_spec.max_tokens_per_mb=${MAX_TOKEN_PER_MB} \
ref_inf.mb_spec.max_tokens_per_mb=${MAX_TOKEN_PER_MB} \
actor_inf.mb_spec.max_tokens_per_mb=${MAX_TOKEN_PER_MB} \
critic_inf.mb_spec.max_tokens_per_mb=${MAX_TOKEN_PER_MB} \
actor_train.mb_spec.max_tokens_per_mb=${MAX_TOKEN_PER_MB} \
critic_train.mb_spec.max_tokens_per_mb=${MAX_TOKEN_PER_MB} \
cache_clear_freq=1 \
n_nodes=${NODES} \
allocation_mode="'${ALLOCATION_MODE}'" n_gpus_per_node=8 \
recover_mode=auto \
recover_retries=10 \
torch_cache_mysophobia=True

View File

@ -131,8 +131,9 @@ def prepare_hydra_config(name: str, prologue_path: str):
experiment_name = get_experiment_name(config.get("experiment_name"))
trial_name = get_trial_name(config.get("trial_name"))
config_dir = f"{cluster_spec.fileroot}/configs/{getpass.getuser()}/{experiment_name}/{trial_name}"
os.makedirs(config_dir, exist_ok=True)
config.pop(PROLOGUE_EXTERNAL_CONFIG_NAME)
config.pop(PROLOGUE_EXTERNAL_CONFIG_NAME, {})
with open(f"{config_dir}/{name}.yaml", "w") as f:
f.write(OmegaConf.to_yaml(config))
@ -144,7 +145,7 @@ def launch_hydra_task(
):
# Disable hydra logging.
if not any("hydra/job_logging=disabled" in x for x in sys.argv):
sys.argv += ["hydra/job_logging=disabled"]
sys.argv.insert(2, "hydra/job_logging=disabled")
if (
"--multirun" in sys.argv
@ -154,10 +155,11 @@ def launch_hydra_task(
raise NotImplementedError("Hydra multi-run is not supported.")
# non-multirun mode, add hydra run dir
sys.argv += [
sys.argv.insert(
2,
f"hydra.run.dir={cluster_spec.fileroot}/logs/{getpass.getuser()}/"
f"{experiment_name}/{trial_name}/hydra-outputs/"
]
f"{experiment_name}/{trial_name}/hydra-outputs/",
)
sys.argv.pop(1)

View File

@ -59,10 +59,9 @@ class GlobalMemoryBuffer:
return res
# 30 minutes. Transferring super-large batches via NCCL bcast
# for the first time may consumer over 600 secs, which is the
# pytorch's default. Increase this value to 30 minutes.
NCCL_DEFAULT_TIMEOUT = datetime.timedelta(seconds=1800)
# For large models, generation may consume more than 3600s.
# We set a large value to avoid NCCL timeout issues during generation.
NCCL_DEFAULT_TIMEOUT = datetime.timedelta(seconds=7200)
# We may want to use CPU for testing even when CUDA is available.
TORCH_FORCE_CPU = False

View File

@ -30,7 +30,7 @@ def global_init():
return
# add externel envs.
if external_configs.envs is not None:
if external_configs.get("envs"):
for key, value in external_configs.envs.items():
if key not in os.environ:
os.environ[key] = value
@ -38,8 +38,8 @@ def global_init():
# resolve config path for cluster spec.
cluster_spec_path = os.environ.get("CLUSTER_SPEC_PATH", "")
if cluster_spec_path == "":
if external_configs.cluster_config is not None:
fileroot = external_configs.cluster_config.fileroot
if external_configs.get("cluster_config"):
fileroot = external_configs.cluster_config.get("fileroot")
if fileroot is not None and fileroot != "":
experiment_name = get_experiment_name(config.get("experiment_name"))
trial_name = get_trial_name(config.get("trial_name"))