From e3005d57f63fc654d5d69a03df6cd15f28b046b4 Mon Sep 17 00:00:00 2001 From: GurrenLagann97 <162653182+GurrenLagann97@users.noreply.github.com> Date: Tue, 24 Jun 2025 09:44:15 +0800 Subject: [PATCH] add a preprocessing script for code training data and update readme (#126) * add a preprocessing script for code training data and update readme * add a preprocessing script for code training data and update readme * add a preprocessing script for code training data and update readme * fix eval doc --------- Co-authored-by: hcy --- README.md | 9 ++- docs/tutorial/eval.md | 2 + .../preprocess_training_data.py | 78 +++++++++++++++++++ 3 files changed, 88 insertions(+), 1 deletion(-) create mode 100644 examples/data_preprocess/preprocess_training_data.py diff --git a/README.md b/README.md index e8f99c1..5865843 100644 --- a/README.md +++ b/README.md @@ -102,9 +102,16 @@ AReaL-boba² allows you to independently customize the [dataset](https://inclusi In particular, we show a simple example to develop a multi-turn math agent for RL training. Please see the learning curve below and reference the [step-by-step guide](https://inclusionai.github.io/AReaL/customization/agent.html) if you want to implement your own agentic RL project. 
import json
import sys
from argparse import ArgumentParser
from typing import Dict, List

# Example prompt template for the boba-2 coding dataset. Remember to keep the
# model's special tokens when adapting this to another dataset.
# NOTE(review): the triple-quoted literal keeps a leading and a trailing "\n"
# around the chat markup — confirm the model was trained with those newlines.
prompt_template = """
<|im_start|>user\n{question}\n/think<|im_end|>\n<|im_start|>assistant\n
"""


def load_jsonl(file_path: str) -> List[Dict]:
    """Load a JSON-lines file, surfacing a clear error message on failure.

    Args:
        file_path: Path to a UTF-8 encoded JSONL file.

    Returns:
        One dict per non-empty line, in file order.

    Raises:
        FileNotFoundError: if the file does not exist.
        json.JSONDecodeError: if any non-empty line is not valid JSON.
    """
    try:
        with open(file_path, "r", encoding="utf-8") as f:
            # Skip blank lines (e.g. a trailing newline) instead of crashing.
            return [json.loads(line) for line in f if line.strip()]
    except FileNotFoundError:
        print(f"ERROR: JSONL file not found: {file_path}")
        raise
    except json.JSONDecodeError as e:
        print(f"ERROR: JSON parsing failed in {file_path}: {str(e)}")
        raise


def process_code_data(file_path: str) -> List[Dict]:
    """Convert a raw code-RL JSONL dataset into the AReaL training format.

    Args:
        file_path: Path to the raw dataset; a falsy path yields an empty list.

    Returns:
        Training records with keys: ``task``, ``query_id``, ``prompt``,
        ``solutions``, ``input_output`` (a JSON string) and ``language``.

    Raises:
        AssertionError: if a record's serialized test cases exceed 500 KB.
    """
    if not file_path:
        return []

    processed = []
    for item in load_jsonl(file_path):
        # Test cases arrive as a JSON string; re-serialize them into the
        # schema the trainer expects ("remote": False = inline test cases).
        input_output = json.loads(item["input_output"])
        serialized_cases = json.dumps(
            {
                "inputs": input_output.get("inputs", []),
                "outputs": input_output.get("outputs", []),
                "fn_name": item.get("metadata", {}).get("fn_name", ""),
                "remote": False,
            }
        )

        # NOTE(review): in the original, this guard referenced processed[-1]
        # at the tail of the function, so an empty input file raised
        # IndexError and only the most recent record was measured. It now
        # runs once per record, before the record is accepted.
        case_size = sys.getsizeof(serialized_cases)
        assert (
            case_size < 500 * 1024
        ), f"'input_output' exceeds 500KB ({case_size} bytes). Use remote testcase instead."

        processed.append(
            {
                "task": "code",
                "query_id": item["query_id"],
                "prompt": prompt_template.format(question=item["question"]),
                "solutions": item.get("solutions", []),  # nothing for code dataset
                "input_output": serialized_cases,
                "language": item.get("language", "PYTHON"),  # default to python
            }
        )

    return processed


def parse_args():
    """CLI: --data_path (raw JSONL in) and --output_path (training JSONL out)."""
    parser = ArgumentParser()
    parser.add_argument("--data_path", type=str, required=True)
    parser.add_argument("--output_path", type=str, required=True)
    return parser.parse_args()


def main():
    """Read the raw dataset, transform it, and write one JSON object per line."""
    args = parse_args()
    processed_data = process_code_data(args.data_path)
    # Explicit utf-8: ensure_ascii=False emits raw Unicode, and load_jsonl
    # reads the result back as utf-8, so the encoding must not be locale-dependent.
    with open(args.output_path, "w", encoding="utf-8") as f:
        for item in processed_data:
            f.write(json.dumps(item, ensure_ascii=False) + "\n")


if __name__ == "__main__":
    main()