From e3005d57f63fc654d5d69a03df6cd15f28b046b4 Mon Sep 17 00:00:00 2001 From: GurrenLagann97 <162653182+GurrenLagann97@users.noreply.github.com> Date: Tue, 24 Jun 2025 09:44:15 +0800 Subject: [PATCH] add a preprocessing script for code training data and update readme (#126) * add a preprocessing script for code training data and update readme * add a preprocessing script for code training data and update readme * add a preprocessing script for code training data and update readme * fix eval doc --------- Co-authored-by: hcy --- README.md | 9 ++- docs/tutorial/eval.md | 2 + .../preprocess_training_data.py | 78 +++++++++++++++++++ 3 files changed, 88 insertions(+), 1 deletion(-) create mode 100644 examples/data_preprocess/preprocess_training_data.py diff --git a/README.md b/README.md index e8f99c1..5865843 100644 --- a/README.md +++ b/README.md @@ -102,9 +102,16 @@ AReaL-boba² allows you to independently customize the [dataset](https://inclusi In particular, we show a simple example to develop a multi-turn math agent for RL training. Please see the learning curve below and reference the [step-by-step guide](https://inclusionai.github.io/AReaL/customization/agent.html) if you want to implement your own agentic RL project. 
import json
import sys
from argparse import ArgumentParser
from typing import Dict, List

# Example prompt template for the boba-2 coding dataset. Remember to keep the
# model's special tokens when adapting this to another dataset.
# NOTE(review): the triple-quoted literal keeps a leading and a trailing "\n"
# around the chat markup — confirm the model was trained with those newlines.
prompt_template = """
<|im_start|>user\n{question}\n/think<|im_end|>\n<|im_start|>assistant\n
"""


def load_jsonl(file_path: str) -> List[Dict]:
    """Load a JSON-lines file, surfacing a clear error message on failure.

    Args:
        file_path: Path to a UTF-8 encoded JSONL file.

    Returns:
        One dict per non-empty line, in file order.

    Raises:
        FileNotFoundError: if the file does not exist.
        json.JSONDecodeError: if any non-empty line is not valid JSON.
    """
    try:
        with open(file_path, "r", encoding="utf-8") as f:
            # Skip blank lines (e.g. a trailing newline) instead of crashing.
            return [json.loads(line) for line in f if line.strip()]
    except FileNotFoundError:
        print(f"ERROR: JSONL file not found: {file_path}")
        raise
    except json.JSONDecodeError as e:
        print(f"ERROR: JSON parsing failed in {file_path}: {str(e)}")
        raise


def process_code_data(file_path: str) -> List[Dict]:
    """Convert a raw code-RL JSONL dataset into the AReaL training format.

    Args:
        file_path: Path to the raw dataset; a falsy path yields an empty list.

    Returns:
        Training records with keys: ``task``, ``query_id``, ``prompt``,
        ``solutions``, ``input_output`` (a JSON string) and ``language``.

    Raises:
        AssertionError: if a record's serialized test cases exceed 500 KB.
    """
    if not file_path:
        return []

    processed = []
    for item in load_jsonl(file_path):
        # Test cases arrive as a JSON string; re-serialize them into the
        # schema the trainer expects ("remote": False = inline test cases).
        input_output = json.loads(item["input_output"])
        serialized_cases = json.dumps(
            {
                "inputs": input_output.get("inputs", []),
                "outputs": input_output.get("outputs", []),
                "fn_name": item.get("metadata", {}).get("fn_name", ""),
                "remote": False,
            }
        )

        # NOTE(review): in the original, this guard referenced processed[-1]
        # at the tail of the function, so an empty input file raised
        # IndexError and only the most recent record was measured. It now
        # runs once per record, before the record is accepted.
        case_size = sys.getsizeof(serialized_cases)
        assert (
            case_size < 500 * 1024
        ), f"'input_output' exceeds 500KB ({case_size} bytes). Use remote testcase instead."

        processed.append(
            {
                "task": "code",
                "query_id": item["query_id"],
                "prompt": prompt_template.format(question=item["question"]),
                "solutions": item.get("solutions", []),  # nothing for code dataset
                "input_output": serialized_cases,
                "language": item.get("language", "PYTHON"),  # default to python
            }
        )

    return processed


def parse_args():
    """CLI: --data_path (raw JSONL in) and --output_path (training JSONL out)."""
    parser = ArgumentParser()
    parser.add_argument("--data_path", type=str, required=True)
    parser.add_argument("--output_path", type=str, required=True)
    return parser.parse_args()


def main():
    """Read the raw dataset, transform it, and write one JSON object per line."""
    args = parse_args()
    processed_data = process_code_data(args.data_path)
    # Explicit utf-8: ensure_ascii=False emits raw Unicode, and load_jsonl
    # reads the result back as utf-8, so the encoding must not be locale-dependent.
    with open(args.output_path, "w", encoding="utf-8") as f:
        for item in processed_data:
            f.write(json.dumps(item, ensure_ascii=False) + "\n")


if __name__ == "__main__":
    main()