add a preprocessing script for code training data and update readme (#126)

* add a preprocessing script for code training data and update readme

* fix eval doc

---------

Co-authored-by: hcy <hechuyi.hcy@antgroup.com>
GurrenLagann97 2025-06-24 09:44:15 +08:00 committed by GitHub
parent 3642cce2fc
commit e3005d57f6
3 changed files with 88 additions and 1 deletion


@@ -102,9 +102,16 @@ AReaL-boba² allows you to independently customize the [dataset](https://inclusi
In particular, we show a simple example of developing a multi-turn math agent for RL training. Please see the learning curve below and refer to the [step-by-step guide](https://inclusionai.github.io/AReaL/customization/agent.html) if you want to implement your own agentic RL project.
## Getting Started
Obtain the training data:
- [Math](https://huggingface.co/datasets/inclusionAI/AReaL-boba-Data)
- [Code](https://huggingface.co/datasets/inclusionAI/AReaL-boba-2-RL-Code)
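For example, both datasets can be fetched with the Hugging Face CLI (the local directories below are placeholders; any download method works):
```bash
# Hypothetical download commands; require the huggingface_hub CLI to be installed.
huggingface-cli download inclusionAI/AReaL-boba-Data --repo-type dataset \
    --local-dir /data/AReaL-boba-Data
huggingface-cli download inclusionAI/AReaL-boba-2-RL-Code --repo-type dataset \
    --local-dir /data/AReaL-boba-2-RL-Code
```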
For code training data, a simple preprocessing script is provided in `examples/data_preprocess/preprocess_training_data.py`:
```bash
python3 preprocess_training_data.py --data_path $original_data_path --output_path $training_data_path
```
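For instance, continuing with the placeholder directories from the download example above (the raw file name depends on what the dataset actually ships; adjust accordingly):
```bash
# Placeholder paths; run from the repository root and point --data_path at the
# actual JSONL file inside the downloaded dataset.
python3 examples/data_preprocess/preprocess_training_data.py \
    --data_path /data/AReaL-boba-2-RL-Code/train.jsonl \
    --output_path /data/code_rl_training_data.jsonl
```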
Train Qwen3 1.7B locally (Remember to modify `dataset.path` in the script below):
```bash
bash examples/run_async_ppo.sh
```
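The exact option layout inside `examples/run_async_ppo.sh` may differ between releases; as a sketch, the change is to point `dataset.path` at the processed JSONL produced above, for example:
```bash
# Sketch only: locate the dataset.path setting in the launch script and point
# it at your processed file (placeholder path below), then launch training.
grep -n "dataset.path" examples/run_async_ppo.sh
# edit that line, e.g. dataset.path=/data/code_rl_training_data.jsonl
bash examples/run_async_ppo.sh
```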


@@ -51,6 +51,8 @@ nohup python eval_and_aggregate.py \
--max_gen_tokens 32768 \
--data_names codeforces,lcb_v5 \
--prompt_type qwen3-think-pure \
--temperature 1.0 \
--top_p 0.95 \
--num_sample_nodes 8 \
--samples_per_node 1 \
--n_sampling $((num_sample_nodes * samples_per_node)) \
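In the command above, `--n_sampling` is computed with shell arithmetic from the `num_sample_nodes` and `samples_per_node` variables; assuming those variables carry the same values as the flags shown (8 and 1), the flag resolves to 8 samples per problem. A minimal illustration of the expansion:
```bash
# Minimal sketch of the arithmetic expansion used above (values assumed to
# match the --num_sample_nodes / --samples_per_node flags shown).
num_sample_nodes=8
samples_per_node=1
echo "--n_sampling $((num_sample_nodes * samples_per_node))"   # prints: --n_sampling 8
```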


@@ -0,0 +1,78 @@
import json
import sys
from argparse import ArgumentParser
from typing import Dict, List

# An example prompt template; remember to add the special tokens. This example
# is for the boba-2 coding dataset.
prompt_template = """
<|im_start|>user\n{question}\n/think<|im_end|>\n<|im_start|>assistant\n<think>
"""


def load_jsonl(file_path: str) -> List[Dict]:
    """Load JSONL file with validation"""
    try:
        with open(file_path, "r", encoding="utf-8") as f:
            return [json.loads(line) for line in f]
    except FileNotFoundError:
        print(f"ERROR: JSONL file not found: {file_path}")
        raise
    except json.JSONDecodeError as e:
        print(f"ERROR: JSON parsing failed in {file_path}: {str(e)}")
        raise


def process_code_data(file_path: str) -> List[Dict]:
    """Process code dataset from JSONL file"""
    if not file_path:
        return []
    raw_data = load_jsonl(file_path)
    processed = []
    for item in raw_data:
        # Field extraction and transformation
        input_output = json.loads(item["input_output"])
        processed.append(
            {
                "task": "code",
                "query_id": item["query_id"],
                "prompt": prompt_template.format(question=item["question"]),
                "solutions": item.get("solutions", []),  # empty for the code dataset
                "input_output": json.dumps(
                    {
                        "inputs": input_output.get("inputs", []),
                        "outputs": input_output.get("outputs", []),
                        "fn_name": item.get("metadata", {}).get("fn_name", ""),
                        "remote": False,
                    }
                ),
                "language": item.get("language", "PYTHON"),  # default to Python if unspecified
            }
        )
        case_size = sys.getsizeof(processed[-1]["input_output"])
        assert (
            case_size < 500 * 1024
        ), f"'input_output' exceeds 500KB ({case_size} bytes). Use a remote test case instead."
    return processed


def parse_args():
    parser = ArgumentParser()
    parser.add_argument("--data_path", type=str, required=True)
    parser.add_argument("--output_path", type=str, required=True)
    return parser.parse_args()


def main():
    args = parse_args()
    processed_data = process_code_data(args.data_path)
    with open(args.output_path, "w") as f:
        for item in processed_data:
            f.write(json.dumps(item, ensure_ascii=False) + "\n")


if __name__ == "__main__":
    main()
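For reference, a quick way to sanity-check the script is to run it on a single hand-written record; the field names below are taken from the code above, while the concrete values are made up:
```bash
# Hypothetical smoke test: one raw record with the fields the script reads
# (query_id, question, solutions, input_output, metadata.fn_name, language).
cat > /tmp/raw_code_sample.jsonl <<'EOF'
{"query_id": "example-0", "question": "Given an integer n, print n squared.", "solutions": [], "input_output": "{\"inputs\": [\"2\\n\"], \"outputs\": [\"4\\n\"]}", "metadata": {"fn_name": ""}, "language": "PYTHON"}
EOF
# Run from the repository root; the output is one processed JSONL line.
python3 examples/data_preprocess/preprocess_training_data.py \
    --data_path /tmp/raw_code_sample.jsonl --output_path /tmp/processed_code_sample.jsonl
cat /tmp/processed_code_sample.jsonl
```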