PullRequest: 17 Support functioncall for math and code verify

Merge branch functioncall-code of git@code.alipay.com:inclusionAI/AReaL.git into main
https://code.alipay.com/inclusionAI/AReaL/pull_requests/17

Signed-off-by: 晓雷 <meizhiyu.mzy@antgroup.com>


* test code evaluation with faas
* support functioncall for code
* fix code crash bug
* format
* .
This commit is contained in:
君末 2025-03-07 11:26:57 +08:00
parent c1bb4770ff
commit b3bedd7b9d
26 changed files with 3968 additions and 17 deletions
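
As a quick orientation for reviewers, below is a minimal sketch of how the new code-verify entry point added by this PR can be exercised end to end. The query id and generated solution are placeholders, and it assumes REAL_CODE_METADATA_PATH points at the APPS-style metadata jsonl and FUNCTION_CALLING_SERVICE_DOMAIN at a reachable FaaS endpoint (both are read at import time):

import os

# Must be set before the import below, which loads the problem set at module import time.
os.environ.setdefault("REAL_CODE_METADATA_PATH", "/storage/datasets/codeparrot-apps-test.jsonl")

from functioncall.code.verify import code_verify

generateds = ["s = input()\nprint(s)\n"]  # placeholder model-generated solution
query_ids = ["805"]                       # placeholder id; must exist in the metadata file

rewards = code_verify(generateds, query_ids)  # one 0/1 reward per query id
print(rewards)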

View File

@@ -0,0 +1,52 @@
#!/bin/bash
SCRIPT_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
EXP_NAME=ppo-zero-distill-1.5B-n16-jun1
MODEL_NAME="DeepSeek-R1-Distill-Qwen-1.5B"
DATASET_NAME="codeparrot-apps-test.jsonl"
NODES=16
ALLOCATION_MODE="vllm.d64p1m1+d32p2m1"
LOG_DIR="/storage/ray/train_batch_logs/${EXP_NAME}/$(date +'%Y%m%d-%H%M%S')"
mkdir -p ${LOG_DIR}
echo "Log Dir: ${LOG_DIR}"
MAX_WORKERS=$(expr 16 / ${NODES})
FIFO_NAME=$(mktemp -u)
mkfifo "$FIFO_NAME"
exec 3<>"$FIFO_NAME"
rm -f "$FIFO_NAME"
for ((i=0; i<MAX_WORKERS; i++)); do
echo >&3
done
ALL_PARAMS=(
"${EXP_NAME} ${MODEL_NAME} ${DATASET_NAME} 1024 8 ${NODES} ${ALLOCATION_MODE} 16384 128 1 0.001"
#"${EXP_NAME} ${MODEL_NAME} ${DATASET_NAME} 1024 8 ${NODES} ${ALLOCATION_MODE} 16384 128 1 0.001"
#"${EXP_NAME} ${MODEL_NAME} ${DATASET_NAME} 1024 8 ${NODES} ${ALLOCATION_MODE} 16384 128 1 0.001"
)
echo "Task Count: ${#ALL_PARAMS[@]}"
for ((i=0; i<${#ALL_PARAMS[@]}; i++)); do
read -u3
{
echo "$(date +"%Y-%m-%d %H:%M.%S") Task $i started: ${ALL_PARAMS[$i]}"
bash -c "bash ${SCRIPT_DIR}/train_code_small_on_ray.sh ${ALL_PARAMS[$i]} &> ${LOG_DIR}/${i}.log"
echo "$(date +"%Y-%m-%d %H:%M.%S") Task $i completed with exit code: $?, ${ALL_PARAMS[$i]}"
sleep 120
echo >&3
} &
sleep 120
done
wait
exec 3>&-
echo "All tasks completed"

View File

@@ -0,0 +1,122 @@
#!/bin/sh
MODEL_FAMILY=qwen2
EXP_NAME="$1"
MODEL_NAME="$2"
DATASET_NAME="$3"
TRAIN_BATCH_SIZE="$4"
GROUP_SIZE="$5"
NODES="$6"
ALLOCATION_MODE="$7"
MAX_NEW_TOKENS=$8
MAX_NUM_SEQS=$9
PPO_MBS=${10}
KL_CTL=${11}
MAX_TOKEN_PER_MB=$(expr 2048 + ${MAX_NEW_TOKENS} + 1024)
MAX_SEQ_LEN_TO_CAPTURE=$(expr 2048 + ${MAX_NEW_TOKENS})
BASE_MODEL_PATH="/storage/models/${MODEL_NAME}"
# original data
DATA_PATH="/storage/datasets/${DATASET_NAME}"
REAL_CODE_METADATA_PATH="/storage/datasets/codeparrot-apps-test.jsonl"
# Option 1: The experiment runs locally with subprocesses.
# MODE=local
# Option 2: The experiment runs in a Ray cluster
# MODE=ray
# Option 3: The experiment runs in a SLURM + pyxis cluster
# Using the slurm mode requires a cluster spec file
# and setting CLUSTER_SPEC_PATH to the path of it.
MODE=ray
# `experiment_name` and `trial_name` can be arbitrary.
# Logs and saved checkpoints will be indexed by them.
#EXP_NAME=ppo-zero--${MODEL_NAME}--${DATASET_NAME}
#EXP_NAME=ppo-zero-distill-1.5B-default
TRIAL_NAME="${TRAIN_BATCH_SIZE}x${GROUP_SIZE}-n${NODES}"
# We use the "heuristic" allocation mode here to automatically determine the parallelism strategy
# for each model function call, i.e., actor generation, critic inference, actor train, etc.
# The number of GPUs is `n_nodes` * `n_gpus_per_node` (not set explicitly here, defaults to 8).
# ReaL will make full use of these available GPUs to design allocations.
# This does not ensure the optimal throughput, but it is a good starting point.
# The `heuristic` allocation mode is not guaranteed to work with every model configuration.
# For example, if the vocabulary size is an odd number, the model parallelism may not work.
# In these cases, you can use the `ppo_manual.sh` to specify the parallelism strategy manually.
# The `ppo` subcommand specifies that this is a PPO experiment.
# The `save_freq_steps` is set to `null` to disable saving checkpoints.
# Enable it if you want to save checkpoints.
# The `ppo` option is used to control the generation and PPO algorithm hyperparameters.
# Note that the performance of PPO is sensitive to the pre-trained model and hyperparameters.
# It's the user's responsibility to tune them appropriately.
unset CLUSTER_SPEC_PATH
CLUSTER_SPEC_PATH=/storage/ray/cluster_config_on_ray.json \
REAL_CODE_METADATA_PATH=${REAL_CODE_METADATA_PATH} \
REAL_GPU_MEMORY_KILL_THRESHOLD=1 \
python3 -m realhf.apps.quickstart ppo-code \
mode=$MODE \
experiment_name=$EXP_NAME \
trial_name=$TRIAL_NAME \
wandb.mode=disabled \
exp_ctrl.total_train_epochs=1 \
exp_ctrl.save_freq_epochs=1 \
exp_ctrl.ckpt_freq_secs=600 \
group_size=${GROUP_SIZE} \
group_adv_norm=False \
use_dense_reward=False \
reward_delta=True \
rw_type=sparse \
check_xml_format=False \
actor.type._class=$MODEL_FAMILY \
actor.path=$BASE_MODEL_PATH \
actor.vllm.hybrid_train=False \
actor.vllm.enforce_eager=False \
actor.vllm.max_seq_len_to_capture=${MAX_SEQ_LEN_TO_CAPTURE} \
actor.vllm.max_num_seqs=${MAX_NUM_SEQS} \
actor.vllm.gpu_memory_utilization=1 \
actor.vllm.swap_space=64 \
critic.type._class=$MODEL_FAMILY \
critic.type.is_critic=True \
critic.init_critic_from_actor=True \
critic.path=$BASE_MODEL_PATH \
ref.type._class=$MODEL_FAMILY \
ref.path=$BASE_MODEL_PATH \
rew.type._class=$MODEL_FAMILY \
rew.type.is_critic=True \
rew.init_critic_from_actor=True \
rew.path=$BASE_MODEL_PATH \
dataset.path=$DATA_PATH \
dataset.max_prompt_len=2048 \
dataset.train_bs_n_seqs=${TRAIN_BATCH_SIZE} \
ppo.gen.max_new_tokens=${MAX_NEW_TOKENS} \
ppo.gen.min_new_tokens=0 \
ppo.disable_value=True \
ppo.gen.top_p=1 ppo.gen.top_k=1000000 \
ppo.ppo_n_minibatches=${PPO_MBS} \
ppo.gen.temperature=0.6 \
ppo.kl_ctl=${KL_CTL} \
ppo.value_eps_clip=0.2 \
ppo.reward_output_scaling=5 \
ppo.reward_output_bias=0.0 \
ppo.adv_norm=True ppo.value_norm=True \
mask_too_long=False \
ppo.discount=1.0 \
actor.optimizer.lr=1e-6 \
critic.optimizer.lr=5e-6 \
actor.optimizer.lr_scheduler_type=constant \
actor_gen.mb_spec.max_tokens_per_mb=${MAX_TOKEN_PER_MB} \
ref_inf.mb_spec.max_tokens_per_mb=${MAX_TOKEN_PER_MB} \
rew_inf.mb_spec.max_tokens_per_mb=${MAX_TOKEN_PER_MB} \
critic_inf.mb_spec.max_tokens_per_mb=${MAX_TOKEN_PER_MB} \
actor_train.mb_spec.max_tokens_per_mb=${MAX_TOKEN_PER_MB} \
critic_train.mb_spec.max_tokens_per_mb=${MAX_TOKEN_PER_MB} \
cache_clear_freq=1 \
n_nodes=${NODES} \
allocation_mode="'${ALLOCATION_MODE}'" n_gpus_per_node=8 \
recover_mode=auto \
recover_retries=10 \
torch_cache_mysophobia=True

1
functioncall/__init__.py Normal file
View File

@@ -0,0 +1 @@
# Copyright 2025 Ant Group Inc.

View File

146
functioncall/base/call.py Normal file
View File

@@ -0,0 +1,146 @@
import asyncio
import os
import random
import time
import traceback
from statistics import median
from typing import Any, Dict
import aiohttp
from functioncall.base import logging
logger = logging.getLogger("Functioncall")
FUNCTION_CALLING_SERVICE_DOMAIN = os.getenv(
"FUNCTION_CALLING_SERVICE_DOMAIN",
# "http://faas-api-service.hcsfaas-function-calling.svc.sa128.alipay.com:8080",
"http://110.75.255.101:8080", # xvip
)
def calculate_percentile(elapsed_times, percentile):
sorted_times = sorted(elapsed_times)
index = int(len(sorted_times) * (percentile / 100))
return sorted_times[min(index, len(sorted_times) - 1)]
async def async_invoke_function(
session: aiohttp.ClientSession,
function_name: str,
timeout: aiohttp.ClientTimeout,
payload: Dict[str, Any] = None,
max_retries: int = 3,
initial_retry_interval: float = 0.5,
max_retry_interval: float = 10.0,
):
if payload is None:
payload = {}
url = f"{FUNCTION_CALLING_SERVICE_DOMAIN}/hapis/faas.hcs.io/v1/functions/{function_name}/invoke"
params = {"invocationType": "RequestResponse"}
retries = 0
while retries <= max_retries:
try:
async with session.post(
url,
params=params,
json=payload,
timeout=timeout,
) as response:
if response.status != 200:
text = await response.text()
raise Exception(
f"HTTP Error {response.status}: {text} : {response.headers}"
)
try:
result = await response.json()
return result, response.headers
except aiohttp.ContentTypeError as e:
raise Exception("Invalid JSON response") from e
except asyncio.TimeoutError as e:
logger.warning(
f"Request timeout after {timeout}s (attempt {retries + 1}/{max_retries}). "
f"URL: {url}, Headers: {session.headers}"
)
except Exception as e:
logger.error(f"Async invocation failed on attempt {retries + 1}:{str(e)}")
retries += 1
if retries > max_retries:
return None, None
# Exponential backoff with random jitter
sleep_time = min(
initial_retry_interval * (2**retries) + random.uniform(0, 0.1),
max_retry_interval,
)
await asyncio.sleep(sleep_time)
async def batch_function_call_async(
payload_list, function_name, timeout, concurrency=1000
):
connector = aiohttp.TCPConnector(limit=0)
async with aiohttp.ClientSession(connector=connector) as session:
semaphore = asyncio.Semaphore(concurrency)
async def limited_task(payload):
if not payload:
# Return a placeholder entry so responses stay index-aligned with payload_list.
return (None, None), 0.0
async with semaphore:
st = time.monotonic()
result = await async_invoke_function(
session, function_name, timeout, payload
)
return result, time.monotonic() - st
tasks = [limited_task(payload) for payload in payload_list]
results = await asyncio.gather(*tasks, return_exceptions=True)
data_list = []
elapsed_times = []
max_elapsed = -1
max_elapsed_header = None
for (data, header), elapsed in results:
if elapsed > max_elapsed:
max_elapsed = elapsed
max_elapsed_header = header
data_list.append(data)
elapsed_times.append(elapsed)
# logger.debug(f"functioncall took {elapsed:.4f} seconds, header: {header}.)")
p50 = median(elapsed_times)
p90 = calculate_percentile(elapsed_times, 90)
p99 = calculate_percentile(elapsed_times, 99)
logger.info(
f"Longest functioncall took {max_elapsed:.4f} seconds, header: {max_elapsed_header}, p50: {p50}, p90: {p90}, p99: {p99}"
)
return data_list
def get_function_name(runtime_type):
if runtime_type == "python_code":
return "realhf_code_verify"
elif runtime_type == "python_math":
return "realhf_math_verify"
return "empty_code"
def batch_function_call(payload_list, runtime_type, timeout=30):
start_time = time.time()
function_name = get_function_name(runtime_type)
result = asyncio.run(
batch_function_call_async(payload_list, function_name, timeout)
)
execution_time = time.time() - start_time
logger.debug(
f"Batch function call done, runtime type: {runtime_type}, batch size: {len(payload_list)}, cost: {execution_time * 1000:.0f} ms"
)
return result
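
For the math runtime, a rough usage sketch of batch_function_call is given below; the payload shape mirrors the handle(event, context) entry point of the math function later in this PR, and the answer/solution strings are placeholders. It assumes the FaaS endpoint configured via FUNCTION_CALLING_SERVICE_DOMAIN is reachable:

from functioncall.base.call import batch_function_call

# One payload per batch of answer/solution pairs; for each pair the remote handler
# returns (reward, (extracted_answer, extracted_solution)).
payloads = [
    {
        "answers": ["The final answer is \\boxed{3}"],  # placeholder model output
        "solutions": ["3"],                             # placeholder ground truth
    }
]
results = batch_function_call(payloads, "python_math", timeout=30)
print(results)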

View File

@@ -0,0 +1,30 @@
import logging
from typing import Optional
import colorlog
LOG_FORMAT = "%(asctime)s.%(msecs)03d %(name)s %(levelname)s: %(message)s"
DATE_FORMAT = "%Y%m%d-%H:%M:%S"
LOGLEVEL = logging.DEBUG
formatter = colorlog.ColoredFormatter(
fmt="%(log_color)s" + LOG_FORMAT,
datefmt=DATE_FORMAT,
log_colors={
"DEBUG": "blue",
"INFO": "light_purple",
"WARNING": "yellow",
"ERROR": "red",
"CRITICAL": "bold_white,bg_red",
},
)
handler = logging.StreamHandler()
handler.setLevel(LOGLEVEL)
handler.setFormatter(formatter)
logging.basicConfig(level=LOGLEVEL, handlers=[handler])
def getLogger(name: Optional[str] = None):
return logging.getLogger(name)

View File

View File

@@ -0,0 +1,29 @@
import json
from testing_util import run_test
def handle(event, context):
problem = event.get("problem", "")
code = event.get("code", "")
debug = event.get("debug", False)
timeout = event.get("timeout", 6)
if isinstance(problem, str):
problem = json.loads(problem)
# print(f"problem:{problem}, code:{code}, debug: {debug}\n")
result, metadata = run_test(problem, test=code, debug=debug, timeout=timeout)
return {"result": result, "metadata": metadata}
# handle(
# {
# "problem": {
# "input_output": '{"inputs": ["1\\n", "2\\n"], "outputs": ["1\\n", "2\\n"]}'
# },
# "code": "s = input()\nprint(s)\n",
# "debug": True,
# },
# None,
# )

View File

@@ -0,0 +1,771 @@
import ast
import faulthandler
import json
import platform
# to run the solution files we're using a timing based approach
import signal
import sys
import types
# used for debugging to time steps
from datetime import datetime
from enum import Enum
# for capturing the stdout
from io import StringIO
# used for testing the code that reads from input
from unittest.mock import mock_open, patch
import numpy as np
# Note: adapted from https://github.com/LiveCodeBench/LiveCodeBench
class RuntimeModule:
@staticmethod
def from_string(name, _, code):
module = types.ModuleType(name)
exec(code, module.__dict__)
return module
def truncatefn(s, length=300):
assert isinstance(s, str)
if len(s) <= length:
return s
return s[: length // 2] + "...(truncated) ..." + s[-length // 2 :]
class CODE_TYPE(Enum):
call_based = 0
standard_input = 1
# stuff for setting up signal timer
class TimeoutException(Exception):
pass
def timeout_handler(signum, frame):
print("alarm went off")
# return
raise TimeoutException
signal.signal(signal.SIGALRM, timeout_handler)
# timeout = 6 # seconds
# used to capture stdout as a list
# from https://stackoverflow.com/a/16571630/6416660
# alternative use redirect_stdout() from contextlib
class Capturing(list):
def __enter__(self):
self._stdout = sys.stdout
sys.stdout = self._stringio = StringIO()
# Make closing the StringIO a no-op
self._stringio.close = lambda x: 1
return self
def __exit__(self, *args):
self.append(self._stringio.getvalue())
del self._stringio # free up some memory
sys.stdout = self._stdout
def only_int_check(val):
return isinstance(val, int)
def string_int_check(val):
return isinstance(val, str) and val.isdigit()
def combined_int_check(val):
return only_int_check(val) or string_int_check(val)
def run_test(sample, test=None, debug=False, timeout=6):
"""
if test(generated_code) is not None it'll try to run the code.
otherwise it'll just return an input and output pair.
"""
# Disable functionalities that can make destructive changes to the test.
reliability_guard()
if debug:
print(f"start = {datetime.now().time()}")
try:
in_outs = json.loads(sample["input_output"])
except ValueError:
in_outs = None
which_type = CODE_TYPE.call_based
if in_outs:
if in_outs.get("fn_name") is None:
which_type = CODE_TYPE.standard_input # Standard input
method_name = None
else:
which_type = CODE_TYPE.call_based # Call-based
method_name = in_outs["fn_name"]
if debug:
print(f"loaded input_output = {datetime.now().time()}")
if test is None:
assert False, "should not happen: test code is none"
return in_outs, {"error": "no test code provided"}
elif test is not None:
results = []
sol = "from string import *\nfrom re import *\nfrom datetime import *\nfrom collections import *\nfrom heapq import *\nfrom bisect import *\nfrom copy import *\nfrom math import *\nfrom random import *\nfrom statistics import *\nfrom itertools import *\nfrom functools import *\nfrom operator import *\nfrom io import *\nfrom sys import *\nfrom json import *\nfrom builtins import *\nfrom typing import *\nimport string\nimport re\nimport datetime\nimport collections\nimport heapq\nimport bisect\nimport copy\nimport math\nimport random\nimport statistics\nimport itertools\nimport functools\nimport operator\nimport io\nimport sys\nimport json\nsys.setrecursionlimit(6*10**5)\n"
if debug:
print(f"loading test code = {datetime.now().time()}")
if which_type == CODE_TYPE.call_based:
sol += test
if debug:
print(f"sol = {sol}")
signal.alarm(timeout)
try:
tmp_sol = RuntimeModule.from_string("tmp_sol", "", sol)
if "class Solution" not in test:
tmp = tmp_sol
else:
tmp = tmp_sol.Solution()
signal.alarm(0)
except Exception as e:
signal.alarm(0)
if debug:
print(f"type 0 compilation error = {e}")
results.append(-2)
return results, {
"error": repr(e),
"error_code": -1,
"error_message": "Compilation Error",
}
signal.alarm(0)
elif which_type == CODE_TYPE.standard_input:
# sol
# if code has if __name__ == "__main__": then remove it
try:
astree = ast.parse(test)
last_block = astree.body[-1]
if isinstance(last_block, ast.If):
condition = last_block.test
if ast.unparse(condition).strip() == "__name__ == '__main__'":
test = (
ast.unparse(astree.body[:-1])
+ "\n"
+ ast.unparse(last_block.body)
)
except:
pass
tmp_test = test.split("\n")
new_test = []
for x in tmp_test:
if (not x.startswith("from ")) and (not x.startswith("import ")):
new_test.append("\t" + x + "\n")
else:
new_test.append(x + "\n")
tmp_test = new_test
new_test = ""
started = False
for i in tmp_test:
if i.startswith("\t") and not started:
new_test += "stdin = sys.stdin\nstdout = sys.stdout\n"
new_test += "def code():\n"
new_test += i
started = True
elif started and ((i.startswith("from ")) or (i.startswith("import "))):
new_test += "\t" + i
else:
new_test += i
tmp_test = new_test
sol += tmp_test
if debug:
print(f"sol = {sol}")
method_name = "code"
signal.alarm(timeout)
try:
tmp_sol = RuntimeModule.from_string("tmp_sol", "", sol)
tmp = tmp_sol
signal.alarm(0)
except Exception as e:
signal.alarm(0)
if debug:
print(f"type 1 compilation error = {e}")
results.append(-2)
return results, {
"error": repr(e),
"error_code": -1,
"error_message": "Compilation Error",
}
signal.alarm(0)
if debug:
print(f"get method = {datetime.now().time()}")
try:
method = getattr(tmp, method_name) # get_attr second arg must be str
except:
signal.alarm(0)
e = sys.exc_info()
print(f"unable to get function error = {e}")
results.append(-2)
return results, {
"error": repr(e),
"error_code": -1,
"error_message": "Unable to extract code",
}
for index, inputs in enumerate(in_outs["inputs"]):
raw_inputs = inputs
raw_outputs = in_outs["outputs"][index]
if which_type == CODE_TYPE.call_based:
inputs = [json.loads(line) for line in inputs.split("\n")]
in_outs["outputs"][index] = json.loads(in_outs["outputs"][index])
truncate_line_size = 300 // (raw_inputs.count("\n") + 1)
raw_inputs = "\n".join(
[
truncatefn(line, truncate_line_size)
for line in raw_inputs.strip().split("\n")
]
)
raw_outputs = truncatefn(raw_outputs, 200)
else:
raw_inputs = truncatefn(raw_inputs)
raw_outputs = truncatefn(raw_outputs, 200)
# JSON forces dictionaries to have string keys; this undoes this (assuming a singleton list)
try:
if isinstance(inputs[0], dict):
inputs = [{int(k): v for k, v in inputs[0].items()}]
except:
pass
try:
if isinstance(in_outs["outputs"][index], dict):
in_outs["outputs"][index] = [
{int(k): v for k, v in in_outs["outputs"][index].items()}
]
except:
pass
try:
if isinstance(in_outs["outputs"][index][0], dict):
in_outs["outputs"][index] = [
{int(k): v for k, v in in_outs["outputs"][index][0].items()}
]
except:
pass
if debug:
print(
f"time: {datetime.now().time()} testing index = {index} inputs = {inputs}, {type(inputs)}. type = {which_type}"
)
if which_type == CODE_TYPE.call_based: # Call-based
signal.alarm(timeout)
faulthandler.enable()
try:
output = method(*inputs)
raw_true_output = output
raw_true_output_copy = json.dumps(output)
raw_true_output_copy = truncatefn(raw_true_output_copy, 200)
# ground truth sequences are not tuples
if isinstance(output, tuple):
output = list(output)
tmp_result = output == in_outs["outputs"][index]
if (
isinstance(in_outs["outputs"][index], list)
and in_outs["outputs"][index]
):
tmp_result = tmp_result or (
output == in_outs["outputs"][index][0]
)
# ground truth sequences are not tuples
try:
if isinstance(output[0], tuple):
tmp_result = tmp_result or (
[list(x) for x in output]
== in_outs["outputs"][index][0]
)
except:
pass
results.append(tmp_result)
if tmp_result != True:
return results, {
"output": raw_true_output_copy,
"expected": raw_outputs,
"inputs": raw_inputs,
"error_code": -2,
"error_message": "Wrong Answer",
}
# reset the alarm
signal.alarm(0)
except Exception as e:
signal.alarm(0)
faulthandler.disable()
if debug:
print(
f"Call-based runtime error or time limit exceeded error = {e}"
)
results.append(-1)
if "timeoutexception" in repr(e).lower():
return results, {
"error": repr(e),
"error_code": -3,
"error_message": "Time Limit Exceeded",
"inputs": raw_inputs,
"expected": raw_outputs,
}
else:
return results, {
"error": repr(e),
"error_code": -4,
"error_message": "Runtime Error",
"inputs": raw_inputs,
"expected": raw_outputs,
}
faulthandler.disable()
signal.alarm(0)
if debug:
print(
f"outputs = {output}, test outputs = {in_outs['outputs'][index]}, inputs = {inputs}, {type(inputs)}, {output == [in_outs['outputs'][index]]}"
)
elif which_type == CODE_TYPE.standard_input: # Standard input
faulthandler.enable()
passed = False
if isinstance(inputs, list):
inputs = "\n".join(inputs)
if isinstance(in_outs["outputs"][index], list):
in_outs["outputs"][index] = "\n".join(in_outs["outputs"][index])
signal.alarm(timeout)
with Capturing() as output:
try:
call_method(method, inputs)
# reset the alarm
signal.alarm(0)
passed = True
except Exception as e:
# runtime error or took too long
signal.alarm(0)
print(
f"Standard input runtime error or time limit exceeded error = {repr(e)}{e}"
)
results.append(-1)
if "timeoutexception" in repr(e).lower():
return results, {
"error": repr(e),
"error_code": -3,
"error_message": "Time Limit Exceeded",
"inputs": raw_inputs,
"expected": raw_outputs,
}
else:
return results, {
"error": repr(e),
"error_code": -4,
"error_message": "Runtime Error",
"inputs": raw_inputs,
"expected": raw_outputs,
}
signal.alarm(0)
raw_true_output = output[0]
raw_true_output_copy = truncatefn(raw_true_output, 200)
output = raw_true_output.splitlines()
if not passed:
if debug:
nl = "\n"
if not isinstance(inputs, list):
print(
f"not passed output = {output}, test outputs = {in_outs['outputs'][index]}, inputs = {inputs.replace(nl,' new-line ')}, {type(inputs)}, {output == [in_outs['outputs'][index]]}"
)
else:
print(
f"not passed output = {output}, test outputs = {in_outs['outputs'][index]}, inputs = {inputs}, {type(inputs)}, {output == [in_outs['outputs'][index]]}"
)
continue
if passed and debug:
print(
f"==> output = {output}, test outputs = {in_outs['outputs'][index]}"
)
if custom_compare_(output, in_outs["outputs"][index]):
tmp_result = True
results.append(tmp_result)
continue
# ground truth sequences are expressed as lists not tuples
if isinstance(output, tuple):
output = list(output)
tmp_result = False
try:
tmp_result = output == [in_outs["outputs"][index]]
if isinstance(in_outs["outputs"][index], list):
tmp_result = tmp_result or (output == in_outs["outputs"][index])
if isinstance(output[0], str):
tmp_result = tmp_result or (
[e.strip() for e in output] == in_outs["outputs"][index]
)
except Exception as e:
if debug:
print(f"Failed check1 exception = {e}")
pass
if tmp_result == True:
results.append(tmp_result)
continue
# try one more time without \n
if isinstance(in_outs["outputs"][index], list):
for tmp_index, i in enumerate(in_outs["outputs"][index]):
in_outs["outputs"][index][tmp_index] = i.split("\n")
in_outs["outputs"][index][tmp_index] = [
x.strip() for x in in_outs["outputs"][index][tmp_index] if x
]
else:
in_outs["outputs"][index] = in_outs["outputs"][index].split("\n")
in_outs["outputs"][index] = list(
filter(len, in_outs["outputs"][index])
)
in_outs["outputs"][index] = list(
map(lambda x: x.strip(), in_outs["outputs"][index])
)
try:
tmp_result = output == [in_outs["outputs"][index]]
if isinstance(in_outs["outputs"][index], list):
tmp_result = tmp_result or (output == in_outs["outputs"][index])
except Exception as e:
if debug:
print(f"Failed check2 exception = {e}")
pass
if tmp_result == True:
results.append(tmp_result)
continue
# try by converting the output into a split up list too
if isinstance(output, list):
output = list(filter(len, output))
if debug:
nl = "\n"
if not isinstance(inputs, list):
print(
f"@1 output = {output}, test outputs = {in_outs['outputs'][index]}, inputs = {inputs.replace(nl,' new-line ')}, {type(inputs)}, {output == [in_outs['outputs'][index]]} {tmp_result=}"
)
else:
print(
f"@1 output = {output}, test outputs = {in_outs['outputs'][index]}, inputs = {inputs}, {type(inputs)}, {output == [in_outs['outputs'][index]]} {tmp_result=}"
)
if tmp_result == True:
results.append(tmp_result)
continue
if debug:
print(f"{tmp_result=} @a")
try:
tmp_result = output == [in_outs["outputs"][index]]
if isinstance(in_outs["outputs"][index], list):
tmp_result = tmp_result or (output == in_outs["outputs"][index])
except Exception as e:
if debug:
print(f"Failed check3 exception = {e}")
pass
if debug:
print(f"{tmp_result=} @b")
try:
all_ints = all(
combined_int_check(e1) and combined_int_check(e2)
for e1, e2 in zip(output, in_outs["outputs"][index])
)
if not all_ints:
if debug:
print(
[
combined_int_check(e1) and combined_int_check(e2)
for e1, e2 in zip(output, in_outs["outputs"][index])
]
)
output_float = [float(e) for e in output]
gt_float = [float(e) for e in in_outs["outputs"][index]]
tmp_result = tmp_result or (
(len(output_float) == len(gt_float))
and np.allclose(output_float, gt_float)
)
except Exception as e:
pass
if debug:
print(f"{tmp_result=} @c")
try:
if isinstance(output[0], list):
all_ints = all(
combined_int_check(e1) and combined_int_check(e2)
for e1, e2 in zip(output[0], in_outs["outputs"][index])
)
if not all_ints:
output_float = [float(e) for e in output[0]]
gt_float = [float(e) for e in in_outs["outputs"][index][0]]
tmp_result = tmp_result or (
(len(output_float) == len(gt_float))
and np.allclose(output_float, gt_float)
)
except Exception as e:
pass
if tmp_result == True:
results.append(tmp_result)
continue
if debug:
print(f"{tmp_result=} @d")
# try by converting the stuff into split up list
if isinstance(in_outs["outputs"][index], list):
for tmp_index, i in enumerate(in_outs["outputs"][index]):
in_outs["outputs"][index][tmp_index] = set(i.split())
else:
in_outs["outputs"][index] = set(in_outs["outputs"][index].split())
if debug:
print(f"{tmp_result=} @e")
try:
tmp_result = output == in_outs["outputs"][index]
except Exception as e:
if debug:
print(f"Failed check4 exception = {e}")
continue
if tmp_result == True:
results.append(tmp_result)
continue
if debug:
print(f"{tmp_result=} @f")
# try by converting the output into a split up list too
if isinstance(output, list):
for tmp_index, i in enumerate(output):
output[tmp_index] = i.split()
output = list(filter(len, output))
for tmp_index, i in enumerate(output):
output[tmp_index] = set(i)
else:
output = output.split()
output = list(filter(len, output))
output = set(output)
if debug:
print(f"{tmp_result=} @g")
# try:
# tmp_result = set(frozenset(s) for s in output) == set(
# frozenset(s) for s in in_outs["outputs"][index]
# )
# except Exception as e:
# if debug:
# print(f"Failed check5 exception = {e}")
# if they are all numbers, round so that similar numbers are treated as identical
# try:
# all_ints = all(
# combined_int_check(e1) and combined_int_check(e2)
# for e1, e2 in zip(output, in_outs["outputs"][index])
# )
# tmp_result = tmp_result or (
# set(frozenset(round(float(t), 3) for t in s) for s in output)
# == set(
# frozenset(round(float(t), 3) for t in s)
# for s in in_outs["outputs"][index]
# )
# )
# except Exception as e:
# if debug:
# print(f"Failed check6 exception = {e}")
if debug:
print(f"{tmp_result=} @h")
if tmp_result == True and debug:
print("PASSED")
results.append(tmp_result)
if tmp_result != True:
return results, {
"output": raw_true_output_copy,
"expected": raw_outputs,
"inputs": raw_inputs,
"error_code": -2,
"error_message": "Wrong Answer",
}
if debug:
nl = "\n"
if not isinstance(inputs, list):
print(
f"@2 output = {output}, test outputs = {in_outs['outputs'][index]}, inputs = {inputs.replace(nl,' new-line ')}, {type(inputs)}, {output == [in_outs['outputs'][index]]}"
)
else:
print(
f"@2 output = {output}, test outputs = {in_outs['outputs'][index]}, inputs = {inputs}, {type(inputs)}, {output == [in_outs['outputs'][index]]}"
)
print(f"results = {results}")
return results, {}
def custom_compare_(output, ground_truth):
if isinstance(output, list):
output_1 = "\n".join(output)
if stripped_string_compare(output_1, ground_truth):
return True
if isinstance(output, list):
output_2 = [o.lstrip().rstrip() for o in output]
output_2 = "\n".join(output_2)
if stripped_string_compare(output_2, ground_truth):
return True
return False
def stripped_string_compare(s1, s2):
s1 = s1.lstrip().rstrip()
s2 = s2.lstrip().rstrip()
return s1 == s2
def call_method(method, inputs):
if isinstance(inputs, list):
inputs = "\n".join(inputs)
inputs_line_iterator = iter(inputs.split("\n"))
# sys.setrecursionlimit(10000)
# @patch('builtins.input', side_effect=inputs.split("\n"))
@patch("builtins.open", mock_open(read_data=inputs))
@patch("sys.stdin", StringIO(inputs))
@patch("sys.stdin.readline", lambda *args: next(inputs_line_iterator))
@patch("sys.stdin.readlines", lambda *args: inputs.split("\n"))
@patch("sys.stdin.read", lambda *args: inputs)
# @patch('sys.stdout.write', print)
def _inner_call_method(_method):
try:
return _method()
except SystemExit as e:
pass
finally:
pass
return _inner_call_method(method)
def reliability_guard(maximum_memory_bytes=None):
"""
This disables various destructive functions and prevents the generated code
from interfering with the test (e.g. fork bomb, killing other processes,
removing filesystem files, etc.)
WARNING
This function is NOT a security sandbox. Untrusted code, including model-
generated code, should not be blindly executed outside of one. See the
Codex paper for more information about OpenAI's code sandbox, and proceed
with caution.
"""
if maximum_memory_bytes is not None:
import resource
resource.setrlimit(
resource.RLIMIT_AS, (maximum_memory_bytes, maximum_memory_bytes)
)
resource.setrlimit(
resource.RLIMIT_DATA, (maximum_memory_bytes, maximum_memory_bytes)
)
if not platform.uname().system == "Darwin":
resource.setrlimit(
resource.RLIMIT_STACK, (maximum_memory_bytes, maximum_memory_bytes)
)
faulthandler.disable()
import builtins
builtins.exit = None
builtins.quit = None
import os
if os.putenv:
os.environ["OMP_NUM_THREADS"] = "20"
os.kill = None
os.system = None
os.putenv = None
os.remove = None
os.removedirs = None
os.rmdir = None
os.fchdir = None
os.setuid = None
os.fork = None
os.forkpty = None
os.killpg = None
os.rename = None
os.renames = None
os.truncate = None
os.replace = None
os.unlink = None
os.fchmod = None
os.fchown = None
os.chmod = None
os.chown = None
os.chroot = None
os.fchdir = None
os.lchflags = None
os.lchmod = None
os.lchown = None
os.getcwd = None
os.chdir = None
import shutil
shutil.rmtree = None
shutil.move = None
shutil.chown = None
import subprocess
subprocess.Popen = None # type: ignore
__builtins__["help"] = None
import sys
sys.modules["ipdb"] = None
sys.modules["joblib"] = None
sys.modules["resource"] = None
sys.modules["psutil"] = None
sys.modules["tkinter"] = None
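
Below is a small illustrative driver for run_test, mirroring the commented-out example in the handler above; the toy problem and code are placeholders. Each entry of the returned results list is True/False for a passed/failed test case, -2 for a compilation error, and -1 for a runtime error or timeout, with details in metadata. Note that run_test calls reliability_guard(), which permanently disables many os/shutil/sys functions in the calling process, so it is meant to run inside a sandboxed worker:

import json
from testing_util import run_test  # same deployment-local import as the handler above

problem = {"input_output": json.dumps({"inputs": ["1\n", "2\n"], "outputs": ["1\n", "2\n"]})}
code = "s = input()\nprint(s)\n"  # echoes stdin, so both test cases should pass

results, metadata = run_test(problem, test=code, debug=False, timeout=6)
print(results)   # e.g. [True, True]
print(metadata)  # {} on success; error_code / error_message on failure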

View File

@@ -0,0 +1,194 @@
import json
import multiprocessing
import os
import sys
import time
import traceback
from io import StringIO
from function.testing_util import run_test
from functioncall.base import logging
logger = logging.getLogger("Functioncall")
def try_load_json(data):
"""
Attempts to load the given data as JSON. If successful, returns the parsed JSON.
Otherwise, logs the failure and returns the original data.
"""
try:
loaded_data = json.loads(data)
return loaded_data
except json.JSONDecodeError as e:
logger.info(f"Failed to load JSON: {e}")
return data
def capture_stdout(code):
original_stdout = sys.stdout
fake_stdout = StringIO()
try:
sys.stdout = fake_stdout  # redirect stdout to the buffer
exec(code, {"__builtins__": __builtins__})  # execute in an isolated namespace
except Exception as e:
return f"error: {str(e)}, traceback: {traceback.format_exc()}"
finally:
sys.stdout = original_stdout  # restore the original stdout
return fake_stdout.getvalue()
def _temp_run(problem, generation, debug, result):
start_time = time.time()
try:
if debug:
logger.debug(f"Running test for problem: {problem}")
result.append(run_test(sample=problem, test=generation, debug=debug))
if debug:
logger.debug(f"Test completed with result: {result}")
except Exception as e:
if debug:
logger.error(f"Error in _temp_run: {e}, problem:{problem}", exc_info=True)
execution_time = time.time() - start_time
logger.info(
f'[_temp_run] query_id: {problem["problem_id"]}, start_time: {str(start_time)}, Time elapsed: {execution_time * 1000:.0f} ms'
)
def check_correctness(problem, generation, timeout, debug=False):
"""Check correctness of code generation with a global timeout.
The global timeout is to catch some extreme/rare cases not handled by the timeouts
inside `run_test`"""
if debug:
result = capture_stdout(
"from function.testing_util import run_test\n"
+ "run_test(sample=problem, test=generation, debug=debug)"
)
return result[0], result[1]
start_time = time.time()
manager = multiprocessing.Manager()
result = manager.list()
p = multiprocessing.Process(
target=_temp_run, args=(problem, generation, debug, result)
)
p.start()
p.join(timeout=timeout + 1)
if p.is_alive():
if debug:
logger.debug(f"Process is still alive. Killing the process.")
p.kill()
if not result:
# Remark: ideally we would consider that all tests failed, but we can't easily access the number of tests here,
# so we use 21 (the average number of tests for a sample in the test split) instead
avg_number_tests = 21
result = [[-1] * avg_number_tests]
if debug:
logger.debug(f"Global timeout occurred, returning default result.")
if debug:
logger.debug(f"Final result: {result}")
execution_time = time.time() - start_time
logger.info(
f'[check_correctness] query_id: {problem["problem_id"]}, start_time: {str(start_time)}, Time elapsed: {execution_time * 1000:.0f} ms'
)
return result[0]
def load_problems(path):
problem_map = {}
for idx, line in enumerate(open(path, "rb")):
if line is not None:
line = line.strip().decode("utf-8")
row = json.loads(line)
query_id = str(row["id"])
problem_map[query_id] = {
"problem_id": query_id,
# "question": row["question"],
"solutions": row["solutions"],
"input_output": row["input_output"],
# "difficulty": level,
# "url": row["url"],
# "starter_code": row["starter_code"],
}
return problem_map
global_problems = load_problems(
os.getenv(
"REAL_CODE_METADATA_PATH",
"/storage/datasets/codeparrot-apps-test.jsonl",
)
)
def code_verify(generateds, query_ids, debug=False):
assert len(generateds) == len(query_ids), (
len(generateds),
len(query_ids),
)
result = []
global global_problems
for idx, query_id in enumerate(query_ids):
if query_id not in global_problems:
continue
problem = global_problems[query_id]
test_code = generateds[idx]
logger.debug(f"run_batch_code, query_id: {query_id}")
try:
curr_res, metadata = check_correctness(
problem=problem, generation=test_code, timeout=6000, debug=debug
)
if any(x != True for x in curr_res):
logger.debug(
f'id:{problem["problem_id"]}, Results were not all True: {metadata}'
)
result.append(f"{query_id} failed")
else:
# print(f"id:{problem["problem_id"]}, result : {curr_res}")
result.append(f"{query_id} success")
except Exception as e:
logger.error(f"test framework exception = {repr(e)}{e}\n", exc_info=True)
result.append(f"{query_id} failed")
break
finally:
# print(f"id:{problem["problem_id"]}, result : {curr_res}")
# assert isinstance(curr_res, list)
pass
return result
if __name__ == "__main__":
def create_test_params(index_list):
global global_problems
codes, query_ids = [], []
for index in index_list:
if str(index) not in global_problems:
continue
problem = global_problems[str(index)]
if "solutions" not in problem or not problem["solutions"]:
continue
codes.append(try_load_json(problem["solutions"])[0])
query_ids.append(str(index))
return codes, query_ids
codes, query_ids = create_test_params(list(range(805, 806)))
result = code_verify(codes, query_ids, True)
print(result)

152
functioncall/code/verify.py Normal file
View File

@@ -0,0 +1,152 @@
import json
import os
from collections import defaultdict
from functioncall.base import logging
from functioncall.base.call import batch_function_call
logger = logging.getLogger("Functioncall")
def try_load_json(data):
"""
Attempts to load the given data as JSON. If successful, returns the parsed JSON.
Otherwise, returns the original data unchanged.
"""
try:
loaded_data = json.loads(data)
return loaded_data
except json.JSONDecodeError as e:
# print(f"Failed to load JSON: {e}")
return data
def load_problems_with_testcase_batch(path, debug=False, test_case_batch_size=None):
problem_map = defaultdict(list)
for idx, line in enumerate(open(path, "rb")):
if line is None:
continue
# parse one problem
row = json.loads(line.strip().decode("utf-8"))
query_id = str(row["id"])
input_output = json.loads(row["input_output"]) if "input_output" in row else {}
inputs = input_output.get("inputs", [])
outputs = input_output.get("outputs", [])
assert len(inputs) == len(
outputs
), f"Inputs({len(inputs)}) and outputs({len(outputs)}) mismatch for {query_id}"
other_io_fields = {
k: v for k, v in input_output.items() if k not in ["inputs", "outputs"]
}
# create batches for testcases
if not test_case_batch_size or test_case_batch_size <= 0:
test_case_batch_size = len(inputs)
for batch_idx in range(0, len(inputs), test_case_batch_size):
batch_io = {
**other_io_fields,
"inputs": inputs[batch_idx : batch_idx + test_case_batch_size],
"outputs": outputs[batch_idx : batch_idx + test_case_batch_size],
}
sub_problem = {
"problem_id": query_id,
"input_output": json.dumps(batch_io),
"batch_index": batch_idx,
}
if debug:
sub_problem["solutions"] = row.get("solutions", [])
problem_map[query_id].append(sub_problem)
return problem_map
global_problems = load_problems_with_testcase_batch(
os.getenv(
"REAL_CODE_METADATA_PATH",
"/storage/datasets/codeparrot-apps-test.jsonl",
),
debug=True,
test_case_batch_size=20,
)
def code_verify(generateds, query_ids, debug=False, timeout=20, timeout_for_testcase=6):
assert len(generateds) == len(query_ids), (
len(generateds),
len(query_ids),
)
payload_list = []
global global_problems
for idx, query_id in enumerate(query_ids):
if query_id not in global_problems:
payload_list.append(None)
logger.warning(
f"Invalid query id : {query_id}, type: {type(query_id)}, should be in problem dataset!"
)
continue
problems = global_problems[query_id]
for problem in problems:
payload_list.append(
{
"problem": problem,
"code": generateds[idx],
"debug": debug,
"timeout": timeout_for_testcase,
"query_index": idx,
}
)
logger.debug(
f"code_verify, payload_list size: {len(payload_list)}, query size: {len(query_ids)}, query_id_0: {query_ids[0]}"
)
rsp_list = batch_function_call(payload_list, "python_code", timeout=timeout)
results = [1] * len(query_ids)
for idx, rsp in enumerate(rsp_list):
# Query ids missing from the dataset were appended as None payloads above; skip them.
if payload_list[idx] is None:
continue
query_index = payload_list[idx]["query_index"]
query_id = query_ids[query_index]
value = 0
if rsp and "result" in rsp and not any(x != True for x in rsp["result"]):
value = 1
else:
logger.debug(
f"Functioncall code verify failed, query index: {query_index}, query id: {query_id}, results: {rsp}"
)
# logger.debug(f"query index: {idx}, value: {value}, results[query_index]: {results[query_index]}")
results[query_index] = results[query_index] and value
return results
if __name__ == "__main__":
def create_test_params(count=10):
global global_problems
codes, query_ids = [], []
idx = 0
for query_id, problems in global_problems.items():
if idx >= count:
break
problem = problems[0]
if "solutions" not in problem or not problem["solutions"]:
continue
codes.append(try_load_json(problem["solutions"])[0])
query_ids.append(query_id)
idx += 1
return codes, query_ids
codes, query_ids = create_test_params(1000)
result = code_verify(codes, query_ids, True, 100)
print(result)
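
Note that each problem's test cases are split into batches of up to 20 (test_case_batch_size above), every batch becomes one FaaS payload, and the per-query reward is the AND over all of that query's batches. A minimal sketch of that aggregation with hypothetical batch responses:

# Hypothetical responses for the three batches of a single query.
batch_responses = [
    {"result": [True] * 20},    # batch 0: all cases passed
    {"result": [True] * 20},    # batch 1: all cases passed
    {"result": [True, False]},  # batch 2: one failing case
]

reward = 1
for rsp in batch_responses:
    value = 1 if rsp and "result" in rsp and not any(x != True for x in rsp["result"]) else 0
    reward = reward and value  # same aggregation as results[query_index] above
print(reward)  # 0 -- any failing test case zeroes the reward for the query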

View File

View File

@@ -0,0 +1,396 @@
"""
This logic is largely copied from Hendrycks' MATH release (math_equivalence) and borrows from:
- https://github.com/microsoft/ProphetNet/tree/master/CRITIC
- https://github.com/openai/prm800k
- https://github.com/microsoft/ToRA/blob/main/src/eval/grader.py
- https://github.com/deepseek-ai/DeepSeek-Math/blob/main/evaluation/eval/eval_utils.py
"""
import multiprocessing
import re
from collections import defaultdict
from math import isclose
from typing import Union
import regex
from latex2sympy2 import latex2sympy
from sympy import N, simplify
from sympy.parsing.latex import parse_latex
from sympy.parsing.sympy_parser import parse_expr
# from .parser import choice_answer_clean, strip_string
# from parser import choice_answer_clean
def choice_answer_clean(pred: str):
pred = pred.strip("\n").rstrip(".").rstrip("/").strip(" ").lstrip(":")
# Clean the answer based on the dataset
tmp = re.findall(r"\b(A|B|C|D|E)\b", pred.upper())
if tmp:
pred = tmp
else:
pred = [pred.strip().strip(".")]
pred = pred[-1]
# Remove the period at the end, again!
pred = pred.rstrip(".").rstrip("/")
return pred
def parse_digits(num):
num = regex.sub(",", "", str(num))
try:
return float(num)
except:
if num.endswith("%"):
num = num[:-1]
if num.endswith("\\"):
num = num[:-1]
try:
return float(num) / 100
except:
pass
return None
def is_digit(num):
# paired with parse_digits
return parse_digits(num) is not None
def str_to_pmatrix(input_str):
input_str = input_str.strip()
matrix_str = re.findall(r"\{.*,.*\}", input_str)
pmatrix_list = []
for m in matrix_str:
m = m.strip("{}")
pmatrix = r"\begin{pmatrix}" + m.replace(",", "\\") + r"\end{pmatrix}"
pmatrix_list.append(pmatrix)
return ", ".join(pmatrix_list)
def math_equal(
prediction: Union[bool, float, str],
reference: Union[float, str],
include_percentage: bool = True,
is_close: bool = True,
timeout: bool = False,
) -> bool:
"""
Exact match of math if and only if:
1. numerical equal: both can convert to float and are equal
2. symbolic equal: both can convert to sympy expression and are equal
"""
# print("Judge:", prediction, reference)
if prediction is None or reference is None:
return False
if str(prediction.strip().lower()) == str(reference.strip().lower()):
return True
if (
reference in ["A", "B", "C", "D", "E"]
and choice_answer_clean(prediction) == reference
):
return True
try: # 1. numerical equal
if is_digit(prediction) and is_digit(reference):
prediction = parse_digits(prediction)
reference = parse_digits(reference)
# number questions
if include_percentage:
gt_result = [reference / 100, reference, reference * 100]
else:
gt_result = [reference]
for item in gt_result:
try:
if is_close:
if numeric_equal(prediction, item):
return True
else:
if item == prediction:
return True
except Exception:
continue
return False
except:
pass
if not prediction and prediction not in [0, False]:
return False
# 2. symbolic equal
reference = str(reference).strip()
prediction = str(prediction).strip()
## pmatrix (amps)
if "pmatrix" in prediction and not "pmatrix" in reference:
reference = str_to_pmatrix(reference)
## deal with [], (), {}
pred_str, ref_str = prediction, reference
if (
prediction.startswith("[")
and prediction.endswith("]")
and not reference.startswith("(")
) or (
prediction.startswith("(")
and prediction.endswith(")")
and not reference.startswith("[")
):
pred_str = pred_str.strip("[]()")
ref_str = ref_str.strip("[]()")
for s in ["{", "}", "(", ")"]:
ref_str = ref_str.replace(s, "")
pred_str = pred_str.replace(s, "")
if pred_str.lower() == ref_str.lower():
return True
## [a, b] vs. [c, d], return a==c and b==d
if (
regex.match(r"(\(|\[).+(\)|\])", prediction) is not None
and regex.match(r"(\(|\[).+(\)|\])", reference) is not None
):
pred_parts = prediction[1:-1].split(",")
ref_parts = reference[1:-1].split(",")
if len(pred_parts) == len(ref_parts):
if all(
[
math_equal(
pred_parts[i], ref_parts[i], include_percentage, is_close
)
for i in range(len(pred_parts))
]
):
return True
if (
(
prediction.startswith("\\begin{pmatrix}")
or prediction.startswith("\\begin{bmatrix}")
)
and (
prediction.endswith("\\end{pmatrix}")
or prediction.endswith("\\end{bmatrix}")
)
and (
reference.startswith("\\begin{pmatrix}")
or reference.startswith("\\begin{bmatrix}")
)
and (
reference.endswith("\\end{pmatrix}") or reference.endswith("\\end{bmatrix}")
)
):
pred_lines = [
line.strip()
for line in prediction[
len("\\begin{pmatrix}") : -len("\\end{pmatrix}")
].split("\\\\")
if line.strip()
]
ref_lines = [
line.strip()
for line in reference[
len("\\begin{pmatrix}") : -len("\\end{pmatrix}")
].split("\\\\")
if line.strip()
]
matched = True
if len(pred_lines) == len(ref_lines):
for pred_line, ref_line in zip(pred_lines, ref_lines):
pred_parts = pred_line.split("&")
ref_parts = ref_line.split("&")
if len(pred_parts) == len(ref_parts):
if not all(
[
math_equal(
pred_parts[i],
ref_parts[i],
include_percentage,
is_close,
)
for i in range(len(pred_parts))
]
):
matched = False
break
else:
matched = False
if not matched:
break
else:
matched = False
if matched:
return True
if prediction.count("=") == 1 and reference.count("=") == 1:
pred = prediction.split("=")
pred = f"{pred[0].strip()} - ({pred[1].strip()})"
ref = reference.split("=")
ref = f"{ref[0].strip()} - ({ref[1].strip()})"
if symbolic_equal(pred, ref) or symbolic_equal(f"-({pred})", ref):
return True
elif (
prediction.count("=") == 1
and len(prediction.split("=")[0].strip()) <= 2
and "=" not in reference
):
if math_equal(
prediction.split("=")[1], reference, include_percentage, is_close
):
return True
elif (
reference.count("=") == 1
and len(reference.split("=")[0].strip()) <= 2
and "=" not in prediction
):
if math_equal(
prediction, reference.split("=")[1], include_percentage, is_close
):
return True
# symbolic equal with sympy
if timeout:
if call_with_timeout(symbolic_equal_process, prediction, reference):
return True
else:
if symbolic_equal(prediction, reference):
return True
return False
def math_equal_process(param):
return math_equal(param[-2], param[-1])
def numeric_equal(prediction: float, reference: float):
# Note that relative tolerance has significant impact
# on the result of the synthesized GSM-Hard dataset
# if reference.is_integer():
# return isclose(reference, round(prediction), abs_tol=1e-4)
# else:
# prediction = round(prediction, len(str(reference).split(".")[-1]))
return isclose(reference, prediction, rel_tol=1e-4)
def symbolic_equal(a, b):
def _parse(s):
for f in [parse_latex, parse_expr, latex2sympy]:
try:
return f(s.replace("\\\\", "\\"))
except:
try:
return f(s)
except:
pass
return s
a = _parse(a)
b = _parse(b)
# direct equal
try:
if str(a) == str(b) or a == b:
return True
except:
pass
# simplify equal
try:
if a.equals(b) or simplify(a - b) == 0:
return True
except:
pass
# equation equal
try:
if (abs(a.lhs - a.rhs)).equals(abs(b.lhs - b.rhs)):
return True
except:
pass
try:
if numeric_equal(float(N(a)), float(N(b))):
return True
except:
pass
# matrix
try:
# if a and b are matrix
if a.shape == b.shape:
_a = a.applyfunc(lambda x: round(x, 3))
_b = b.applyfunc(lambda x: round(x, 3))
if _a.equals(_b):
return True
except:
pass
return False
def symbolic_equal_process(a, b, output_queue):
result = symbolic_equal(a, b)
output_queue.put(result)
def call_with_timeout(func, *args, timeout=3, **kwargs):
output_queue = multiprocessing.Queue()
process_args = args + (output_queue,)
process = multiprocessing.Process(target=func, args=process_args, kwargs=kwargs)
process.start()
process.join(timeout)
if process.is_alive():
process.terminate()
process.join()
return False
return output_queue.get()
def _test_math_equal():
# print(math_equal("0.0833333333333333", "\\frac{1}{12}"))
# print(math_equal("(1,4.5)", "(1,\\frac{9}{2})"))
# print(math_equal("\\frac{x}{7}+\\frac{2}{7}", "\\frac{x+2}{7}", timeout=True))
# print(math_equal("\\sec^2(y)", "\\tan^2(y)+1", timeout=True))
# print(math_equal("\\begin{pmatrix}-\\frac{7}{4}&-2\\\\4&\\frac{1}{4}\\end{pmatrix}", "(\\begin{pmatrix}-\\frac{7}{4}&-2\\\\4&\\frac{1}{4}\\\\\\end{pmatrix})", timeout=True))
# pred = '\\begin{pmatrix}\\frac{1}{3x^{2/3}}&0&0\\\\0&1&0\\\\-\\sin(x)&0&0\\end{pmatrix}'
# gt = '(\\begin{pmatrix}\\frac{1}{3\\sqrt[3]{x}^2}&0&0\\\\0&1&0\\\\-\\sin(x)&0&0\\\\\\end{pmatrix})'
# pred= '-\\frac{8x^2}{9(x^2-2)^{5/3}}+\\frac{2}{3(x^2-2)^{2/3}}'
# gt= '-\\frac{2(x^2+6)}{9(x^2-2)\\sqrt[3]{x^2-2}^2}'
# pred = '-34x-45y+20z-100=0'
# gt = '34x+45y-20z+100=0'
# pred = '\\frac{100}{3}'
# gt = '33.3'
# pred = '\\begin{pmatrix}0.290243531202435\\\\0.196008371385084\\\\-0.186381278538813\\end{pmatrix}'
# gt = '(\\begin{pmatrix}0.29\\\\0.196\\\\-0.186\\\\\\end{pmatrix})'
# pred = '\\frac{\\sqrt{\\sqrt{11}+\\sqrt{194}}}{2\\sqrt{33}+15}'
# gt = '\\frac{\\sqrt{\\sqrt{11}+\\sqrt{194}}}{15+2\\sqrt{33}}'
# pred = '(+5)(b+2)'
# gt = '(a+5)(b+2)'
# pred = '\\frac{1+\\sqrt{5}}{2}'
# gt = '2'
# pred = '\\frac{34}{16}+\\frac{\\sqrt{1358}}{16}', gt = '4'
# pred = '1', gt = '1\\\\sqrt{19}'
# pred = "(0.6,2.6667]"
# gt = "(\\frac{3}{5},\\frac{8}{3}]"
gt = "x+0.5+0.5"
pred = "x+1"
print(math_equal(pred, gt, timeout=True))
if __name__ == "__main__":
_test_math_equal()

View File

@@ -0,0 +1,39 @@
import json
from parser import extract_answer
from grader import math_equal
def process_results(answer, solution):
extracted_answer = extract_answer(answer, "math", use_last_number=False)
extracted_solution = extract_answer(solution, "math", use_last_number=True)
print(
f"extracted_answer: {extracted_answer}, extracted_solution: {extracted_solution}, equal: {math_equal(extracted_answer, extracted_solution)}"
)
if extracted_answer is None or extracted_answer.strip() in ["None", "none", ""]:
retval = 0
elif math_equal(extracted_answer, extracted_solution, timeout=True):
retval = 1
else:
retval = 0
return retval, (extracted_answer, extracted_solution)
def handle(event, context):
answers = event.get("answers", "")
solutions = event.get("solutions", "")
print(f"answers:{answers}, solutions:{solutions}\n")
# answers and solutions are JSON lists; run process_results on each pair and collect the results into a list
if isinstance(answers, str):
answers = json.loads(answers)
if isinstance(solutions, str):
solutions = json.loads(solutions)
results = []
for answer, solution in zip(answers, solutions):
results.append(process_results(answer, solution))
return results
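
A hypothetical local invocation of the math handle entry point above; answers and solutions may be JSON-encoded strings (as sent by the verifier) or plain lists, and the values here are placeholders:

import json

event = {
    "answers": json.dumps(["The answer is \\boxed{\\frac{1}{2}}"]),  # placeholder model output
    "solutions": json.dumps(["0.5"]),                                # placeholder reference
}
results = handle(event, None)
# Each entry is (reward, (extracted_answer, extracted_solution)); reward is 1 on a match.
print(results)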

View File

@@ -0,0 +1,765 @@
import random
import re
from typing import Any, Dict, Iterable, List, TypeVar, Union
import regex
import sympy
from latex2sympy2 import latex2sympy
from word2number import w2n
# from utils import *
def _fix_fracs(string):
substrs = string.split("\\frac")
new_str = substrs[0]
if len(substrs) > 1:
substrs = substrs[1:]
for substr in substrs:
new_str += "\\frac"
if len(substr) > 0 and substr[0] == "{":
new_str += substr
else:
try:
assert len(substr) >= 2
except:
return string
a = substr[0]
b = substr[1]
if b != "{":
if len(substr) > 2:
post_substr = substr[2:]
new_str += "{" + a + "}{" + b + "}" + post_substr
else:
new_str += "{" + a + "}{" + b + "}"
else:
if len(substr) > 2:
post_substr = substr[2:]
new_str += "{" + a + "}" + b + post_substr
else:
new_str += "{" + a + "}" + b
string = new_str
return string
def _fix_a_slash_b(string):
if len(string.split("/")) != 2:
return string
a = string.split("/")[0]
b = string.split("/")[1]
try:
if "sqrt" not in a:
a = int(a)
if "sqrt" not in b:
b = int(b)
assert string == "{}/{}".format(a, b)
new_string = "\\frac{" + str(a) + "}{" + str(b) + "}"
return new_string
except:
return string
def _fix_sqrt(string):
_string = re.sub(r"\\sqrt(\w+)", r"\\sqrt{\1}", string)
return _string
def convert_word_number(text: str) -> str:
try:
text = str(w2n.word_to_num(text))
except:
pass
return text
# units mainly from MathQA
unit_texts = [
"east",
"degree",
"mph",
"kmph",
"ft",
"m sqaure",
" m east",
"sq m",
"deg",
"mile",
"q .",
"monkey",
"prime",
"ratio",
"profit of rs",
"rd",
"o",
"gm",
"p . m",
"lb",
"tile",
"per",
"dm",
"lt",
"gain",
"ab",
"way",
"west",
"a .",
"b .",
"c .",
"d .",
"e .",
"f .",
"g .",
"h .",
"t",
"a",
"h",
"no change",
"men",
"soldier",
"pie",
"bc",
"excess",
"st",
"inches",
"noon",
"percent",
"by",
"gal",
"kmh",
"c",
"acre",
"rise",
"a . m",
"th",
"π r 2",
"sq",
"mark",
"l",
"toy",
"coin",
"sq . m",
"gallon",
"° f",
"profit",
"minw",
"yr",
"women",
"feet",
"am",
"pm",
"hr",
"cu cm",
"square",
"v â € ™",
"are",
"rupee",
"rounds",
"cubic",
"cc",
"mtr",
"s",
"ohm",
"number",
"kmph",
"day",
"hour",
"minute",
"min",
"second",
"man",
"woman",
"sec",
"cube",
"mt",
"sq inch",
"mp",
"∏ cm ³",
"hectare",
"more",
"sec",
"unit",
"cu . m",
"cm 2",
"rs .",
"rs",
"kg",
"g",
"month",
"km",
"m",
"cm",
"mm",
"apple",
"liter",
"loss",
"yard",
"pure",
"year",
"increase",
"decrease",
"d",
"less",
"Surface",
"litre",
"pi sq m",
"s .",
"metre",
"meter",
"inch",
]
unit_texts.extend([t + "s" for t in unit_texts])
def strip_string(string, skip_unit=False):
string = str(string).strip()
# linebreaks
string = string.replace("\n", "")
# right "."
string = string.rstrip(".")
# remove inverse spaces
# replace \\ with \
string = string.replace("\\!", "")
# string = string.replace("\\ ", "")
# string = string.replace("\\\\", "\\")
# matrix
string = re.sub(r"\\begin\{array\}\{.*?\}", r"\\begin{pmatrix}", string)
string = re.sub(r"\\end\{array\}", r"\\end{pmatrix}", string)
string = string.replace("bmatrix", "pmatrix")
# replace tfrac and dfrac with frac
string = string.replace("tfrac", "frac")
string = string.replace("dfrac", "frac")
string = (
string.replace("\\neq", "\\ne")
.replace("\\leq", "\\le")
.replace("\\geq", "\\ge")
)
# remove \left and \right
string = string.replace("\\left", "")
string = string.replace("\\right", "")
string = string.replace("\\{", "{")
string = string.replace("\\}", "}")
# Remove a trailing unit like \text{miles} or \text{dollars} if what remains is non-empty
_string = re.sub(r"\\text{.*?}$", "", string).strip()
if _string != "" and _string != string:
# print("Warning: unit not removed: '{}' -> '{}'".format(string, _string))
string = _string
if not skip_unit:
# Remove unit: texts
for _ in range(2):
for unit_text in unit_texts:
# use regex, the prefix should be either the start of the string or a non-alphanumeric character
# the suffix should be either the end of the string or a non-alphanumeric character
_string = re.sub(r"(^|\W)" + unit_text + r"($|\W)", r"\1\2", string)
if _string != "":
string = _string
# Remove circ (degrees)
string = string.replace("^{\\circ}", "")
string = string.replace("^\\circ", "")
# remove dollar signs
string = string.replace("\\$", "")
string = string.replace("$", "")
string = string.replace("\\(", "").replace("\\)", "")
# convert word number to digit
string = convert_word_number(string)
# replace "\\text{...}" to "..."
string = re.sub(r"\\text\{(.*?)\}", r"\1", string)
for key in ["x=", "y=", "z=", "x\\in", "y\\in", "z\\in", "x\\to", "y\\to", "z\\to"]:
string = string.replace(key, "")
string = string.replace("\\emptyset", r"{}")
string = string.replace("(-\\infty,\\infty)", "\\mathbb{R}")
# remove percentage
string = string.replace("\\%", "")
string = string.replace("\%", "")
string = string.replace("%", "")
# " 0." equivalent to " ." and "{0." equivalent to "{." Alternatively, add "0" if "." is the start of the string
string = string.replace(" .", " 0.")
string = string.replace("{.", "{0.")
# cdot
# string = string.replace("\\cdot", "")
if (
string.startswith("{")
and string.endswith("}")
and string.isalnum()
or string.startswith("(")
and string.endswith(")")
and string.isalnum()
or string.startswith("[")
and string.endswith("]")
and string.isalnum()
):
string = string[1:-1]
# inf
string = string.replace("infinity", "\\infty")
if "\\infty" not in string:
string = string.replace("inf", "\\infty")
string = string.replace("+\\inity", "\\infty")
# and
string = string.replace("and", "")
string = string.replace("\\mathbf", "")
# use regex to remove \mbox{...}
string = re.sub(r"\\mbox{.*?}", "", string)
# quote
string = string.replace("'", "")
string = string.replace('"', "")
# i, j
if "j" in string and "i" not in string:
string = string.replace("j", "i")
# replace a.000b where b is not number or b is end, with ab, use regex
string = re.sub(r"(\d+)\.0*([^\d])", r"\1\2", string)
string = re.sub(r"(\d+)\.0*$", r"\1", string)
# if empty, return empty string
if len(string) == 0:
return string
if string[0] == ".":
string = "0" + string
# to consider: get rid of e.g. "k = " or "q = " at beginning
if len(string.split("=")) == 2:
if len(string.split("=")[0]) <= 2:
string = string.split("=")[1]
string = _fix_sqrt(string)
string = string.replace(" ", "")
# \frac1b or \frac12 --> \frac{1}{b} and \frac{1}{2}, etc. Even works with \frac1{72} (but not \frac{72}1). Also does a/b --> \\frac{a}{b}
string = _fix_fracs(string)
# NOTE: X/Y changed to \frac{X}{Y} in dataset, but in simple cases fix in case the model output is X/Y
string = _fix_a_slash_b(string)
return string
def extract_multi_choice_answer(pred_str):
# TODO: SFT models
if "Problem:" in pred_str:
pred_str = pred_str.split("Problem:", 1)[0]
pred_str = pred_str.replace("choice is", "answer is")
patt = regex.search(r"answer is \(?(?P<ans>[abcde])\)?", pred_str.lower())
if patt is not None:
return patt.group("ans").upper()
return "placeholder"
direct_answer_trigger_for_fewshot = ("choice is", "answer is")
def choice_answer_clean(pred: str):
pred = pred.strip("\n")
# Determine if this is ICL, if so, use \n\n to split the first chunk.
ICL = False
for trigger in direct_answer_trigger_for_fewshot:
if pred.count(trigger) > 1:
ICL = True
if ICL:
pred = pred.split("\n\n")[0]
# Split the trigger to find the answer.
preds = re.split("|".join(direct_answer_trigger_for_fewshot), pred)
if len(preds) > 1:
answer_flag = True
pred = preds[-1]
else:
answer_flag = False
pred = pred.strip("\n").rstrip(".").rstrip("/").strip(" ").lstrip(":")
# Clean the answer based on the dataset
tmp = re.findall(r"\b(A|B|C|D|E)\b", pred.upper())
if tmp:
pred = tmp
else:
pred = [pred.strip().strip(".")]
if len(pred) == 0:
pred = ""
else:
if answer_flag:
# choose the first element in list ...
pred = pred[0]
else:
# choose the last element
pred = pred[-1]
# Remove the period at the end, again!
pred = pred.rstrip(".").rstrip("/")
return pred
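# Hypothetical usage sketch (illustrative only): when an answer trigger is present,
# the text after it is kept and the A-E letter is extracted.
def _demo_choice_answer_clean():
    print(choice_answer_clean("Thus the answer is (D)."))  # expected: "D"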
def find_box(pred_str: str):
ans = pred_str.split("boxed")[-1]
if not ans:
return ""
if ans[0] == "{":
stack = 1
a = ""
for c in ans[1:]:
if c == "{":
stack += 1
a += c
elif c == "}":
stack -= 1
if stack == 0:
break
a += c
else:
a += c
else:
a = ans.split("$")[0].strip()
return a
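# Hypothetical usage sketch (illustrative only): find_box returns the brace-balanced
# content of the last "\\boxed{...}" occurrence.
def _demo_find_box():
    print(find_box("So the result is \\boxed{\\frac{1}{2}}."))  # expected: "\\frac{1}{2}"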
def clean_units(pred_str: str):
"""Clean the units in the number."""
def convert_pi_to_number(code_string):
code_string = code_string.replace("\\pi", "π")
# Replace \pi or π not preceded by a digit or } with 3.14
code_string = re.sub(r"(?<![\d}])\\", "3.14", code_string)
# Replace instances where π is preceded by a digit but without a multiplication symbol, e.g., "3π" -> "3*3.14"
code_string = re.sub(r"(\d)(\\?π)", r"\1*3.14", code_string)
# Handle cases where π is within braces or followed by a multiplication symbol
# This replaces "{π}" with "3.14" directly and "3*π" with "3*3.14"
code_string = re.sub(r"\{(\\?π)\}", "3.14", code_string)
code_string = re.sub(r"\*(\\?π)", "*3.14", code_string)
return code_string
pred_str = convert_pi_to_number(pred_str)
pred_str = pred_str.replace("%", "/100")
pred_str = pred_str.replace("$", "")
pred_str = pred_str.replace("¥", "")
pred_str = pred_str.replace("°C", "")
pred_str = pred_str.replace(" C", "")
pred_str = pred_str.replace("°", "")
return pred_str
def extract_theoremqa_answer(pred: str, answer_flag: bool = True):
if any([option in pred.lower() for option in ["yes", "true"]]):
pred = "True"
elif any([option in pred.lower() for option in ["no", "false"]]):
pred = "False"
elif any(
[
option in pred.lower()
for option in ["(a)", "(b)", "(c)", "(d)", "(e)", "(f)"]
]
):
pass
else:
# Some of the models somehow get used to boxed output from pre-training
if "boxed" in pred:
pred = find_box(pred)
if answer_flag:
# Extract the numbers out of the string
pred = pred.split("=")[-1].strip()
pred = clean_units(pred)
try:
tmp = str(latex2sympy(pred))
pred = str(eval(tmp))
except Exception:
if re.match(r"-?[\d\.]+\s\D+$", pred):
pred = pred.split(" ")[0]
elif re.match(r"-?[\d\.]+\s[^\s]+$", pred):
pred = pred.split(" ")[0]
else:
# desperate search over the last number
preds = re.findall(r"-?\d*\.?\d+", pred)
if len(preds) >= 1:
pred = preds[-1]
else:
pred = ""
return pred
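# Hypothetical usage sketch (illustrative only): boolean-style answers are mapped to the
# literal strings "True"/"False" before any numeric parsing is attempted.
def _demo_extract_theoremqa_answer():
    print(extract_theoremqa_answer("Yes, the statement holds."))  # expected: "True"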
def extract_answer(pred_str, data_name, use_last_number=True):
pred_str = pred_str.replace("\u043a\u0438", "")
if data_name in ["mmlu_stem", "sat_math", "aqua", "gaokao2023"]:
# TODO check multiple choice
return choice_answer_clean(pred_str)
if "final answer is $" in pred_str and "$. I hope" in pred_str:
# minerva_math
tmp = pred_str.split("final answer is $", 1)[1]
pred = tmp.split("$. I hope", 1)[0].strip()
elif "boxed" in pred_str:
ans = pred_str.split("boxed")[-1]
if len(ans) == 0:
return ""
elif ans[0] == "{":
stack = 1
a = ""
for c in ans[1:]:
if c == "{":
stack += 1
a += c
elif c == "}":
stack -= 1
if stack == 0:
break
a += c
else:
a += c
else:
a = ans.split("$")[0].strip()
pred = a
elif "he answer is" in pred_str:
pred = pred_str.split("he answer is")[-1].strip()
elif "final answer is" in pred_str:
pred = pred_str.split("final answer is")[-1].strip()
elif "答案是" in pred_str:
# Handle Chinese few-shot multiple choice problem answer extraction
pred = pred_str.split("答案是")[1].strip().split("\n\n")[0].strip()
else: # use the last number
if use_last_number:
pattern = "-?\d*\.?\d+"
pred = re.findall(pattern, pred_str.replace(",", ""))
if len(pred) >= 1:
pred = pred[-1]
else:
pred = ""
else:
pred = ""
# choice answer
if data_name in ["sat_math", "aqua"] or "mmlu" in data_name:
tmp = re.findall(r"\b(A|B|C|D|E)\b", pred.upper())
if tmp:
pred = tmp[-1]
else:
pred = pred.strip().strip(".")
# multiple line
# pred = pred.split("\n")[0]
pred = re.sub(r"\n\s*", "", pred)
if pred != "" and pred[0] == ":":
pred = pred[1:]
if pred != "" and pred[-1] == ".":
pred = pred[:-1]
if pred != "" and pred[-1] == "/":
pred = pred[:-1]
pred = strip_string(pred, skip_unit=data_name in ["carp_en", "minerva_math"])
return pred
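# Hypothetical usage sketch (illustrative only): boxed content wins for math-style
# datasets; otherwise the last number in the text is used as a fallback.
def _demo_extract_answer():
    print(extract_answer("Therefore the final answer is \\boxed{7}.", "math"))  # expected: "7"
    print(extract_answer("We get 12 apples and 3 pears.", "math"))  # expected: "3"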
STRIP_EXCEPTIONS = ["carp_en", "minerva_math"]
def parse_ground_truth(example: Dict[str, Any], data_name):
if "gt_cot" in example and "gt" in example:
if data_name in ["math"]:
gt_ans = extract_answer(example["gt_cot"], data_name)
elif data_name in STRIP_EXCEPTIONS:
gt_ans = example["gt"]
else:
gt_ans = strip_string(example["gt"])
return example["gt_cot"], gt_ans
# parse ground truth
if data_name in ["math", "minerva_math", "math_500"]:
gt_cot = example["solution"]
gt_ans = extract_answer(gt_cot, data_name)
elif data_name == "gsm8k":
gt_cot, gt_ans = example["answer"].split("####")
elif data_name == "svamp":
gt_cot, gt_ans = example["Equation"], example["Answer"]
elif data_name == "asdiv":
gt_cot = example["formula"]
gt_ans = re.sub(r"\(.*?\)", "", example["answer"])
elif data_name == "mawps":
gt_cot, gt_ans = None, example["target"]
elif data_name == "tabmwp":
gt_cot = example["solution"]
gt_ans = example["answer"]
if example["ans_type"] in ["integer_number", "decimal_number"]:
if "/" in gt_ans:
gt_ans = int(gt_ans.split("/")[0]) / int(gt_ans.split("/")[1])
elif "," in gt_ans:
gt_ans = float(gt_ans.replace(",", ""))
elif "%" in gt_ans:
gt_ans = float(gt_ans.split("%")[0]) / 100
else:
gt_ans = float(gt_ans)
elif data_name == "carp_en":
gt_cot, gt_ans = example["steps"], example["answer"]
elif data_name == "mmlu_stem":
abcd = "ABCD"
gt_cot, gt_ans = None, abcd[example["answer"]]
elif data_name == "sat_math":
gt_cot, gt_ans = None, example["Answer"]
elif data_name == "aqua":
gt_cot, gt_ans = None, example["correct"]
elif data_name in ["gaokao2023en", "college_math", "gaokao_math_cloze"]:
gt_cot, gt_ans = None, example["answer"].replace("$", "").strip()
elif data_name == "gaokao_math_qa":
gt_cot, gt_ans = None, example["label"]
elif data_name in ["gaokao2024_mix", "cn_middle_school"]:
if len(example["choice_answer"]) > 0:
gt_cot, gt_ans = None, example["choice_answer"]
else:
gt_cot, gt_ans = None, example["answer"]
elif data_name == "olympiadbench":
gt_cot, gt_ans = None, example["final_answer"][0].strip("$")
elif data_name in [
"aime24",
"amc23",
"cmath",
"gaokao2024_I",
"gaokao2024_II",
"imo2024",
]:
gt_cot, gt_ans = None, example["answer"]
elif data_name.startswith("train"): # train_amc_aime
gt_cot, gt_ans = None, example["final_answer"]
else:
raise NotImplementedError(f"`{data_name}`")
# post process
gt_cot = str(gt_cot).strip()
if data_name not in STRIP_EXCEPTIONS:
gt_ans = strip_string(gt_ans, skip_unit=data_name == "carp_en")
else:
gt_ans = (
gt_ans.replace("\\neq", "\\ne")
.replace("\\leq", "\\le")
.replace("\\geq", "\\ge")
)
return gt_cot, gt_ans
def parse_question(example, data_name):
question = ""
if data_name == "asdiv":
question = f"{example['body'].strip()} {example['question'].strip()}"
elif data_name == "svamp":
body = example["Body"].strip()
if not body.endswith("."):
body = body + "."
question = f'{body} {example["Question"].strip()}'
elif data_name == "tabmwp":
title_str = (
f'regarding "{example["table_title"]}" ' if example["table_title"] else ""
)
question = f"Read the following table {title_str}and answer a question:\n"
question += f'{example["table"]}\n{example["question"]}'
if example["choices"]:
question += (
f' Please select from the following options: {example["choices"]}'
)
elif data_name == "carp_en":
question = example["content"]
elif data_name == "mmlu_stem":
options = example["choices"]
assert len(options) == 4
for i, (label, option) in enumerate(zip("ABCD", options)):
options[i] = f"({label}) {str(option).strip()}"
options = " ".join(options)
# question = f"{example['question'].strip()}\nWhat of the following is the right choice? Explain your answer.\n{options}"
question = f"{example['question'].strip()}\nAnswer Choices: {options}"
elif data_name == "sat_math":
options = example["options"].strip()
assert "A" == options[0]
options = "(" + options
for ch in "BCD":
if f" {ch}) " in options:
options = regex.sub(f" {ch}\) ", f" ({ch}) ", options)
# question = f"{example['question'].strip()}\nWhat of the following is the right choice? Explain your answer.\n{options.strip()}"
question = f"{example['question'].strip()}\nAnswer Choices: {options}"
elif "aqua" in data_name:
options = example["options"]
choice = "(" + "(".join(options)
choice = choice.replace("(", " (").replace(")", ") ").strip()
choice = "\nAnswer Choices: " + choice
question = example["question"].strip() + choice
elif data_name == "gaokao_math_qa":
options_dict = example["options"]
options = []
for key in options_dict:
options.append(f"({key}) {options_dict[key]}")
options = " ".join(options)
question = f"{example['question'].strip()}\n选项: {options}"
elif data_name.startswith("train"): # train_amc_aime:
question = example["prompt"]
else:
for key in ["question", "problem", "Question", "input"]:
if key in example:
question = example[key]
break
# assert question != ""
# Yes or No question
_, gt_ans = parse_ground_truth(example, data_name)
if isinstance(gt_ans, str):
gt_lower = gt_ans.lower()
if gt_lower in ["true", "false"]:
question += " (True or False)"
if gt_lower in ["yes", "no"]:
question += " (Yes or No)"
return question.strip()
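# Hypothetical usage sketch (illustrative only; the sample keys are made up): the question
# text comes from the first matching key, and a "(True or False)"/"(Yes or No)" hint is
# appended only when the parsed ground truth is boolean.
def _demo_parse_question():
    example = {"question": "What is 2 + 2?", "solution": "We add to get \\boxed{4}."}
    print(parse_question(example, "math"))  # expected: "What is 2 + 2?"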
# def run_execute(executor, result, prompt_type, data_name, execute=False):
# if not result or result == "error":
# return None, None
# report = None
# if "program_only" in prompt_type:
# prediction = extract_program_output(result)
# elif prompt_type in ["pot", "pal"] and execute:
# code = extract_program(result)
# prediction, report = executor.apply(code)
# else:
# prediction = extract_answer(result, data_name)
# # prediction = strip_string(prediction, skip_unit=data_name == "carp_en")
# prediction = strip_string(prediction, skip_unit=data_name in STRIP_EXCEPTIONS)
# return prediction, report
def _test_extract_answer():
text = """
This is still not equal to $0$, so we must have made another mistake.
When we subtracted $7$ from $\frac{386}{64}$, we should have subtracted $7 \cdot 64$ from $386$, not the other way around. Let's correct that:
\[\frac{386}{64} - 7 = \frac{386}{64} - \frac{7 \cdot 64}{1 \cdot 64} = \frac{386 - 448}{64} = \frac{-62}{64}.\]
This is still not equal to $0$, so we must have made another mistake.
When we subtracted $7$ from $\frac{386}{64}$, we should have subtracted $7 \cdot 64$ from $386$, not the other way around. Let's correct that:
\[\frac{386}{64} 025
"""
print(extract_answer(text, "math-oai", use_last_number=True))
print(choice_answer_clean("\mathrm{(D)\}1,008,016"))
# should fall back to the last number in the string ("15")
print(extract_answer("The product of the inner terms is \\($ 15x $\\).", "math"))
if __name__ == "__main__":
_test_extract_answer()

108
functioncall/math/verify.py Normal file
View File

@ -0,0 +1,108 @@
import json
import os
import time
from typing import List
from functioncall.base import logging
from functioncall.base.call import batch_function_call
logger = logging.getLogger("Functioncall")
def loadJson(dataDir):
with open(dataDir, "r") as f:
if dataDir.endswith(".jsonl"):
samples = [json.loads(line) for line in f.readlines()]
else:
samples = json.load(f)
return samples
id2info = loadJson(
os.getenv(
"REAL_MATH_MEATADATA_PATH",
"/storage/datasets/id2info.json",
)
)
def math_verify(generateds: List, query_ids: List, batch_size=20, timeout=60) -> List:
start_time = time.time()
global id2info
assert len(generateds) == len(query_ids), (
len(generateds),
len(query_ids),
)
st = time.monotonic()
query_indices = []
parameters = []
# Collect all (generated, solution) pairs with their original indices
for idx, (query_id, generated) in enumerate(zip(query_ids, generateds)):
base_query_id = query_id.split("@idx:")[0]
info = id2info[base_query_id]
for cur_solution in info["solutions"]:
parameters.append((generated, cur_solution, idx))
query_indices.append(idx)
# Process in batches
batch_args_list = []
for i in range(0, len(parameters), batch_size):
current_batch = parameters[i : i + batch_size]
batch_args = {
"answers": [g for g, _, _ in current_batch],
"solutions": [s for _, s, _ in current_batch],
}
batch_args_list.append(batch_args)
results_batch = batch_function_call(batch_args_list, "python_math", timeout)
labels = [0] * len(query_ids)
# Map results back to original indices
index = 0
for batch_idx, results in enumerate(results_batch):
if not isinstance(results, list) or len(results) == 0:
index += len(batch_args_list[batch_idx]["answers"])
logger.warning(
f"Invalid functioncall math results: {results}, batch index:{batch_idx}, query index: {index}."
)
continue
for result in results:
query_index = query_indices[index]
if (
isinstance(result, list)
and len(result) > 0
and (isinstance(result[0], int) and result[0] in [0, 1])
):
labels[query_index] = result[0] or labels[query_index]
else:
logger.warning(
f"Invalid functioncall math result: {result}, index:{index}, qeury_id: {query_ids[query_index]}."
)
index += 1
logger.info(
f"verify math with query size={len(query_ids)}, takes {time.time() - start_time:.4f} seconds, result: {labels}"
)
return labels
if __name__ == "__main__":
sample = {
"prompt": "",
"query_id": "fe11b471-1aa9-4867-958f-a0a811c85f92",
"answer": "\\boxed{-\\frac{2}{3}}",
}
start_time = time.time()
batch_size = 10000
result = math_verify(
[sample["answer"]] * batch_size, [sample["query_id"] for _ in range(batch_size)]
)
hint = f"batch_size: {batch_size}, total time : {(time.time() - start_time) * 1000:.0f} ms"
print(result)
print(hint)

View File

@ -833,6 +833,9 @@ def make_dataset(
def gather_stat(src: List[Dict]) -> Dict:
cnt, stats = {}, {}
for reply in src:
# FIXME: understand why the reply can be None
if not reply:
continue
for k, v in reply.items():
cnt[k] = cnt.get(k, 0) + 1
stats[k] = stats.get(k, 0) + v

View File

@ -161,6 +161,7 @@ def main_start(args, recover_count: int = 0):
REAL_RECOVER_RUN="1" if is_recover_run else "0",
REAL_SAVE_RECOVER_STATES="1" if save_recover_states else "0",
REAL_MATH_METADATA_PATH=os.getenv("REAL_MATH_METADATA_PATH", ""),
REAL_CODE_METADATA_PATH=os.getenv("REAL_CODE_METADATA_PATH", ""),
REAL_USE_V2_WORKER=os.getenv("REAL_USE_V2_WORKER", "0"),
)
for k, v in BASE_ENVIRONS.items():

View File

@ -0,0 +1,556 @@
# Copyright 2025 Ant Group Inc.
# Copyright 2024 Wei Fu & Zhiyu Mei
# Licensed under the Apache License, Version 2.0 (the "License").
import copy
import dataclasses
import math
import os
import pprint
from typing import *
import numpy as np
from omegaconf import DictConfig, OmegaConf
import realhf.base.logging as logging
from realhf.api.core.config import (
DatasetAbstraction,
ModelInterfaceAbstraction,
ModelInterfaceType,
)
from realhf.api.core.dfg import MFCDef, ParamReallocHook
from realhf.api.core.model_api import GenerationHyperparameters
from realhf.api.core.system_api import ExperimentConfig
from realhf.api.quickstart.dataset import PromptOnlyDatasetConfig
from realhf.api.quickstart.device_mesh import DeviceMesh, MFCConfig, RPCAllocation
from realhf.api.quickstart.entrypoint import register_quickstart_exp
from realhf.api.quickstart.model import ModelTrainEvalConfig, ParallelismConfig
from realhf.experiments.common.common import CommonExperimentConfig
from realhf.experiments.common.utils import resolve_replica_ids, resolve_rpc_hooks
logger = logging.getLogger("PPO Code exp", "colored")
@dataclasses.dataclass
class PPOHyperparameters:
"""Configuration of PPO hyperparameters.
:param gen: Generation hyperparameters.
:type gen: GenerationHyperparameters
:param ppo_n_minibatches: Number of minibatches in each PPO update.
:type ppo_n_minibatches: int
:param kl_ctl: Coefficient of KL divergence rewards.
:type kl_ctl: float
:param discount: Discount factor.
:type discount: float
:param gae_lambda: Lambda factor in GAE.
:type gae_lambda: float
:param eps_clip: PPO actor probability ratio clipping factor.
:type eps_clip: float
:param value_eps_clip: PPO value clipping factor.
:type value_eps_clip: float
:param max_reward_clip: Maximum reward value.
:type max_reward_clip: float
:param reward_output_scaling: Scaling factor of the reward model output.
:type reward_output_scaling: float
:param reward_output_bias: Bias of the reward model output.
The number output by the reward model will be
CLIP((x - bias) * scaling, -max_reward_clip, max_reward_clip).
:type reward_output_bias: float
:param early_stop_imp_ratio: PPO update will be early stopped if importance ratio
exceeds this maximum value.
:type early_stop_imp_ratio: float
:param use_adaptive_kl_ctl: Whether to use adaptive KL divergence coefficient.
:type use_adaptive_kl_ctl: bool
:param adv_norm: Whether to use advantage normalization.
:type adv_norm: bool
:param value_norm: Whether to denormalize values and normalize return predictions.
:type value_norm: bool
:param value_norm_type: Type of value normalization.
Either exponential moving average ("exp") or moving average ("ma").
:type value_norm_type: str
:param value_norm_beta: Exponential decay factor
in exponential moving average.
:type value_norm_beta: float
:param value_norm_eps: Epsilon factor in the
denominator of exponential moving average.
:type value_norm_eps: float
:param disable_value: A shortcut option to disable the critic model.
:type disable_value: bool
"""
gen: GenerationHyperparameters = dataclasses.field(
default_factory=GenerationHyperparameters
)
ppo_n_minibatches: int = 4
kl_ctl: float = 0.1
discount: float = 1.0
gae_lambda: float = 1.0
eps_clip: float = 0.2
value_eps_clip: float = 0.2
disable_value: bool = False
max_reward_clip: float = 20.0
reward_output_scaling: float = 1.0
reward_output_bias: float = 0.0
early_stop_imp_ratio: float = 5.0
use_adaptive_kl_ctl: bool = False
adv_norm: bool = True
value_norm: bool = True
value_norm_type: str = dataclasses.field(
metadata={"choices": ["exp", "ma"]}, default="exp"
)
value_norm_beta: float = 0.99995
value_norm_eps: float = 1e-5
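# Hypothetical helper (illustrative only, mirroring the docstring above): the raw reward x
# is assumed to be post-processed as CLIP((x - bias) * scaling, -max_reward_clip, max_reward_clip).
def _demo_reward_postprocess(x: float, ppo: "PPOHyperparameters") -> float:
    v = (x - ppo.reward_output_bias) * ppo.reward_output_scaling
    return max(-ppo.max_reward_clip, min(ppo.max_reward_clip, v))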
@dataclasses.dataclass
class PPOCODEConfig(CommonExperimentConfig):
"""PPO experiment configuration.
It is a subclass of :class:`CommonExperimentConfig`,
so all CLI options in the base class are available.
We don't implement runtime evaluation for PPO.
We identify that the RLHF process is composed of four
distinct models with independent parameters and six
*model function calls* upon these models.
The four models are\:
- Actor\: The primary LLM that generates text.
- Critic\: The value function that estimates the value of a state.
- Ref\: The reference LLM that provides KL regularization.
- Rew\: The reward model that provides reward signals.
The four model function calls and their dependencies are\:
- Rollout\: Generate text from the actor model.
- InfReward\: Infer rewards from the reward model given generated text.
- InfRef\: Infer log probabilities from the reference model given generated text.
- InfValues\: Infer values from the critic model given generated text.
- TrainActor\: Train the actor model given generated text, rewards, values, and reference log probabilities.
- TrainCritic\: Train the critic model given generated text, rewards, values, and reference log probabilities.
This class resolves these dependencies under the hood.
What the users should specify are the runtime configurations
of models and allocations of *each model function call*.
:param actor: Runtime configuration of the primary LLM.
:type actor: ModelTrainEvalConfig
:param critic: Runtime configuration of the critic model of PPO.
:type critic: ModelTrainEvalConfig
:param ref: Runtime configuration of the reference LLM.
:type ref: ModelTrainEvalConfig
:param rew: Runtime configuration of the reward LLM.
:type rew: ModelTrainEvalConfig
:param actor_train: :class:`MFCConfig` for TrainActor.
:type actor_train: MFCConfig
:param critic_train: :class:`MFCConfig` for TrainCritic.
:type critic_train: MFCConfig
:param actor_gen: :class:`MFCConfig` for Rollout.
:type actor_gen: MFCConfig
:param critic_inf: :class:`MFCConfig` for InfValues.
:type critic_inf: MFCConfig
:param rew_inf: :class:`MFCConfig` for InfReward.
:type rew_inf: MFCConfig
:param ref_inf: :class:`MFCConfig` for InfRef.
:type ref_inf: MFCConfig
:param dataset: Dataset configuration.
:type dataset: PromptOnlyDatasetConfig
:param ppo: Configuration for the PPO algorithm.
:type ppo: PPOHyperparameters
:param group_size: The number of answers retained for each prompt.
:type group_size: int
:param generation_size: The number of answers sampled for each prompt.
Among them, only `group_size` samples are retained according to
the reward score, aka best-of-n sampling. If None, use `group_size`.
:type generation_size: Optional[int]
:param mask_no_eos_with_zero: Whether to mask out the reward if an
answer is truncated due to exceeding the length limit.
:type mask_no_eos_with_zero: bool
:param mask_too_long: Whether to mask out the PPO loss if an
answer is truncated due to exceeding the length limit.
:type mask_too_long: bool
:param check_verifier_status: If True, raise an error
when the reward is all-zero. This usually indicates a bug
in the verifier.
:type check_verifier_status: bool
:param group_adv_norm: Whether to use grouped advantage
normalization in GRPO.
:type group_adv_norm: bool
"""
actor: ModelTrainEvalConfig = dataclasses.field(
default_factory=ModelTrainEvalConfig
)
critic: ModelTrainEvalConfig = dataclasses.field(
default_factory=ModelTrainEvalConfig
)
ref: ModelTrainEvalConfig = dataclasses.field(default_factory=ModelTrainEvalConfig)
rew: ModelTrainEvalConfig = dataclasses.field(default_factory=ModelTrainEvalConfig)
# for manual allocation only
actor_train: MFCConfig = dataclasses.field(default_factory=MFCConfig)
critic_train: MFCConfig = dataclasses.field(default_factory=MFCConfig)
actor_gen: MFCConfig = dataclasses.field(default_factory=MFCConfig)
critic_inf: MFCConfig = dataclasses.field(default_factory=MFCConfig)
rew_inf: MFCConfig = dataclasses.field(default_factory=MFCConfig)
ref_inf: MFCConfig = dataclasses.field(default_factory=MFCConfig)
dataset: PromptOnlyDatasetConfig = dataclasses.field(
default_factory=PromptOnlyDatasetConfig
)
ppo: PPOHyperparameters = dataclasses.field(default_factory=PPOHyperparameters)
group_size: int = 1
generation_size: Optional[int] = None
mask_no_eos_with_zero: bool = False
rm_output_scaling: Optional[float] = None
ref_ema_eta: Optional[float] = None
group_adv_norm: bool = False
mask_too_long: bool = False
rw_type: Optional[str] = "sparse"
task: str = "code"
check_xml_format: bool = False
use_dense_reward: bool = False
reward_delta: bool = True
check_verifier_status: bool = False
dataset_filter_threshold: float = 100.0
dataset_max_filter_percentage: float = 0.0
def __post_init__(self):
self.ppo_kwargs = dict(
n_minibatches=self.ppo.ppo_n_minibatches,
kl_ctl=self.ppo.kl_ctl,
discount=self.ppo.discount,
gae_lambda=self.ppo.gae_lambda,
eps_clip=self.ppo.eps_clip,
value_eps_clip=self.ppo.value_eps_clip,
max_reward_clip=self.ppo.max_reward_clip,
adaptive_kl_ctl=self.ppo.use_adaptive_kl_ctl,
value_norm=self.ppo.value_norm,
value_norm_type=self.ppo.value_norm_type,
value_norm_beta=self.ppo.value_norm_beta,
value_norm_eps=self.ppo.value_norm_eps,
disable_value=self.ppo.disable_value,
mask_no_eos_with_zero=self.mask_no_eos_with_zero,
)
if self.rm_output_scaling is None:
self.rm_output_scaling = self.ppo.reward_output_scaling
@property
def models(self) -> Dict[str, ModelTrainEvalConfig]:
# role to config
if self.ppo.disable_value:
return {
"actor": self.actor,
# "critic": self.critic,
"ref": self.ref,
"reward": self.rew,
}
else:
return {
"actor": self.actor,
"critic": self.critic,
"ref": self.ref,
"reward": self.rew,
}
@property
def rpcs(self):
if (
self.dataset.max_prompt_len + self.ppo.gen.max_new_tokens
> self.actor.vllm.max_seq_len_to_capture
):
raise RuntimeError(
f"vllm max seq len to capture {self.actor.vllm.max_seq_len_to_capture} is "
f"smaller than the prompt length + generation length {self.dataset.max_prompt_len + self.ppo.gen.max_new_tokens}"
)
if not os.path.exists(os.getenv("REAL_CODE_METADATA_PATH")):
raise RuntimeError(
"Dataset json path REAL_CODE_METADATA_PATH does not exist."
)
# interfaces
actor_interface = ModelInterfaceAbstraction(
"ppo_actor",
args={
**copy.deepcopy(self.ppo_kwargs),
# NOTE: to_container converts the object to a dict
# It is used for unifying the profiling API, which requires to
# pass external interface configurations in the launch command.
# Customized dataclass objects will not work in that case.
"generation_config": (
OmegaConf.to_container(self.ppo.gen, resolve=True)
if isinstance(self.ppo.gen, (OmegaConf, DictConfig))
else dataclasses.asdict(self.ppo.gen)
),
"early_stop_imp_ratio": self.ppo.early_stop_imp_ratio,
"adv_norm": self.ppo.adv_norm,
"group_size": self.group_size,
"generation_size": self.generation_size,
"group_adv_norm": self.group_adv_norm,
"mask_too_long": self.mask_too_long,
"use_dense_reward": self.use_dense_reward,
"reward_delta": self.reward_delta,
},
)
ref_interface = copy.deepcopy(actor_interface)
ref_interface.args["enable_save"] = False
critic_interface = ModelInterfaceAbstraction(
"ppo_critic",
args={
**copy.deepcopy(self.ppo_kwargs),
"group_size": self.group_size,
"mask_too_long": self.mask_too_long,
"use_dense_reward": self.use_dense_reward,
"reward_delta": self.reward_delta,
},
)
critic_interface.args.pop("eps_clip")
rw_interface = ModelInterfaceAbstraction(
"rw_code",
args=dict(
rw_type=self.rw_type,
task=self.task,
check_xml_format=self.check_xml_format,
tokenizer_path=self.actor.path,
enable_save=False,
output_scaling=self.ppo.reward_output_scaling,
rm_output_scaling=self.rm_output_scaling,
output_bias=self.ppo.reward_output_bias,
group_size=self.group_size,
check_verifier_status=self.check_verifier_status,
max_sync_length=self.ppo.gen.max_new_tokens
+ self.dataset.max_prompt_len
+ 128,
),
)
rollout = MFCDef(
name="actor_gen",
model_name="actor",
mb_spec=self.actor_gen.mb_spec,
interface_type=ModelInterfaceType.GENERATE,
model_type=self.actor.type,
model_path=self.actor.path,
interface_impl=actor_interface,
input_keys=["packed_prompts"],
output_keys=[
"seq_no_eos_mask",
"packed_input_ids",
"packed_logprobs",
"prompt_mask",
],
n_seqs=self.dataset.train_bs_n_seqs,
)
inf_reward = MFCDef(
name="rew_inf",
model_name="reward",
mb_spec=self.rew_inf.mb_spec,
interface_type=ModelInterfaceType.INFERENCE,
interface_impl=rw_interface,
model_type=self.rew.type,
model_path=self.rew.path,
min_n_seqs_per_pass=1 / self.group_size,
input_keys=["packed_input_ids", "packed_prompts"],
output_keys=["rewards", "dense_rewards"],
n_seqs=self.dataset.train_bs_n_seqs,
)
inf_ref_inputs = ["packed_input_ids"]
inf_ref_logits = MFCDef(
name="ref_inf",
model_name="ref",
mb_spec=self.ref_inf.mb_spec,
interface_type=ModelInterfaceType.INFERENCE,
model_type=self.ref.type,
model_path=self.ref.path,
interface_impl=ref_interface,
min_n_seqs_per_pass=1 / self.group_size,
input_keys=inf_ref_inputs,
output_keys=["packed_ref_logprobs"],
n_seqs=self.dataset.train_bs_n_seqs,
)
inf_values = MFCDef(
name="critic_inf",
model_name="critic",
mb_spec=self.critic_inf.mb_spec,
interface_type=ModelInterfaceType.INFERENCE,
interface_impl=critic_interface,
model_type=self.critic.type,
model_path=self.critic.path,
min_n_seqs_per_pass=1 / self.group_size,
input_keys=["packed_input_ids", "seq_no_eos_mask"],
output_keys=["values"],
n_seqs=self.dataset.train_bs_n_seqs,
)
train_actor_inputs = [
"packed_input_ids",
"packed_logprobs",
"packed_ref_logprobs",
"rewards",
"dense_rewards",
"values",
"prompt_mask",
"seq_no_eos_mask",
]
if self.ppo.disable_value:
train_actor_inputs.remove("values")
train_actor = MFCDef(
name="actor_train",
model_name="actor",
mb_spec=self.actor_train.mb_spec,
interface_type=ModelInterfaceType.TRAIN_STEP,
model_type=self.actor.type,
model_path=self.actor.path,
interface_impl=actor_interface,
input_keys=train_actor_inputs,
log_return_value=True,
min_n_seqs_per_pass=self.ppo.ppo_n_minibatches / self.group_size,
n_seqs=self.dataset.train_bs_n_seqs,
)
train_critic = MFCDef(
name="critic_train",
model_name="critic",
mb_spec=self.critic_train.mb_spec,
interface_type=ModelInterfaceType.TRAIN_STEP,
interface_impl=critic_interface,
model_type=self.critic.type,
model_path=self.critic.path,
input_keys=[
"packed_input_ids",
"packed_logprobs",
"packed_ref_logprobs",
"rewards",
"dense_rewards",
"values",
"prompt_mask",
"seq_no_eos_mask",
],
log_return_value=True,
min_n_seqs_per_pass=self.ppo.ppo_n_minibatches / self.group_size,
n_seqs=self.dataset.train_bs_n_seqs,
)
if self.ppo.disable_value:
return {
"actor_gen": rollout,
"actor_train": train_actor,
# "critic_inf": inf_values,
# "critic_train": train_critic,
"ref_inf": inf_ref_logits,
"rew_inf": inf_reward,
}
else:
return {
"actor_gen": rollout,
"actor_train": train_actor,
"critic_inf": inf_values,
"critic_train": train_critic,
"ref_inf": inf_ref_logits,
"rew_inf": inf_reward,
}
@property
def allocations(self):
if self.ppo.disable_value:
return {
"actor_gen": self.actor_gen,
"actor_train": self.actor_train,
# "critic_inf": self.critic_inf,
# "critic_train": self.critic_train,
"ref_inf": self.ref_inf,
"rew_inf": self.rew_inf,
}
else:
return {
"actor_gen": self.actor_gen,
"actor_train": self.actor_train,
"critic_inf": self.critic_inf,
"critic_train": self.critic_train,
"ref_inf": self.ref_inf,
"rew_inf": self.rew_inf,
}
@property
def datasets(self):
return [
DatasetAbstraction(
"code_prompt",
args=dict(
dataset_path=self.dataset.path,
max_length=self.dataset.max_prompt_len,
fill_to_max_length=self.dataset.fill_to_max_length,
filter_threshold=self.dataset_filter_threshold,
max_filter_percentage=self.dataset_max_filter_percentage,
),
)
]
@property
def tokenizer_name_or_path(self) -> str:
return self.actor.path
@property
def search_kwargs(self):
return {
"num_gen_tokens": self.ppo.gen.max_new_tokens,
"n_ppo_minibatches": self.ppo.ppo_n_minibatches,
"seq_len": self.dataset.max_prompt_len,
}
@property
def max_prompt_len(self):
return self.dataset.max_prompt_len
def initial_setup(self) -> ExperimentConfig:
rpc_allocs = self._get_rpc_allocations()
resolve_replica_ids(rpc_allocs, self.models)
resolve_rpc_hooks(
rpc_allocs, self.models
) # inplace modify MFCDefs in rpc allocations
pprint.pprint(rpc_allocs)
######### update ref model using ema, ref_ema_eta = 0 means fixed ref model #########
def _find_rpc(name):
return next(alloc.rpc for alloc in rpc_allocs if alloc.rpc.name == name)
# Remove the offload hook of ref_inf, because
# we need to receive parameters from peer GPUs and update it immediately.
if self.ref_ema_eta is not None:
ref_inf = _find_rpc("ref_inf")
ref_inf._post_hooks = []
# Add an unidirectional parameter reallocation hook.
actor_train = _find_rpc("actor_train")
actor_train.add_post_hook(
ParamReallocHook(
target=ref_inf.model_name,
eta=self.ref_ema_eta,
)
)
######### The main difference from normal PPO #########
model_worker = self._get_model_worker_configs(rpc_allocs)
return ExperimentConfig(
exp_ctrl=self.exp_ctrl,
wandb=self.wandb,
model_rpcs=[rpc_alloc.rpc for rpc_alloc in rpc_allocs],
model_worker=model_worker,
)
register_quickstart_exp("ppo-code", PPOCODEConfig)

View File

@ -220,9 +220,150 @@ class MATHPromptDataset(torch.utils.data.Dataset):
)
class CODEPromptDataset(torch.utils.data.Dataset):
def __init__(
self,
util: data_api.DatasetUtility,
max_length: Optional[int] = None,
dataset_path: Optional[str] = None,
dataset_builder: Optional[Callable[[], List[Dict]]] = None,
fill_to_max_length: bool = False,
filter_threshold: float = 1e4,
max_filter_percentage: float = 0.0,
):
"""A dataset with prompts. Usually used for PPO.
Args:
util (api.data.DatasetUtility): Dataset utility handle (tokenizer, seed, DP rank, etc.).
max_length (Optional[int], optional): The maximum length of each sequence in the batch.
dataset_path (Optional[str], optional): Path to the dataset json/jsonl file.
The json/jsonl file should be a list of dictionaries. Each element in the list should have
the keys "question" and "id" (and optionally "scores"). Defaults to None.
dataset_builder (Optional[Callable[[], List[Dict]]], optional): Alternative to dataset_path.
A callable that returns a list of dictionary. Defaults to None.
"""
self._util = util
self.max_length = max_length
data = data_api.load_shuffle_split_dataset(util, dataset_path, dataset_builder)
prompts_str = [x["question"] for x in data]
self.ids = [x["id"] for x in data]
if "scores" in data[0]:
self.base_scores = [np.mean(x["scores"]) for x in data]
util.tokenizer.padding_side = "left"
prompt_encodings = util.tokenizer(
prompts_str,
truncation=True,
# max_length=max_length,
padding=False,
return_length=True,
return_attention_mask=False,
)
if fill_to_max_length:
for i in range(len(prompt_encodings["length"])):
x = prompt_encodings["input_ids"][i]
if max_length > len(x):
# Fill with the final non-pad token to the left.
prompt_encodings["input_ids"][i] = [x[-1]] * (
max_length - len(x)
) + x
prompt_encodings["length"][i] = max_length
logger.info(f"{len(data)} samples, checking lengths (max_length={max_length})")
indices = [
i for i, x in enumerate(prompt_encodings["length"]) if x <= max_length
]
logger.info(f"{len(indices)} samples remain")
self.prompt_lengths = [int(prompt_encodings["length"][idx]) for idx in indices]
self.prompts = [prompt_encodings["input_ids"][idx] for idx in indices]
self.ids = [
str(self.ids[idx]) + f"@idx:{idx}-{util.dp_rank}" for idx in indices
]
if "scores" in data[0]:
self.base_scores = [self.base_scores[idx] for idx in indices]
assert all(len(x) == l for x, l in zip(self.prompts, self.prompt_lengths))
logger.info(f"Number of prompts in the dataset: {len(self.prompts)}")
self.active_indices = list(range(len(self.prompts)))
self.filter_threshold = filter_threshold
self.max_filter_percentage = max_filter_percentage
@property
def util(self):
return self._util
def __len__(self):
return len(self.active_indices)
def __getitem__(self, idx):
# print(self.base_scores)
print(
f"CODEPromptDataset idx{idx}, use idx: {self.active_indices[idx]}, size: {len(self.ids)}"
)
idx = self.active_indices[idx]
if hasattr(self, "base_scores"):
return data_api.SequenceSample.from_default(
ids=[self.ids[idx]],
seqlens=[self.prompt_lengths[idx]],
data=dict(
packed_prompts=torch.tensor(self.prompts[idx], dtype=torch.long),
base_scores=torch.tensor(
[self.base_scores[idx]], dtype=torch.float32
),
),
metadata=dict(random_id=[uuid.uuid4()]),
)
else:
return data_api.SequenceSample.from_default(
ids=[self.ids[idx]],
seqlens=[self.prompt_lengths[idx]],
data=dict(
packed_prompts=torch.tensor(self.prompts[idx], dtype=torch.long)
),
metadata=dict(random_id=[uuid.uuid4()]),
)
def filter(self, eval_scores: Dict[Hashable, float]):
# Get all data indices that have a higher score than the threshold.
idx2scores_to_remove = {}
for pop_idx, idx in enumerate(self.active_indices):
data_id = self.ids[idx]
if data_id not in eval_scores:
continue
if eval_scores[data_id] > self.filter_threshold:
idx2scores_to_remove[pop_idx] = eval_scores[data_id]
# Control the number of samples to be removed according to max_filter_percentage.
n = int(len(self.active_indices) * self.max_filter_percentage)
indices_to_remove = sorted(
idx2scores_to_remove.keys(),
key=lambda x: idx2scores_to_remove[x],
reverse=True,
)[:n]
for pop_idx in sorted(indices_to_remove, reverse=True):
self.active_indices.pop(pop_idx)
logger.info(
f"Code prompt dataset DP rank {self.util.dp_rank} filtered"
f" {len(indices_to_remove)} samples, {len(self.active_indices)} samples remain. "
f"Original dataset size: {len(self.prompts)}. "
f"Filter threshold: {self.filter_threshold}. "
f"Max filter percentage: {self.max_filter_percentage}. "
f"Current number of eval scores: {len(eval_scores)}."
)
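# Hypothetical usage sketch (illustrative only; the query id is made up): `filter` removes
# at most int(len(active_indices) * max_filter_percentage) of the highest-scoring prompts
# whose eval score exceeds filter_threshold.
def _demo_filter(dataset: "CODEPromptDataset"):
    dataset.filter({"123@idx:0-0": 1.5})  # dropped only if the threshold and cap allow it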
if not __name__ == "__main__":
data_api.register_dataset("prompt", PromptDataset)
data_api.register_dataset("math_prompt", MATHPromptDataset)
data_api.register_dataset("code_prompt", CODEPromptDataset)
else:
from transformers import AutoTokenizer
@ -238,3 +379,21 @@ else:
max_length=512,
dataset_path="/storage/openpsi/data/math/Qwen_RL_training_xss/0101/train_rl@0101_with_qwqsft-7b_score.jsonl",
)
dataset = CODEPromptDataset(
data_api.DatasetUtility(
seed=0,
dp_rank=0,
world_size=1,
tokenizer=AutoTokenizer.from_pretrained(
"/storage/openpsi/models/Qwen__Qwen2-1.5B-Instruct/"
),
),
max_length=512,
dataset_path="/storage/datasets/codeparrot-apps-test.jsonl",
)
data_generator = enumerate(dataset)
dataset_batch_counter, cur_sample = next(data_generator)
print(
f"size: {len(dataset)}, index: {dataset_batch_counter}, cur_sample: {cur_sample}"
)

View File

@ -0,0 +1,429 @@
# Copyright 2025 Ant Group Inc.
import collections
import dataclasses
import html
import json
import os
import re
import xml.etree.ElementTree as ET
from ast import parse
from concurrent.futures import ThreadPoolExecutor, as_completed
import colorama
import numpy as np
import torch
import torch.distributed as dist
import realhf.api.core.model_api as model_api
import realhf.base.logging as logging
from functioncall.code.verify import code_verify
from realhf.api.core.data_api import SequenceSample, load_hf_tokenizer
from realhf.base import constants
logger = logging.getLogger("Packed Reward Modeling Interface", "benchmark")
class CodeVerifierException(Exception):
pass
def extract_python_code(text, min_length=20, strict_syntax=True):
code_pattern = r"(?i)```(?:python|py)?\s*\n?(.*?)\n?```"
code_blocks = re.findall(code_pattern, text, re.DOTALL)
valid_blocks = []
for block in code_blocks:
clean_block = block.strip()
if len(clean_block) < min_length:
continue
# verify code syntax
if strict_syntax:
try:
parse(clean_block, mode="exec")
except (SyntaxError, IndentationError):
continue
valid_blocks.append(clean_block)
if not valid_blocks:
return None
# return the last code block
return valid_blocks[-1]
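# Hypothetical usage sketch (illustrative only): only fenced blocks that are at least
# `min_length` characters long and syntactically valid are kept; the last one wins.
def _demo_extract_python_code():
    fence = "`" * 3
    text = fence + "python\nprint('hello from a sufficiently long block')\n" + fence
    print(extract_python_code(text))  # expected: the print statement between the fences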
def check_with_elementtree(text):
def escape_between_tags(text, tags=["think", "answer"]):
"""转义标签之间的内容,但保留标签本身."""
# 构建标签模式
tag_pattern = "|".join(tags)
parts = []
current_pos = 0
# Match the opening and closing tags
pattern = f"</?({tag_pattern})[^>]*>"
for match in re.finditer(pattern, text):
# Append the content before the tag (escaped)
if current_pos < match.start():
parts.append(html.escape(text[current_pos : match.start()]))
# Append the tag itself (not escaped)
parts.append(match.group())
current_pos = match.end()
# Append the remaining trailing content
if current_pos < len(text):
parts.append(html.escape(text[current_pos:]))
return "".join(parts)
text = escape_between_tags(text)
if not text.strip().startswith("<think>"):
text = "<think>" + text
try:
xml_text = f"<root>{text}</root>"
x = ET.fromstring(xml_text)
if x.text is not None and x.text.strip() != "":
return False, f"Error: extra content before <think>. {x.text}"
if len(x) != 2:
return False, f"Error: there are {len(x)} tags."
if x[0].tag is None or x[0].tag != "think":
return False, f"Error: <think> tag is missing. {x[0].tag}"
if x[0].tail is not None and x[0].tail.strip() != "":
return (
False,
f"Error: extra content between <think> and <answer>. {x[0].tail}",
)
if x[1].tag is None or x[1].tag != "answer":
return False, f"Error: <answer> tag is missing. {x[1].tag}"
if x[1].tail is not None and x[1].tail.strip() != "":
return False, f"Error: extra content after <answer>, {x[1].tail}"
return True, x[1].text if x[1].text is not None else ""
except ET.ParseError as e:
return False, f"Error: XML格式错误, {str(e)}"
def retokenize(
task,
tokenizer,
packed_input_ids,
input_cu_seqlens,
prompts,
prompt_cu_seqlens,
query_ids,
check_xml_format=False,
do_eval=False,
):
input_ids = [
packed_input_ids[start:end]
for start, end in zip(input_cu_seqlens[:-1], input_cu_seqlens[1:])
]
prompt_ids = [
prompts[start:end]
for start, end in zip(prompt_cu_seqlens[:-1], prompt_cu_seqlens[1:])
]
seq_strs = tokenizer.batch_decode(
input_ids, clean_up_tokenization_spaces=False, skip_special_tokens=True
)
prompt_strs = tokenizer.batch_decode(
prompt_ids, clean_up_tokenization_spaces=False, skip_special_tokens=True
)
# query_id_strs = query_ids
query_id_strs = [query_id.split("@")[0] for query_id in query_ids]
format_rewards = []
queryid_to_results = collections.defaultdict(list)
# 8 processes on each node, with 10 subprocesses each
if do_eval:
_answers = [
seq_str.split(prompt_str)[1]
for seq_str, prompt_str in zip(seq_strs, prompt_strs)
]
codes = [extract_python_code(_answer) for _answer in _answers]
logger.info(
f"code_rw_interface, size: {len(query_id_strs)}, valid code size: {len(codes)}, query_id_0: {query_id_strs[0]}"
)
format_rewards = code_verify(codes, query_id_strs)
if check_xml_format:
with ThreadPoolExecutor(max_workers=22) as executor:
futures = [
executor.submit(check_with_elementtree, answer_str)
for answer_str in _answers
]
# xml_rewards = []
for idx, future in enumerate(futures):
xml_reward, _ = future.result()
# xml_rewards.append(xml_reward)
if xml_reward == 1 and format_rewards[idx] == 0:
format_rewards[idx] = -0.8
elif xml_reward == 0 and format_rewards[idx] == 0:
format_rewards[idx] = -1
for query_id_str, format_reward in zip(query_id_strs, format_rewards):
if query_id_str not in queryid_to_results:
queryid_to_results[query_id_str] = []
queryid_to_results[query_id_str].append(format_reward)
else:
for query_id_str in query_id_strs:
if query_id_str not in queryid_to_results:
queryid_to_results[query_id_str] = []
queryid_to_results[query_id_str].append(0)
format_rewards.append(0)
return format_rewards, prompt_strs, prompt_ids, seq_strs, queryid_to_results
@dataclasses.dataclass
class PackedCodeRewardInterface(model_api.ModelInterface):
enable_save: bool = False
tokenizer_path: str = "/storage/models/Qwen__Qwen2.5-1.5B"
output_scaling: float = 1.0
rm_output_scaling: float = 1.0
rm_output_bias: float = 0.0
output_bias: float = 0.0
loss_fun = torch.nn.CrossEntropyLoss(reduction="none")
max_sync_length: int = 2048
rw_type: str = "sparse"
task: str = "code" # math or countdown or code
check_xml_format: bool = False
post_process: str = "sigmoid"
group_size: int = 1
check_verifier_status: bool = False
_call_count: int = 0
def __post_init__(self):
self.tokenizer = load_hf_tokenizer(self.tokenizer_path)
logger.info(f"rm_output_scaling: {self.rm_output_scaling}")
logger.info(f"rm_output_bias: {self.rm_output_bias}")
logger.info(f"output_scaling: {self.output_scaling}")
logger.info(f"output_bias: {self.output_bias}")
logger.info(f"max_sync_length: {self.max_sync_length}")
logger.info(f"rw_type: {self.rw_type}")
logger.info(f"post_process: {self.post_process}")
while True:
gen_file_path = os.path.join(
constants.LOG_ROOT,
constants.experiment_name(),
constants.trial_name(),
"generated",
f"v{self._call_count}r{dist.get_rank()}.txt",
)
if os.path.exists(gen_file_path):
self._call_count += 1
else:
break
logger.info(f"call_count: {self._call_count}")
def inference(
self,
model: model_api.Model,
data_: SequenceSample,
mb_spec,
) -> SequenceSample:
packed_input_ids: torch.Tensor = data_.data["packed_input_ids"].squeeze()
input_seqlens = torch.tensor(data_.seqlens["packed_input_ids"]).view(-1)
input_cu_seqlens = torch.nn.functional.pad(
input_seqlens.cumsum(0), (1, 0)
).int()
packed_prompts = data_.data["packed_prompts"]
prompts = []
prompt_seqlens = []
offset = 0
for x in data_.seqlens["packed_prompts"]:
prompts += [packed_prompts[offset : offset + x[0]]] * self.group_size
offset += x[0]
prompt_seqlens.extend(x * self.group_size)
assert offset == sum(x[0] for x in data_.seqlens["packed_prompts"])
# non_packed_prompts = copy.deepcopy(prompts)
prompts = torch.cat(prompts)
prompt_seqlens = torch.tensor(prompt_seqlens).view(-1)
prompt_cu_seqlens = torch.nn.functional.pad(
prompt_seqlens.cumsum(0), (1, 0)
).int()
query_ids = [data_id for data_id in data_.ids for _ in range(self.group_size)]
format_rewards, prompt_strs, prompt_ids, seq_strs, queryid_to_results = (
retokenize(
self.task,
self.tokenizer,
packed_input_ids,
input_cu_seqlens,
prompts,
prompt_cu_seqlens,
query_ids,
check_xml_format=self.check_xml_format,
do_eval=constants.is_last_pipe_stage(),
)
)
assert self.rw_type == "sparse"
dense_scores = torch.zeros_like(packed_input_ids).float()
scores = torch.FloatTensor(format_rewards).to(packed_input_ids.device)
scores[scores == 0] = -1
if len(scores) == 0:
return None
assert dense_scores.shape == packed_input_ids.shape
scores = (
scores.to(packed_input_ids.device) - self.output_bias
) * self.output_scaling
logger.info(f"Code reward logging info @v{model.version.global_step}")
logger.info(
f"before: Format success rate: {torch.FloatTensor(format_rewards).mean().item()}"
)
logger.info(
f"number of samples: {len(scores)}, {scores.shape}, group_size: {self.group_size}"
)
gen_file_path = os.path.join(
constants.LOG_ROOT,
constants.experiment_name(),
constants.trial_name(),
"generated",
f"v{self._call_count}r{dist.get_rank()}.txt",
)
os.makedirs(os.path.dirname(gen_file_path), exist_ok=True)
logger.info(f"Generated samples and rewards will be dumped to: {gen_file_path}")
with open(gen_file_path, "w") as _f:
for idx, (score, prompt_str, seq_str) in enumerate(
zip(scores, prompt_strs, seq_strs)
):
info = "\n".join(
[
f"idx: {idx} / {len(scores)}",
f"reward is {score.item()}, prompt is {colorama.Fore.YELLOW + colorama.Style.DIM}{prompt_str}{colorama.Style.RESET_ALL}",
f"sequence is: {colorama.Fore.YELLOW + colorama.Style.DIM}{seq_str.split(prompt_str)[1]}{colorama.Style.RESET_ALL}.",
]
)
_f.write(info + "\n")
gen_file_path = os.path.join(
constants.LOG_ROOT,
constants.experiment_name(),
constants.trial_name(),
"generated_jsonl",
f"v{self._call_count}r{dist.get_rank()}.jsonl",
)
os.makedirs(os.path.dirname(gen_file_path), exist_ok=True)
logger.info(f"Generated samples and rewards will be dumped to: {gen_file_path}")
with open(gen_file_path, "w") as _f:
for idx, (score, prompt_str, seq_str) in enumerate(
zip(scores, prompt_strs, seq_strs)
):
_f.write(
json.dumps(
{
"prompt": prompt_str,
"generated": seq_str.split(prompt_str)[1],
"reward": score.item(),
},
ensure_ascii=False,
)
+ "\n"
)
logger.info(
f"Format success rate: {torch.FloatTensor(format_rewards).mean().item()}"
)
pass_at_k = np.mean(
[sum([xx == 1 for xx in x]) > 0 for x in queryid_to_results.values()]
)
avg_num_samples = np.mean([len(x) for x in queryid_to_results.values()])
logger.info(f"pass@k: {pass_at_k}, num_samples: {avg_num_samples}")
logger.info(f"number of samples: {len(scores)}, {scores.shape}")
logger.info(f"reward: {sum(scores) / len(scores)}")
train_pass_monitor_file_path = os.path.join(
constants.LOG_ROOT,
constants.experiment_name(),
constants.trial_name(),
"training_monitor",
f"v{self._call_count}r{dist.get_rank()}.jsonl",
)
os.makedirs(os.path.dirname(train_pass_monitor_file_path), exist_ok=True)
logger.info(
f"pass monitor result will be dumped to: {train_pass_monitor_file_path}"
)
with open(train_pass_monitor_file_path, "w") as monitor_file:
for key, value in queryid_to_results.items():
pass1 = sum(value) / len(value)
pass8 = int(sum(value) > 0)
monitor_file.write(
json.dumps(
{"query_id": key, "pass1": pass1, "pass8": pass8},
ensure_ascii=False,
)
+ "\n"
)
self._call_count += 1
if scores.dtype != torch.float32:
scores = scores.to(torch.float32)
if dense_scores.dtype != torch.float32:
dense_scores = dense_scores.to(torch.float32)
res = SequenceSample(
keys=["rewards", "dense_rewards"],
trailing_shapes=dict(rewards=(), dense_rewards=()),
dtypes=dict(rewards=torch.float32, dense_rewards=torch.float32),
ids=data_.ids,
seqlens=dict(
rewards=[
torch.tensor([1 for _ in range(len(x))], dtype=torch.int32)
for x in data_.seqlens["packed_input_ids"]
],
dense_rewards=data_.seqlens["packed_input_ids"],
),
data=dict(rewards=scores, dense_rewards=dense_scores),
)
# record rewards for each piece of data
avg_scores = []
offset = 0
for i in range(data_.bs):
score_lis = scores[
offset : offset + len(data_.seqlens["packed_input_ids"][i])
]
avg_scores.append(score_lis.mean().item())
offset += len(data_.seqlens["packed_input_ids"][i])
assert offset == sum(len(x) for x in data_.seqlens["packed_input_ids"])
res.metadata["scores"] = avg_scores
if self.check_verifier_status:
avg_score = torch.tensor(
np.mean(avg_scores), device=constants.current_device()
)
dist.all_reduce(
avg_score, op=dist.ReduceOp.SUM, group=constants.parallelism_group()
)
avg_score /= constants.parallelism_group_size()
avg_score = avg_score.item()
minimal_score = (-1 - self.output_bias) * self.rm_output_scaling
if avg_score <= minimal_score or np.isclose(avg_score, minimal_score):
raise CodeVerifierException(
"All rewards are at minimal value. Probably there are something wrong with the verifier!"
)
return res
model_api.register_interface("rw_code", PackedCodeRewardInterface)

View File

@ -4,11 +4,11 @@ import json
import os
import signal
import subprocess
import time
import traceback
import uuid
from typing import *
import requests
from realhf.base import logging
from realhf.base.constants import parallelism_rank
@ -112,10 +112,9 @@ def parse_line(prompt_str, generated, query_id):
def parse_lines_in_parallel(
prompt_strs: List,
generateds: List,
query_ids: List,
max_workers: int,
max_workers=22,
check_xml_format=False,
) -> List:
global id2info
@ -124,12 +123,11 @@ def parse_lines_in_parallel(
id2info = loadJson(os.environ["REAL_MATH_METADATA_PATH"])
except KeyError as e:
raise KeyError("The json file REAL_MATH_METADATA_PATH is not set") from e
assert len(prompt_strs) == len(generateds) == len(query_ids), (
len(prompt_strs),
assert len(generateds) == len(query_ids), (
len(generateds),
len(query_ids),
)
bs = len(prompt_strs)
bs = len(query_ids)
mbs = (bs + max_workers - 1) // max_workers
tmp_ids = []
@ -183,7 +181,7 @@ def parse_lines_in_parallel(
except ProcessLookupError:
pass
labels = [0 for _ in prompt_strs]
labels = [0 for _ in query_ids]
for i, (tmp_id, query_indices) in enumerate(zip(tmp_ids, all_query_indices)):
try:
with open(f"/tmp/{tmp_id}-output.jsonl", "r") as f:
@ -214,7 +212,6 @@ if __name__ == "__main__":
print(
parse_lines_in_parallel(
[sample["prompt"] for _ in range(100)],
# [sample["answer_in_box"][0] for _ in range(50)] + [sample["answer_in_box"][1] for _ in range(50)],
[sample["answer"]] * 100,
[sample["query_id"] for _ in range(100)],

View File

@ -23,6 +23,7 @@ import transformers
import realhf.api.core.model_api as model_api
import realhf.base.logging as logging
from functioncall.math.verify import math_verify
from realhf.api.core.data_api import SequenceSample, load_hf_tokenizer
from realhf.base import constants
from realhf.base.constants import data_parallel_group, data_parallel_world_size
@ -32,6 +33,9 @@ from realhf.impl.model.nn.real_llm_api import ReaLModel
logger = logging.getLogger("Packed Reward Modeling Interface", "benchmark")
# Environment variables are strings, so parse explicitly; otherwise "0"/"False" would still be truthy.
ENABLE_FUNCTION_CALL = os.getenv("ENABLE_FUNCTION_CALL", "true").lower() in ("1", "true", "yes")
math_verify_call = math_verify if ENABLE_FUNCTION_CALL else parse_lines_in_parallel
class MathVerifierException(Exception):
pass
@ -127,13 +131,7 @@ def retokenize(
for seq_str, prompt_str in zip(seq_strs, prompt_strs)
]
format_rewards = parse_lines_in_parallel(
prompt_strs,
_answers,
query_id_strs,
max_workers=22,
check_xml_format=False,
)
format_rewards = math_verify_call(_answers, query_id_strs)
if check_xml_format:
with ThreadPoolExecutor(max_workers=22) as executor:
futures = [

View File

@ -37,7 +37,7 @@ def get_score(prompt_ids, generated, query_ids, tokenizer):
generated, clean_up_tokenization_spaces=False, skip_special_tokens=True
)
query_id_strs = [query_id.split("@")[0] for query_id in query_ids]
return parse_lines_in_parallel(prompt_strs, seq_strs, query_id_strs, max_workers=22)
return parse_lines_in_parallel(seq_strs, query_id_strs)
def topk(scores, gen_lengths, k) -> list:

View File

@ -619,6 +619,7 @@ class RayController:
REAL_RECOVER_RUN=os.environ.get("REAL_RECOVER_RUN", ""),
REAL_SAVE_RECOVER_STATES=os.environ.get("REAL_SAVE_RECOVER_STATES", ""),
REAL_MATH_METADATA_PATH=os.environ.get("REAL_MATH_METADATA_PATH", ""),
REAL_CODE_METADATA_PATH=os.getenv("REAL_CODE_METADATA_PATH", ""),
REAL_USE_V2_WORKER=os.getenv("REAL_USE_V2_WORKER", "0"),
)
runtime_env = {

View File

@ -51,4 +51,6 @@ paramiko
torch>2.0.0
black>=25.1.0
cookiecutter>2.1.1
asyncio
aiohttp
httpx>=0.28.1