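"""Aggregate math-eval results: run the greedy and seeded-sampling evaluation
scripts when needed, then compute length, accuracy, and pass@k metrics per dataset."""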
import argparse
import json
import os
import subprocess
from glob import glob

import numpy as np
from rm_maj_eval import group_pred
from tqdm import tqdm
from transformers import AutoTokenizer

from utils import load_jsonl
def parse_args():
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--data_names", default="math_500,aime24,amc23", type=lambda x: x.split(",")
    )
    parser.add_argument(
        "--model_path",
        default="/storage/openpsi/models/Qwen__Qwen2-1.5B-Instruct/",
        type=str,
    )
    parser.add_argument("--output_path", type=str)
    parser.add_argument("--num_sample_nodes", default=8, type=int)
    parser.add_argument("--samples_per_node", default=4, type=int)
    parser.add_argument("--n_sampling", default=32, type=int)
    parser.add_argument("--prompt_type", default="deepscaler", type=str)
    parser.add_argument("--overwrite", action="store_true")
    parser.add_argument("--evaluate_train", action="store_true")
    parser.add_argument("--max_gen_tokens", default=32768, type=int)

    args = parser.parse_args()
    if args.output_path is None:
        args.output_path = args.model_path
    return args
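# Majority-vote accuracy (maj@k): group the first k predictions by answer, take
# the largest group, and score one representative prediction against the reference.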
def eval_maj_k_metrics(data_list, k=8):
    # print(f"evaluating maj@{k}")

    count, right_count = 0, 0
    for sample in data_list:
        assert len(sample["score"]) >= k, sample
        groups, majority_pred = group_pred(
            sample["pred"][:k], strip=False, use_symbol=False
        )
        idx = groups[majority_pred][0]
        right_count += sample["score"][idx]
        count += 1

    task_acc = right_count / count * 100
    # print(f"maj@{k}: {task_acc:.1f}")
    return task_acc
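# Unbiased pass@k estimator: for a question with n samples and c correct ones,
# pass@k = 1 - C(n - c, k) / C(n, k); the result is averaged over all questions.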
def pass_at_k(data_list, k=8):

    def cur_pass_k(n, c, k):
        if n - c < k:
            return 1.0
        return 1.0 - np.prod(1.0 - k / np.arange(n - c + 1, n + 1))

    # count, right_count = 0, 0
    pass_at_ks = []
    for sample in data_list:
        assert len(sample["score"]) >= k, sample
        correct = sum(sample["score"])
        pass_at_ks.append(cur_pass_k(len(sample["score"]), correct, k))

    return np.mean(pass_at_ks) * 100
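# Load the generation .jsonl files matching fname_pattern, measure token lengths
# of the generated solutions, and collect per-question predictions and scores.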
def get_metrics(fname_pattern, tokenizer, is_greedy):

    generated = []
    lengths = []
    results = {}

    for fname in glob(fname_pattern):
        datas = load_jsonl(fname)
        for data in tqdm(datas, desc=fname):

            # tokenize
            generated.extend(data["code"])
            if len(generated) > 2000:
                encodings = tokenizer(generated, return_length=True)
                lengths.extend(encodings["length"])
                generated = []

            # answer score
            cur_idx = data["idx"]
            if cur_idx not in results:
                results[cur_idx] = {"pred": [], "score": []}

            results[cur_idx]["pred"] += data["pred"]
            results[cur_idx]["score"] += data["score"]

    if generated:
        encodings = tokenizer(generated, return_length=True)
        lengths.extend(encodings["length"])

    print(len(lengths))
    assert len(lengths) != 0
    if is_greedy:
        return {
            "greedy_length": np.mean(lengths),
            "greedy_acc": pass_at_k(results.values(), 1),
            "num_questions": len(lengths),
        }
    else:
        return {
            "sample_length": np.mean(lengths),
            "sample_pass@1": pass_at_k(results.values(), 1),
            "pass@8": pass_at_k(results.values(), 8),
            "pass@16": pass_at_k(results.values(), 16),
        }
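# Evaluate one dataset: read the greedy run (temperature 0.0, n=1) and the
# seeded sampling runs (temperature 0.6), then merge their metrics into one dict.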
def process_single_data_name(args, data_name, base_dir, tokenizer):
    cur_dir = os.path.join(base_dir, data_name)
    greedy_prefix = f"test_{args.prompt_type}_-1_seed0_t0.0_s0_e-1_n1"
    sampling_prefix = (
        f"test_{args.prompt_type}_-1_seed*_t0.6_s0_e-1_n{args.samples_per_node}"
    )

    greedy_length_metrics = get_metrics(
        os.path.join(cur_dir, greedy_prefix + ".jsonl"), tokenizer, True
    )
    sampling_metrics = get_metrics(
        os.path.join(cur_dir, sampling_prefix + ".jsonl"), tokenizer, False
    )

    sample_length = sampling_metrics.pop("sample_length")
    output = dict(
        num_questions=greedy_length_metrics["num_questions"],
        greedy_length=greedy_length_metrics["greedy_length"],
        sample_length=sample_length,
        greedy_acc=greedy_length_metrics["greedy_acc"],
        **sampling_metrics,
    )

    return output
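# Driver: launch the greedy and seeded sampling evaluation scripts when results
# are missing (or --overwrite / --evaluate_train is set), then aggregate and print.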
if __name__ == "__main__":
    args = parse_args()
    print(f"Evaluation output to {args.output_path}")
    assert args.num_sample_nodes * args.samples_per_node >= args.n_sampling

    eval_dir = (
        "math_eval"
        if args.max_gen_tokens == 4096
        else f"math_eval_{args.max_gen_tokens}"
    )

    base_dir = os.path.join(args.output_path, eval_dir)
    os.makedirs(base_dir, exist_ok=True)
    tokenizer = AutoTokenizer.from_pretrained(args.model_path)
    result_path = os.path.join(base_dir, f"aggregate_parallel_{args.prompt_type}.json")
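    # If a result file without the prompt-type suffix already exists (presumably
    # from an earlier version of this script) and the suffixed one does not,
    # reuse it for the qwen-boxed prompt instead of recomputing.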
    if (
        args.prompt_type == "qwen-boxed"
        and os.path.exists(os.path.join(base_dir, "aggregate_parallel.json"))
        and not os.path.exists(result_path)
    ):
        os.system(
            f'cp {os.path.join(base_dir, "aggregate_parallel.json")} {result_path}'
        )
    if not os.path.exists(result_path) or args.overwrite or args.evaluate_train:
        log_path = os.path.join(base_dir, "logs")
        os.makedirs(log_path, exist_ok=True)
        with open(os.path.join(log_path, "greedy.log"), "w") as f:
            subprocess.run(
                [
                    "sh",
                    "sh/eval_greedy.sh",
                    args.model_path,
                    str(args.max_gen_tokens),
                    ",".join(args.data_names),
                    args.prompt_type,
                    args.output_path,
                ],
                text=True,
                stdout=f,
                stderr=f,
            )
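        # One sampling run per seed; each run draws samples_per_node samples.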
        for i in range(args.num_sample_nodes):
            with open(
                os.path.join(
                    log_path, f"seed-{i+1}-sample-{args.samples_per_node}.log"
                ),
                "w",
            ) as f:
                subprocess.run(
                    [
                        "sh",
                        "sh/eval_sample_with_seed.sh",
                        args.model_path,
                        str(i + 1),
                        str(args.samples_per_node),
                        str(args.max_gen_tokens),
                        ",".join(args.data_names),
                        args.prompt_type,
                        args.output_path,
                    ],
                    text=True,
                    stdout=f,
                    stderr=f,
                )
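        # Aggregate metrics per dataset and persist them (skipped for --evaluate_train).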
        all_results = dict()
        for data_name in args.data_names:
            all_results[data_name] = process_single_data_name(
                args, data_name, base_dir, tokenizer
            )

        if not args.evaluate_train:
            with open(result_path, "w") as f:
                json.dump(all_results, f, indent=2)

    else:
        with open(result_path) as f:
            all_results = json.load(f)
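    # Pretty-print the aggregated metrics; fall back to raw JSON if prettytable
    # is unavailable.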
    try:
        from prettytable import PrettyTable

        table = PrettyTable()
        field_names = ["dataset"] + list(all_results[args.data_names[0]].keys())
        table.field_names = field_names
        for k, v in all_results.items():
            table.add_row([k, *[round(v[x], 1) for x in field_names[1:]]])

        print(table)
    except Exception:
        print(json.dumps(all_results, indent=2))