# mirror of https://github.com/inclusionAI/AReaL
import bisect
import json
import os
import re
from collections import defaultdict
from typing import Dict, List, Optional, Set, Tuple

from tqdm import tqdm


def get_percentile(rating: float, sorted_ratings: List[float]) -> float:
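    """Percentile of ``rating`` against the sorted historical rating list.

    ``bisect_left`` counts the historical ratings strictly below ``rating``,
    so the result is the share of players outperformed, in percent.
    """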
    idx = bisect.bisect_left(sorted_ratings, float(rating))
    return round(idx / len(sorted_ratings) * 100, 1)


def read_ratings(file_path: str) -> List[float]:
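    """Expand a {rating: player_count} histogram into a sorted rating list."""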
    with open(file_path, "r") as f:
        ratings_dict = json.load(f)

    sorted_ratings = []
    for rating, count in ratings_dict.items():
        sorted_ratings.extend([float(rating)] * count)

    return sorted(sorted_ratings)


def load_cached_contest_data(cache_file_path: str) -> Dict:
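    """Read the pre-fetched contest standings/rating-changes cache from disk."""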
    if not os.path.exists(cache_file_path):
        raise FileNotFoundError(f"Cache file does not exist: {cache_file_path}")

    with open(cache_file_path, "r") as f:
        data = json.load(f)

    print(f"Loaded {len(data)} contest entries from cache file")
    return data


def get_contest_data_from_cache(
    contest_id: int, cached_data: Dict
) -> Tuple[Optional[Dict], Optional[Dict]]:
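    """Fetch (standings, rating_changes) for one contest from the cache.

    Returns (None, None) when the contest is missing or its cached payload is
    malformed, letting callers skip it.
    """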
    contest_id_str = str(contest_id)

    if contest_id_str not in cached_data:
        print(f"Warning: Contest {contest_id} data not found in cache")
        return None, None

    contest_data = cached_data[contest_id_str]

    try:
        standings = contest_data["standings"]
        rating_changes = contest_data["rating_changes"]

        if standings.get("status") != "OK" or rating_changes.get("status") != "OK":
            print(f"Warning: Contest {contest_id} cached data has abnormal status")
            return None, None

        return standings, rating_changes

    except KeyError as e:
        print(f"Warning: Contest {contest_id} cached data structure is abnormal: {e}")
        return None, None


def calc_elo_rating_offline(
    contest_id: int,
    problem_status: Dict[str, List[bool]],
    sorted_ratings: List[float],
    cached_data: Dict,
    pass_n=None,
) -> Optional[Tuple[int, float]]:
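    """Estimate the Elo rating the model's performance would earn in a contest.

    The model's submissions are scored against the cached human standings to
    get a rank, then a binary search finds the rating whose expected seed
    equals that rank. Returns (rating, percentile), or None if the contest
    has to be skipped.
    """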
    try:
        standings, rating_changes = get_contest_data_from_cache(contest_id, cached_data)

        if standings is None or rating_changes is None:
            return None

        # Keep only participants present in both the standings and the
        # rating-change list so the two arrays stay aligned.
        handle_set: Set[str] = set()
        try:
            handle_set_standings = set(
                standings["result"]["rows"][i]["party"]["members"][0]["handle"]
                for i in range(len(standings["result"]["rows"]))
            )

            handle_set_ratings = set(
                rating_changes["result"][i]["handle"]
                for i in range(len(rating_changes["result"]))
            )

            handle_set = handle_set_standings.intersection(handle_set_ratings)

            standings["result"]["rows"] = [
                row
                for row in standings["result"]["rows"]
                if row["party"]["members"][0]["handle"] in handle_set
            ]

            rating_changes["result"] = [
                change
                for change in rating_changes["result"]
                if change["handle"] in handle_set
            ]

            # Skip contests with 200 or fewer rated participants: too few
            # opponents make the rank-to-rating fit unreliable.
            assert (
                len(standings["result"]["rows"]) == len(rating_changes["result"])
                and len(standings["result"]["rows"]) > 200
            )
        except Exception:
            return None

        # Re-validate the filtered payload (largely redundant with the assert
        # above; kept as a safety net).
        if (
            "result" not in standings
            or "result" not in rating_changes
            or len(standings["result"]["rows"]) != len(rating_changes["result"])
            or len(standings["result"]["rows"]) <= 200
        ):
            return None

        max_rating = max(change["oldRating"] for change in rating_changes["result"])

        score = 0
        penalty = 0

        # Score the model like a contestant: on the ith attempt that passes,
        # points-based contests deduct 50 points per failed earlier attempt,
        # while ICPC-style contests count the solve and add a retry penalty.
        for problem in standings["result"]["problems"]:
            prob = f"{problem['contestId']}{problem['index']}"
            if prob in problem_status:
                if pass_n is None:
                    pass_n = len(problem_status[prob])
                for ith, status in enumerate(problem_status[prob][:pass_n]):
                    if status == 1.0:
                        if "points" in problem:
                            score += max(0, problem["points"] - 50 * ith)
                        else:
                            score += 1
                            penalty += ith * 10
                        break

        n = len(standings["result"]["rows"])

        # Rank = index of the first participant the model beats, i.e. one
        # with a lower score, or an equal score and a higher penalty.
        rank = n
        for i in range(n):
            if standings["result"]["rows"][i]["points"] < score or (
                standings["result"]["rows"][i]["points"] == score
                and standings["result"]["rows"][i]["penalty"] > penalty
            ):
                rank = i
                break

        # Binary search for the rating whose expected seed matches the rank.
        # A rating's seed is 1 plus the sum of Elo win probabilities,
        # 1 / (1 + 10^((mid - opponent_rating) / 400)), over all opponents;
        # it decreases as the candidate rating grows.
        l, r = 0, max_rating + 100
        while r - l > 1:
            mid = (l + r) // 2
            new_seed = 1
            for i in range(n):
                new_seed += 1 / (
                    1 + 10 ** ((mid - rating_changes["result"][i]["oldRating"]) / 400)
                )
            if new_seed < rank:
                r = mid
            else:
                l = mid

        percentile = get_percentile(l, sorted_ratings)
        return l, percentile

    except Exception as e:
        print(f"Error calculating contest {contest_id} Elo rating: {e}")
        return None


def format_grouped_contest_data(
    submissions: List[List[bool]], problem_ids: List[str]
) -> List[Tuple[int, Dict[str, List[bool]]]]:
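    """Group per-problem submission results by contest.

    A problem ID such as "1234A" yields contest ID 1234; the output pairs
    each contest with a dict mapping problem IDs to their submission results.
    """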
    if len(submissions) != len(problem_ids):
        raise ValueError("Length of submissions and problem_ids must be the same.")

    grouped_data = defaultdict(dict)

    for problem_id, submission in zip(problem_ids, submissions):
        # Extract the contest ID from the leading digits of the problem ID
        match = re.match(r"(\d+)([A-Z].*)", problem_id)
        if not match:
            raise ValueError(f"Invalid problem ID format: {problem_id}")

        contest_id = int(match.group(1))
        # group(0) is the full problem ID (e.g. "1234A"), matching the
        # f"{contestId}{index}" keys built in calc_elo_rating_offline
        problem_key = match.group(0)

        grouped_data[contest_id][problem_key] = submission

    combined_data = [
        (contest_id, problems) for contest_id, problems in grouped_data.items()
    ]

    return combined_data


def convert_score_to_cf_format(
    all_samples: List[Dict], metadata: List[str]
) -> List[List[bool]]:
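    """Convert evaluation samples into per-problem lists of attempt results.

    Samples are sorted by "idx" so they line up with the problem metadata;
    a sample without a "score" field counts as a single failed attempt.
    (``metadata`` is accepted for interface symmetry but unused here.)
    """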
    cf_results = []

    sorted_samples = sorted(all_samples, key=lambda x: x["idx"])

    for sample in sorted_samples:
        if "score" in sample:
            cf_results.append([bool(s) for s in sample["score"]])
        else:
            cf_results.append([False])

    return cf_results


class CFEloCalculator:
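    """Estimates a model's Codeforces Elo rating from evaluation samples."""
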
    def __init__(
        self,
        metadata_path: Optional[str] = None,
        ratings_path: Optional[str] = None,
        cache_file_path: Optional[str] = None,
    ):
        # Set default paths. NOTE: current_dir is hardcoded to a cluster
        # storage location; the commented-out line would resolve the data
        # directory relative to this file instead.
        # current_dir = os.path.dirname(os.path.abspath(__file__))
        current_dir = "/storage/openpsi/data/code/test_set/codeforces"

        self.metadata_path = metadata_path or os.path.join(
            current_dir, "metadata_cf.json"
        )
        self.ratings_path = ratings_path or os.path.join(
            current_dir, "ratings_2024.json"
        )
        self.cache_file_path = cache_file_path or os.path.join(
            current_dir, "all_contest_data.json"
        )

        # Preload data
        self._load_data()

    def _load_data(self):
        try:
            self.sorted_ratings = read_ratings(self.ratings_path)
            print(f"✓ Loaded {len(self.sorted_ratings)} historical ratings")

            with open(self.metadata_path, "r") as file:
                self.metadata = json.load(file)
            print(f"✓ Loaded metadata for {len(self.metadata)} problems")

            self.cached_data = load_cached_contest_data(self.cache_file_path)
            print("✓ Loaded cached contest data")

        except Exception as e:
            raise RuntimeError(f"Failed to load data files: {e}")

    def calculate_elo(
        self, all_samples: List[Dict], pass_n: int = 1, verbose: bool = True
    ) -> Optional[Dict]:
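        """Run the full pipeline and return aggregate plus per-contest results.

        ``pass_n`` caps how many attempts per problem are scored. Returns None
        if no contest produces a valid rating.
        """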
        try:
            if verbose:
                print("\n" + "=" * 50)
                print("Starting Codeforces Elo rating calculation...")
                print("=" * 50)

            # Convert raw samples into per-problem pass/fail lists
            cf_results = convert_score_to_cf_format(all_samples, self.metadata)
            if verbose:
                print(f"✓ Converted {len(cf_results)} test results")

            # Group the results by contest
            model_results = format_grouped_contest_data(cf_results, self.metadata)
            if verbose:
                print(f"✓ Data grouped into {len(model_results)} contests")

            # Calculate an Elo rating for each contest
            contest_elos = []
            skipped_contests = []

            iterator = (
                tqdm(model_results, desc="Calculating Elo ratings")
                if verbose
                else model_results
            )
            for contest_id, problems in iterator:
                elo_result = calc_elo_rating_offline(
                    contest_id, problems, self.sorted_ratings, self.cached_data, pass_n
                )
                if elo_result is not None:
                    contest_elos.append((contest_id, elo_result))
                else:
                    skipped_contests.append(contest_id)

            # Average the per-contest ratings and convert to a percentile;
            # contest_elos only holds successful results by construction.
            percentiles = [elo[1][1] for elo in contest_elos]
            ratings = [elo[1][0] for elo in contest_elos]

            if not percentiles:
                print("Error: No valid percentiles calculated")
                return None

            estimated_rating = sum(ratings) / len(ratings)
            est_percentile = get_percentile(estimated_rating, self.sorted_ratings)

            # Display results
            if verbose:
                print("\n" + "=" * 50)
                print("CODEFORCES EVALUATION RESULTS")
                print("=" * 50)
                print(f"Estimated percentile: {est_percentile:.1f}%")
                print(f"Estimated Codeforces rating: {estimated_rating:.0f}")

                if skipped_contests:
                    print(
                        f"Skipped contest IDs: {skipped_contests[:10]}"
                        f"{'...' if len(skipped_contests) > 10 else ''}"
                    )

                print("=" * 50)

            # Return detailed results
            return {
                "estimated_percentile": est_percentile,
                "estimated_rating": estimated_rating,
                "contests_processed": len(contest_elos),
                "contests_skipped": len(skipped_contests),
                "skipped_contest_ids": skipped_contests,
                "individual_contest_results": [
                    {
                        "contest_id": contest_id,
                        "rating": rating,
                        "percentile": percentile,
                    }
                    for contest_id, (rating, percentile) in contest_elos
                ],
            }

        except Exception as e:
            print(f"Error calculating CF Elo rating: {e}")
            return None


def calculate_cf_elo_from_samples(
    all_samples: List[Dict],
    pass_n: int = 1,
    metadata_path: Optional[str] = None,
    ratings_path: Optional[str] = None,
    cache_file_path: Optional[str] = None,
    verbose: bool = True,
) -> Optional[Dict]:
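    """Convenience wrapper: build a CFEloCalculator and run its pipeline."""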
    calculator = CFEloCalculator(metadata_path, ratings_path, cache_file_path)
    return calculator.calculate_elo(all_samples, pass_n, verbose)
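

# Hypothetical usage sketch (not in the original module). The sample schema is
# inferred from convert_score_to_cf_format: dicts with an "idx" field and an
# optional "score" list holding one entry per attempt. Note that all_samples
# must supply one sample per problem in metadata_cf.json (the two entries
# below only illustrate the schema), and the metadata/ratings/cache JSON files
# must exist at their configured paths.
if __name__ == "__main__":
    samples = [
        {"idx": 0, "score": [1.0, 0.0]},  # solved on the first attempt
        {"idx": 1, "score": [0.0, 0.0]},  # never solved
    ]
    result = calculate_cf_elo_from_samples(samples, pass_n=2)
    if result is not None:
        print(result["estimated_rating"], result["estimated_percentile"])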