<!--
AReaL/eval.html

643 lines
30 KiB
HTML
Executable File
-->

<!DOCTYPE html>
<html lang="en" data-content_root="./" >
<head>
<meta charset="utf-8" />
<!-- Single viewport declaration; the generator had emitted it twice on this line. -->
<meta name="viewport" content="width=device-width, initial-scale=1" />
<title>Evaluation &#8212; AReaL Documentation</title>
<script data-cfasync="false">
// Copy the persisted "mode" and "theme" preferences from localStorage onto
// <html data-mode / data-theme>, falling back to "" when nothing is stored.
(function () {
  var rootElement = document.documentElement;
  ["mode", "theme"].forEach(function (key) {
    rootElement.dataset[key] = localStorage.getItem(key) || "";
  });
})();
</script>
<!-- Loaded before other Sphinx assets -->
<link href="_static/styles/theme.css?digest=dfe6caa3a7d634c4db9b" rel="stylesheet" />
<link href="_static/styles/bootstrap.css?digest=dfe6caa3a7d634c4db9b" rel="stylesheet" />
<link href="_static/styles/pydata-sphinx-theme.css?digest=dfe6caa3a7d634c4db9b" rel="stylesheet" />
<link href="_static/vendor/fontawesome/6.5.2/css/all.min.css?digest=dfe6caa3a7d634c4db9b" rel="stylesheet" />
<link rel="preload" as="font" type="font/woff2" crossorigin href="_static/vendor/fontawesome/6.5.2/webfonts/fa-solid-900.woff2" />
<link rel="preload" as="font" type="font/woff2" crossorigin href="_static/vendor/fontawesome/6.5.2/webfonts/fa-brands-400.woff2" />
<link rel="preload" as="font" type="font/woff2" crossorigin href="_static/vendor/fontawesome/6.5.2/webfonts/fa-regular-400.woff2" />
<link rel="stylesheet" type="text/css" href="_static/pygments.css?v=fa44fd50" />
<link rel="stylesheet" type="text/css" href="_static/styles/sphinx-book-theme.css?v=eba8b062" />
<link rel="stylesheet" type="text/css" href="_static/togglebutton.css?v=13237357" />
<link rel="stylesheet" type="text/css" href="_static/copybutton.css?v=76b2166b" />
<link rel="stylesheet" type="text/css" href="_static/mystnb.4510f1fc1dee50b3e5859aac5469c37c29e427902b24a333a5f9fcb2f0b3ac41.css?v=be8a1c11" />
<link rel="stylesheet" type="text/css" href="_static/sphinx-thebe.css?v=4fa983c6" />
<link rel="stylesheet" type="text/css" href="_static/sphinx-design.min.css?v=95c83b7e" />
<!-- Pre-loaded scripts that we'll load fully later -->
<link rel="preload" as="script" href="_static/scripts/bootstrap.js?digest=dfe6caa3a7d634c4db9b" />
<link rel="preload" as="script" href="_static/scripts/pydata-sphinx-theme.js?digest=dfe6caa3a7d634c4db9b" />
<script src="_static/vendor/fontawesome/6.5.2/js/all.min.js?digest=dfe6caa3a7d634c4db9b"></script>
<script src="_static/documentation_options.js?v=9eb32ce0"></script>
<script src="_static/doctools.js?v=9a2dae69"></script>
<script src="_static/sphinx_highlight.js?v=dc90522c"></script>
<script src="_static/clipboard.min.js?v=a7894cd8"></script>
<script src="_static/copybutton.js?v=f281be69"></script>
<script src="_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
<script>let toggleHintShow = 'Click to show';</script>
<script>let toggleHintHide = 'Click to hide';</script>
<script>let toggleOpenOnPrint = 'true';</script>
<script src="_static/togglebutton.js?v=4a39c7ea"></script>
<!-- Inline configuration globals for togglebutton and sphinx-thebe.
     Each is declared exactly once: repeating the `const` declarations in a
     second classic script raises "Identifier has already been declared". -->
<script>var togglebuttonSelector = '.toggle, .admonition.dropdown';</script>
<script src="_static/design-tabs.js?v=f930bc37"></script>
<script>const THEBE_JS_URL = "https://unpkg.com/thebe@0.8.2/lib/index.js"; const thebe_selector = ".thebe,.cell"; const thebe_selector_input = "pre"; const thebe_selector_output = ".output, .cell_output"</script>
<script async="async" src="_static/sphinx-thebe.js?v=c100c467"></script>
<script>DOCUMENTATION_OPTIONS.pagename = 'eval';</script>
<link rel="index" title="Index" href="genindex.html" />
<link rel="search" title="Search" href="search.html" />
<link rel="next" title="Troubleshooting" href="troubleshooting.html" />
<link rel="prev" title="Training" href="training.html" />
<!-- The viewport meta is already declared near the top of <head>; the duplicate that was here is dropped. -->
<meta name="docsearch:language" content="en"/>
</head>
<body data-bs-spy="scroll" data-bs-target=".bd-toc-nav" data-offset="180" data-bs-root-margin="0px 0px -60%" data-default-mode="">
<div id="pst-skip-link" class="skip-link d-print-none"><a href="#main-content">Skip to main content</a></div>
<div id="pst-scroll-pixel-helper"></div>
<button type="button" class="btn rounded-pill" id="pst-back-to-top">
<i class="fa-solid fa-arrow-up"></i>Back to top</button>
<input type="checkbox"
class="sidebar-toggle"
id="pst-primary-sidebar-checkbox"/>
<label class="overlay overlay-primary" for="pst-primary-sidebar-checkbox"></label>
<input type="checkbox"
class="sidebar-toggle"
id="pst-secondary-sidebar-checkbox"/>
<label class="overlay overlay-secondary" for="pst-secondary-sidebar-checkbox"></label>
<div class="search-button__wrapper">
<div class="search-button__overlay"></div>
<div class="search-button__search-container">
<form class="bd-search d-flex align-items-center"
action="search.html"
method="get">
<i class="fa-solid fa-magnifying-glass"></i>
<input type="search"
class="form-control"
name="q"
id="search-input"
placeholder="Search this book..."
aria-label="Search this book..."
autocomplete="off"
autocorrect="off"
autocapitalize="off"
spellcheck="false"/>
<span class="search-button__kbd-shortcut"><kbd class="kbd-shortcut__modifier">Ctrl</kbd>+<kbd>K</kbd></span>
</form></div>
</div>
<div class="pst-async-banner-revealer d-none">
<aside id="bd-header-version-warning" class="d-none d-print-none" aria-label="Version warning"></aside>
</div>
<header class="bd-header navbar navbar-expand-lg bd-navbar d-print-none">
</header>
<div class="bd-container">
<div class="bd-container__inner bd-page-width">
<div class="bd-sidebar-primary bd-sidebar">
<div class="sidebar-header-items sidebar-primary__section">
</div>
<div class="sidebar-primary-items__start sidebar-primary__section">
<div class="sidebar-primary-item">
<a class="navbar-brand logo" href="intro.html">
<img src="_static/logo.png" class="logo__image only-light" alt="AReaL Documentation - Home"/>
<script>document.write(`<img src="_static/logo.png" class="logo__image only-dark" alt="AReaL Documentation - Home"/>`);</script>
</a></div>
<div class="sidebar-primary-item">
<script>
document.write(`
<button class="btn search-button-field search-button__button" title="Search" aria-label="Search" data-bs-placement="bottom" data-bs-toggle="tooltip">
<i class="fa-solid fa-magnifying-glass"></i>
<span class="search-button__default-text">Search</span>
<span class="search-button__kbd-shortcut"><kbd class="kbd-shortcut__modifier">Ctrl</kbd>+<kbd class="kbd-shortcut__modifier">K</kbd></span>
</button>
`);
</script></div>
<div class="sidebar-primary-item"><nav class="bd-links bd-docs-nav" aria-label="Main">
<div class="bd-toc-item navbar-nav active">
<ul class="nav bd-sidenav bd-sidenav__home-link">
<li class="toctree-l1">
<a class="reference internal" href="intro.html">
Overview
</a>
</li>
</ul>
<p aria-level="2" class="caption" role="heading"><span class="caption-text">Tutorial</span></p>
<ul class="current nav bd-sidenav">
<li class="toctree-l1"><a class="reference internal" href="installation.html">Installation</a></li>
<li class="toctree-l1"><a class="reference internal" href="training.html">Training</a></li>
<li class="toctree-l1 current active"><a class="current reference internal" href="#">Evaluation</a></li>
<li class="toctree-l1"><a class="reference internal" href="troubleshooting.html">Troubleshooting</a></li>
</ul>
<p aria-level="2" class="caption" role="heading"><span class="caption-text">Developer Manual</span></p>
<ul class="nav bd-sidenav">
<li class="toctree-l1"><a class="reference internal" href="developer/exp_launch.html">Launching Procedure</a></li>
<li class="toctree-l1"><a class="reference internal" href="developer/master_worker.html">Master Worker</a></li>
<li class="toctree-l1"><a class="reference internal" href="developer/model_worker.html">Model Worker</a></li>
<li class="toctree-l1"><a class="reference internal" href="developer/algo_interface.html">Algorithm, Interface &amp; Backends</a></li>
<li class="toctree-l1"><a class="reference internal" href="developer/allocation_parallel.html">Allocation &amp; Parallelism</a></li>
</ul>
<p aria-level="2" class="caption" role="heading"><span class="caption-text">Contributing</span></p>
<ul class="nav bd-sidenav">
<li class="toctree-l1"><a class="reference internal" href="contrib.html">Contribution Guide</a></li>
</ul>
</div>
</nav></div>
</div>
<div class="sidebar-primary-items__end sidebar-primary__section">
</div>
<div id="rtd-footer-container"></div>
</div>
<main id="main-content" class="bd-main" role="main">
<div class="sbt-scroll-pixel-helper"></div>
<div class="bd-content">
<div class="bd-article-container">
<div class="bd-header-article d-print-none">
<div class="header-article-items header-article__inner">
<div class="header-article-items__start">
<div class="header-article-item"><button class="sidebar-toggle primary-toggle btn btn-sm" title="Toggle primary sidebar" data-bs-placement="bottom" data-bs-toggle="tooltip">
<span class="fa-solid fa-bars"></span>
</button></div>
</div>
<div class="header-article-items__end">
<div class="header-article-item">
<div class="article-header-buttons">
<!-- Source repository dropdown. External links open in a new tab, so they carry
     rel="noopener noreferrer" to keep the opened page from reaching window.opener. -->
<div class="dropdown dropdown-source-buttons">
<button class="btn dropdown-toggle" type="button" data-bs-toggle="dropdown" aria-expanded="false" aria-label="Source repositories">
<i class="fab fa-github"></i>
</button>
<ul class="dropdown-menu">
<li><a href="https://github.com/inclusionAI/AReaL" target="_blank" rel="noopener noreferrer"
class="btn btn-sm btn-source-repository-button dropdown-item"
title="Source repository"
data-bs-placement="left" data-bs-toggle="tooltip"
>
<span class="btn__icon-container">
<i class="fab fa-github"></i>
</span>
<span class="btn__text-container">Repository</span>
</a>
</li>
<li><a href="https://github.com/inclusionAI/AReaL/issues/new?title=Issue%20on%20page%20%2Feval.html&body=Your%20issue%20content%20here." target="_blank" rel="noopener noreferrer"
class="btn btn-sm btn-source-issues-button dropdown-item"
title="Open an issue"
data-bs-placement="left" data-bs-toggle="tooltip"
>
<span class="btn__icon-container">
<i class="fas fa-lightbulb"></i>
</span>
<span class="btn__text-container">Open issue</span>
</a>
</li>
</ul>
</div>
<!-- Download dropdown: link to the page's Markdown source and a print-to-PDF action.
     The new-tab link gets rel="noopener"; the action button gets an explicit
     type="button" so it can never act as a form submit control. -->
<div class="dropdown dropdown-download-buttons">
<button class="btn dropdown-toggle" type="button" data-bs-toggle="dropdown" aria-expanded="false" aria-label="Download this page">
<i class="fas fa-download"></i>
</button>
<ul class="dropdown-menu">
<li><a href="_sources/eval.md" target="_blank" rel="noopener"
class="btn btn-sm btn-download-source-button dropdown-item"
title="Download source file"
data-bs-placement="left" data-bs-toggle="tooltip"
>
<span class="btn__icon-container">
<i class="fas fa-file"></i>
</span>
<span class="btn__text-container">.md</span>
</a>
</li>
<li>
<button type="button" onclick="window.print()"
class="btn btn-sm btn-download-pdf-button dropdown-item"
title="Print to PDF"
data-bs-placement="left" data-bs-toggle="tooltip"
>
<span class="btn__icon-container">
<i class="fas fa-file-pdf"></i>
</span>
<span class="btn__text-container">.pdf</span>
</button>
</li>
</ul>
</div>
<button onclick="toggleFullScreen()"
class="btn btn-sm btn-fullscreen-button"
title="Fullscreen mode"
data-bs-placement="bottom" data-bs-toggle="tooltip"
>
<span class="btn__icon-container">
<i class="fas fa-expand"></i>
</span>
</button>
<script>
document.write(`
<button class="btn btn-sm nav-link pst-navbar-icon theme-switch-button" title="light/dark" aria-label="light/dark" data-bs-placement="bottom" data-bs-toggle="tooltip">
<i class="theme-switch fa-solid fa-sun fa-lg" data-mode="light"></i>
<i class="theme-switch fa-solid fa-moon fa-lg" data-mode="dark"></i>
<i class="theme-switch fa-solid fa-circle-half-stroke fa-lg" data-mode="auto"></i>
</button>
`);
</script>
<script>
document.write(`
<button class="btn btn-sm pst-navbar-icon search-button search-button__button" title="Search" aria-label="Search" data-bs-placement="bottom" data-bs-toggle="tooltip">
<i class="fa-solid fa-magnifying-glass fa-lg"></i>
</button>
`);
</script>
<button class="sidebar-toggle secondary-toggle btn btn-sm" title="Toggle secondary sidebar" data-bs-placement="bottom" data-bs-toggle="tooltip">
<span class="fa-solid fa-list"></span>
</button>
</div></div>
</div>
</div>
</div>
<div id="jb-print-docs-body" class="onlyprint">
<h1>Evaluation</h1>
<!-- Table of contents -->
<div id="print-main-content">
<div id="jb-print-toc">
<div>
<h2> Contents </h2>
</div>
<nav aria-label="Page">
<ul class="visible nav section-nav flex-column">
<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#setup-evaluation-environment">Setup Evaluation Environment</a></li>
<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#install-dependencies-and-run-evaluation">Install Dependencies and Run Evaluation</a><ul class="nav section-nav flex-column">
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#command-line-parameters">Command Line Parameters</a></li>
</ul>
</li>
<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#evaluation-results">Evaluation Results</a><ul class="nav section-nav flex-column">
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#metrics-explanation">Metrics Explanation</a></li>
</ul>
</li>
<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#configuration-details">Configuration Details</a><ul class="nav section-nav flex-column">
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#sampling-parameters">Sampling Parameters</a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#runtime-expectations">Runtime Expectations</a></li>
</ul>
</li>
</ul>
</nav>
</div>
</div>
</div>
<div id="searchbox"></div>
<article class="bd-article">
<section class="tex2jax_ignore mathjax_ignore" id="evaluation">
<h1>Evaluation<a class="headerlink" href="#evaluation" title="Link to this heading">#</a></h1>
<p>The evaluation code is located in the <code class="docutils literal notranslate"><span class="pre">evaluation</span></code> folder of the repository. Following the previous tutorial, trained checkpoints will be saved under <code class="docutils literal notranslate"><span class="pre">/storage/ray/experiments/checkpoints/root/</span></code>.</p>
<section id="setup-evaluation-environment">
<h2>Setup Evaluation Environment<a class="headerlink" href="#setup-evaluation-environment" title="Link to this heading">#</a></h2>
<p>Start a new container to execute the evaluation script. <strong>Note</strong>: Evaluation requires updates to certain Python libraries, so avoid using the training container for this task.</p>
<div class="highlight-bash notranslate"><div class="highlight"><pre><span></span>docker<span class="w"> </span>run<span class="w"> </span>-d<span class="w"> </span>--name<span class="w"> </span>areal-eval<span class="w"> </span>--privileged<span class="w"> </span>--gpus<span class="w"> </span>all<span class="w"> </span>--network<span class="w"> </span>host<span class="w"> </span>--shm-size<span class="w"> </span>700g<span class="w"> </span>-v<span class="w"> </span>/storage:/storage<span class="w"> </span>ghcr.io/inclusionai/areal-runtime:v0.3.0<span class="w"> </span>/bin/bash<span class="w"> </span>-c<span class="w"> </span><span class="s2">&quot;tail -f /dev/null&quot;</span>
docker<span class="w"> </span><span class="nb">exec</span><span class="w"> </span>-it<span class="w"> </span>areal-eval<span class="w"> </span>bash
</pre></div>
</div>
</section>
<section id="install-dependencies-and-run-evaluation">
<h2>Install Dependencies and Run Evaluation<a class="headerlink" href="#install-dependencies-and-run-evaluation" title="Link to this heading">#</a></h2>
<p>Execute the following commands inside the Docker container:</p>
<div class="highlight-bash notranslate"><div class="highlight"><pre><span></span><span class="nb">cd</span><span class="w"> </span>/storage/codes/AReaL/evaluation
<span class="nb">cd</span><span class="w"> </span>latex2sympy
pip<span class="w"> </span>install<span class="w"> </span>-e<span class="w"> </span>.
<span class="nb">cd</span><span class="w"> </span>..
pip<span class="w"> </span>install<span class="w"> </span>-r<span class="w"> </span>requirements.txt<span class="w"> </span>
pip<span class="w"> </span>install<span class="w"> </span><span class="nv">vllm</span><span class="o">==</span><span class="m">0</span>.8.5<span class="w"> </span>--no-build-isolation
pip<span class="w"> </span>install<span class="w"> </span><span class="nv">transformers</span><span class="o">==</span><span class="m">4</span>.51.1
pip<span class="w"> </span>install<span class="w"> </span>prettytable<span class="w"> </span>timeout_decorator
mkdir<span class="w"> </span>-p<span class="w"> </span>/storage/ray/eval_output/
nohup<span class="w"> </span>python<span class="w"> </span>eval_and_aggregate.py<span class="w"> </span><span class="se">\</span>
<span class="w"> </span>--model_path<span class="w"> </span>/storage/ray/experiments/checkpoints/root/my-exp/my-trial/epoch1epochstep20globalstep20/<span class="w"> </span><span class="se">\</span>
<span class="w"> </span>--output_path<span class="w"> </span>/storage/ray/eval_output/<span class="w"> </span><span class="se">\</span>
<span class="w"> </span>--data_names<span class="w"> </span><span class="s2">&quot;math_500,aime24,amc23&quot;</span><span class="w"> </span><span class="se">\</span>
<span class="w"> </span>--max_gen_tokens<span class="w"> </span><span class="m">32768</span><span class="w"> </span><span class="p">&amp;</span>&gt;<span class="w"> </span>/storage/ray/eval_output/eval_and_aggregate_parallel.log<span class="w"> </span><span class="p">&amp;</span>
</pre></div>
</div>
<section id="command-line-parameters">
<h3>Command Line Parameters<a class="headerlink" href="#command-line-parameters" title="Link to this heading">#</a></h3>
<ul class="simple">
<li><p><strong><code class="docutils literal notranslate"><span class="pre">--model_path</span></code></strong>: Path to the saved model parameters</p></li>
<li><p><strong><code class="docutils literal notranslate"><span class="pre">--output_path</span></code></strong>: Path to store generated answers and log files during evaluation</p></li>
<li><p><strong><code class="docutils literal notranslate"><span class="pre">--data_names</span></code></strong>: Dataset(s) to evaluate. Multiple datasets can be separated by commas. Available options: <code class="docutils literal notranslate"><span class="pre">math_500</span></code>, <code class="docutils literal notranslate"><span class="pre">math</span></code>, <code class="docutils literal notranslate"><span class="pre">gsm8k</span></code>, <code class="docutils literal notranslate"><span class="pre">train_amc_aime</span></code>, <code class="docutils literal notranslate"><span class="pre">aime24</span></code>, <code class="docutils literal notranslate"><span class="pre">amc23</span></code></p></li>
<li><p><strong><code class="docutils literal notranslate"><span class="pre">--max_gen_tokens</span></code></strong>: Maximum length of generated answers (default: 32768)</p></li>
</ul>
</section>
</section>
<section id="evaluation-results">
<h2>Evaluation Results<a class="headerlink" href="#evaluation-results" title="Link to this heading">#</a></h2>
<p>The evaluation script will output a results table in the terminal:</p>
<div class="highlight-default notranslate"><div class="highlight"><pre><span></span><span class="o">+----------+---------------+---------------+---------------+------------+---------------+--------+---------+</span>
<span class="o">|</span> <span class="n">dataset</span> <span class="o">|</span> <span class="n">num_questions</span> <span class="o">|</span> <span class="n">greedy_length</span> <span class="o">|</span> <span class="n">sample_length</span> <span class="o">|</span> <span class="n">greedy_acc</span> <span class="o">|</span> <span class="n">sample_pass</span><span class="o">@</span><span class="mi">1</span> <span class="o">|</span> <span class="k">pass</span><span class="o">@</span><span class="mi">8</span> <span class="o">|</span> <span class="k">pass</span><span class="o">@</span><span class="mi">16</span> <span class="o">|</span>
<span class="o">+----------+---------------+---------------+---------------+------------+---------------+--------+---------+</span>
<span class="o">|</span> <span class="n">math_500</span> <span class="o">|</span> <span class="mi">500</span> <span class="o">|</span> <span class="mf">6757.4</span> <span class="o">|</span> <span class="mf">4139.5</span> <span class="o">|</span> <span class="mf">84.4</span> <span class="o">|</span> <span class="mf">92.7</span> <span class="o">|</span> <span class="mf">97.3</span> <span class="o">|</span> <span class="mf">97.7</span> <span class="o">|</span>
<span class="o">|</span> <span class="n">aime24</span> <span class="o">|</span> <span class="mi">30</span> <span class="o">|</span> <span class="mf">19328.0</span> <span class="o">|</span> <span class="mf">13663.5</span> <span class="o">|</span> <span class="mf">50.0</span> <span class="o">|</span> <span class="mf">50.4</span> <span class="o">|</span> <span class="mf">77.3</span> <span class="o">|</span> <span class="mf">80.0</span> <span class="o">|</span>
<span class="o">|</span> <span class="n">amc23</span> <span class="o">|</span> <span class="mi">40</span> <span class="o">|</span> <span class="mf">8850.0</span> <span class="o">|</span> <span class="mf">6526.2</span> <span class="o">|</span> <span class="mf">80.0</span> <span class="o">|</span> <span class="mf">90.5</span> <span class="o">|</span> <span class="mf">96.8</span> <span class="o">|</span> <span class="mf">98.8</span> <span class="o">|</span>
<span class="o">+----------+---------------+---------------+---------------+------------+---------------+--------+---------+</span>
</pre></div>
</div>
<section id="metrics-explanation">
<h3>Metrics Explanation<a class="headerlink" href="#metrics-explanation" title="Link to this heading">#</a></h3>
<ul class="simple">
<li><p><strong><code class="docutils literal notranslate"><span class="pre">{greedy|sample}_length</span></code></strong>: Average answer length under greedy or random sampling strategy</p></li>
<li><p><strong><code class="docutils literal notranslate"><span class="pre">greedy_acc</span></code></strong>: Average accuracy under greedy sampling</p></li>
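<li><p><strong><code class="docutils literal notranslate"><span class="pre">sample_pass&#64;{k}</span></code></strong>: Probability of generating a correct answer within <code class="docutils literal notranslate"><span class="pre">k</span></code> attempts under random sampling (reported in the table above as the <code class="docutils literal notranslate"><span class="pre">sample_pass&#64;1</span></code>, <code class="docutils literal notranslate"><span class="pre">pass&#64;8</span></code>, and <code class="docutils literal notranslate"><span class="pre">pass&#64;16</span></code> columns)</p></li>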
</ul>
</section>
</section>
<section id="configuration-details">
<h2>Configuration Details<a class="headerlink" href="#configuration-details" title="Link to this heading">#</a></h2>
<section id="sampling-parameters">
<h3>Sampling Parameters<a class="headerlink" href="#sampling-parameters" title="Link to this heading">#</a></h3>
<ul class="simple">
<li><p>By default, the evaluation script averages results over 32 samples generated at temperature 0.6</p></li>
<li><p>We observed that the <code class="docutils literal notranslate"><span class="pre">enforce_eager</span></code> parameter in vLLM significantly impacts evaluation performance</p></li>
<li><p>When <code class="docutils literal notranslate"><span class="pre">enforce_eager=True</span></code>, we can reproduce the model performance reported in previous work</p></li>
<li><p>Without this setting, evaluation results may fall below reported performance</p></li>
<li><p>Therefore, we enforce <code class="docutils literal notranslate"><span class="pre">enforce_eager=True</span></code> during evaluation</p></li>
</ul>
</section>
<section id="runtime-expectations">
<h3>Runtime Expectations<a class="headerlink" href="#runtime-expectations" title="Link to this heading">#</a></h3>
<p>Due to the sampling requirements and <code class="docutils literal notranslate"><span class="pre">enforce_eager</span></code> setting, the evaluation process typically takes considerable time.</p>
<p>Runtime depends on several factors:</p>
<ul class="simple">
<li><p>Maximum generation length</p></li>
<li><p>Number of questions in the dataset</p></li>
<li><p>Model size</p></li>
</ul>
<p><strong>Performance benchmarks</strong> (on 8x H100 GPUs):</p>
<ul class="simple">
<li><p><strong>AIME dataset</strong>: ~80 minutes</p></li>
<li><p><strong>MATH_500 dataset</strong>: ~160 minutes</p></li>
</ul>
</section>
</section>
</section>
<!-- Thebe configuration data block. Its type is "text/x-thebe-config", which is
     not a JavaScript MIME type, so the browser does not execute it; it is
     presumably read by the sphinx-thebe/thebe scripts loaded in <head> (confirm).
     It requests a kernel, names a Binder repo/ref, sets the CodeMirror theme and
     mode, and selects the "python3" kernel with path "./.". -->
<script type="text/x-thebe-config">
{
requestKernel: true,
binderOptions: {
repo: "binder-examples/jupyter-stacks-datascience",
ref: "master",
},
codeMirrorConfig: {
theme: "abcdef",
mode: "python"
},
kernelOptions: {
name: "python3",
path: "./."
},
predefinedOutput: true
}
</script>
<!-- Assigns kernelName without var/let/const, i.e. as an implicit global.
     NOTE(review): presumably consumed by sphinx-thebe.js; verify before changing. -->
<script>kernelName = 'python3'</script>
</article>
<footer class="prev-next-footer d-print-none">
<div class="prev-next-area">
<a class="left-prev"
href="training.html"
title="previous page">
<i class="fa-solid fa-angle-left"></i>
<div class="prev-next-info">
<p class="prev-next-subtitle">previous</p>
<p class="prev-next-title">Training</p>
</div>
</a>
<a class="right-next"
href="troubleshooting.html"
title="next page">
<div class="prev-next-info">
<p class="prev-next-subtitle">next</p>
<p class="prev-next-title">Troubleshooting</p>
</div>
<i class="fa-solid fa-angle-right"></i>
</a>
</div>
</footer>
</div>
<div class="bd-sidebar-secondary bd-toc"><div class="sidebar-secondary-items sidebar-secondary__inner">
<div class="sidebar-secondary-item">
<div class="page-toc tocsection onthispage">
<i class="fa-solid fa-list"></i> Contents
</div>
<!-- In-page table of contents. The nav is labelled so assistive technology can
     tell this landmark apart from the "Main" navigation in the primary sidebar. -->
<nav class="bd-toc-nav page-toc" aria-label="Contents">
<ul class="visible nav section-nav flex-column">
<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#setup-evaluation-environment">Setup Evaluation Environment</a></li>
<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#install-dependencies-and-run-evaluation">Install Dependencies and Run Evaluation</a><ul class="nav section-nav flex-column">
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#command-line-parameters">Command Line Parameters</a></li>
</ul>
</li>
<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#evaluation-results">Evaluation Results</a><ul class="nav section-nav flex-column">
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#metrics-explanation">Metrics Explanation</a></li>
</ul>
</li>
<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#configuration-details">Configuration Details</a><ul class="nav section-nav flex-column">
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#sampling-parameters">Sampling Parameters</a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#runtime-expectations">Runtime Expectations</a></li>
</ul>
</li>
</ul>
</nav></div>
</div></div>
</div>
<footer class="bd-footer-content">
<div class="bd-footer-content__inner container">
<div class="footer-item">
<p class="component-author">
By Wei Fu
</p>
</div>
<div class="footer-item">
<p class="copyright">
© Copyright 2023.
<br/>
</p>
</div>
<div class="footer-item">
</div>
<div class="footer-item">
</div>
</div>
</footer>
</main>
</div>
</div>
<!-- Scripts loaded after <body> so the DOM is not blocked -->
<script src="_static/scripts/bootstrap.js?digest=dfe6caa3a7d634c4db9b"></script>
<script src="_static/scripts/pydata-sphinx-theme.js?digest=dfe6caa3a7d634c4db9b"></script>
<footer class="bd-footer">
</footer>
</body>
</html>