AReaL/developer/allocation_parallel.html

613 lines
24 KiB
HTML
Executable File
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

<!DOCTYPE html>
<html lang="en" data-content_root="../" >
<head>
<meta charset="utf-8" />
<meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
<title>Allocation &amp; Parallelism &#8212; AReaL Documentation</title>
<script data-cfasync="false">
document.documentElement.dataset.mode = localStorage.getItem("mode") || "";
document.documentElement.dataset.theme = localStorage.getItem("theme") || "";
</script>
<!-- Loaded before other Sphinx assets -->
<link href="../_static/styles/theme.css?digest=dfe6caa3a7d634c4db9b" rel="stylesheet" />
<link href="../_static/styles/bootstrap.css?digest=dfe6caa3a7d634c4db9b" rel="stylesheet" />
<link href="../_static/styles/pydata-sphinx-theme.css?digest=dfe6caa3a7d634c4db9b" rel="stylesheet" />
<link href="../_static/vendor/fontawesome/6.5.2/css/all.min.css?digest=dfe6caa3a7d634c4db9b" rel="stylesheet" />
<link rel="preload" as="font" type="font/woff2" crossorigin href="../_static/vendor/fontawesome/6.5.2/webfonts/fa-solid-900.woff2" />
<link rel="preload" as="font" type="font/woff2" crossorigin href="../_static/vendor/fontawesome/6.5.2/webfonts/fa-brands-400.woff2" />
<link rel="preload" as="font" type="font/woff2" crossorigin href="../_static/vendor/fontawesome/6.5.2/webfonts/fa-regular-400.woff2" />
<link rel="stylesheet" type="text/css" href="../_static/pygments.css?v=fa44fd50" />
<link rel="stylesheet" type="text/css" href="../_static/styles/sphinx-book-theme.css?v=eba8b062" />
<link rel="stylesheet" type="text/css" href="../_static/togglebutton.css?v=13237357" />
<link rel="stylesheet" type="text/css" href="../_static/copybutton.css?v=76b2166b" />
<link rel="stylesheet" type="text/css" href="../_static/mystnb.4510f1fc1dee50b3e5859aac5469c37c29e427902b24a333a5f9fcb2f0b3ac41.css?v=be8a1c11" />
<link rel="stylesheet" type="text/css" href="../_static/sphinx-thebe.css?v=4fa983c6" />
<link rel="stylesheet" type="text/css" href="../_static/sphinx-design.min.css?v=95c83b7e" />
<!-- Pre-loaded scripts that we'll load fully later -->
<link rel="preload" as="script" href="../_static/scripts/bootstrap.js?digest=dfe6caa3a7d634c4db9b" />
<link rel="preload" as="script" href="../_static/scripts/pydata-sphinx-theme.js?digest=dfe6caa3a7d634c4db9b" />
<script src="../_static/vendor/fontawesome/6.5.2/js/all.min.js?digest=dfe6caa3a7d634c4db9b"></script>
<script src="../_static/documentation_options.js?v=9eb32ce0"></script>
<script src="../_static/doctools.js?v=9a2dae69"></script>
<script src="../_static/sphinx_highlight.js?v=dc90522c"></script>
<script src="../_static/clipboard.min.js?v=a7894cd8"></script>
<script src="../_static/copybutton.js?v=f281be69"></script>
<script src="../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
<script>let toggleHintShow = 'Click to show';</script>
<script>let toggleHintHide = 'Click to hide';</script>
<script>let toggleOpenOnPrint = 'true';</script>
<script src="../_static/togglebutton.js?v=4a39c7ea"></script>
<script>var togglebuttonSelector = '.toggle, .admonition.dropdown';</script>
<script src="../_static/design-tabs.js?v=f930bc37"></script>
<script>const THEBE_JS_URL = "https://unpkg.com/thebe@0.8.2/lib/index.js"; const thebe_selector = ".thebe,.cell"; const thebe_selector_input = "pre"; const thebe_selector_output = ".output, .cell_output"</script>
<script async="async" src="../_static/sphinx-thebe.js?v=c100c467"></script>
<script>var togglebuttonSelector = '.toggle, .admonition.dropdown';</script>
<script>const THEBE_JS_URL = "https://unpkg.com/thebe@0.8.2/lib/index.js"; const thebe_selector = ".thebe,.cell"; const thebe_selector_input = "pre"; const thebe_selector_output = ".output, .cell_output"</script>
<script>DOCUMENTATION_OPTIONS.pagename = 'developer/allocation_parallel';</script>
<link rel="index" title="Index" href="../genindex.html" />
<link rel="search" title="Search" href="../search.html" />
<link rel="next" title="Contribution Guide" href="../contrib.html" />
<link rel="prev" title="Algorithm, Interface &amp; Backends" href="algo_interface.html" />
<meta name="viewport" content="width=device-width, initial-scale=1"/>
<meta name="docsearch:language" content="en"/>
</head>
<body data-bs-spy="scroll" data-bs-target=".bd-toc-nav" data-offset="180" data-bs-root-margin="0px 0px -60%" data-default-mode="">
<div id="pst-skip-link" class="skip-link d-print-none"><a href="#main-content">Skip to main content</a></div>
<div id="pst-scroll-pixel-helper"></div>
<button type="button" class="btn rounded-pill" id="pst-back-to-top">
<i class="fa-solid fa-arrow-up"></i>Back to top</button>
<input type="checkbox"
class="sidebar-toggle"
id="pst-primary-sidebar-checkbox"/>
<label class="overlay overlay-primary" for="pst-primary-sidebar-checkbox"></label>
<input type="checkbox"
class="sidebar-toggle"
id="pst-secondary-sidebar-checkbox"/>
<label class="overlay overlay-secondary" for="pst-secondary-sidebar-checkbox"></label>
<div class="search-button__wrapper">
<div class="search-button__overlay"></div>
<div class="search-button__search-container">
<form class="bd-search d-flex align-items-center"
action="../search.html"
method="get">
<i class="fa-solid fa-magnifying-glass"></i>
<input type="search"
class="form-control"
name="q"
id="search-input"
placeholder="Search this book..."
aria-label="Search this book..."
autocomplete="off"
autocorrect="off"
autocapitalize="off"
spellcheck="false"/>
<span class="search-button__kbd-shortcut"><kbd class="kbd-shortcut__modifier">Ctrl</kbd>+<kbd>K</kbd></span>
</form></div>
</div>
<div class="pst-async-banner-revealer d-none">
<aside id="bd-header-version-warning" class="d-none d-print-none" aria-label="Version warning"></aside>
</div>
<header class="bd-header navbar navbar-expand-lg bd-navbar d-print-none">
</header>
<div class="bd-container">
<div class="bd-container__inner bd-page-width">
<div class="bd-sidebar-primary bd-sidebar">
<div class="sidebar-header-items sidebar-primary__section">
</div>
<div class="sidebar-primary-items__start sidebar-primary__section">
<div class="sidebar-primary-item">
<a class="navbar-brand logo" href="../intro.html">
<img src="../_static/logo.png" class="logo__image only-light" alt="AReaL Documentation - Home"/>
<script>document.write(`<img src="../_static/logo.png" class="logo__image only-dark" alt="AReaL Documentation - Home"/>`);</script>
</a></div>
<div class="sidebar-primary-item">
<script>
document.write(`
<button class="btn search-button-field search-button__button" title="Search" aria-label="Search" data-bs-placement="bottom" data-bs-toggle="tooltip">
<i class="fa-solid fa-magnifying-glass"></i>
<span class="search-button__default-text">Search</span>
<span class="search-button__kbd-shortcut"><kbd class="kbd-shortcut__modifier">Ctrl</kbd>+<kbd class="kbd-shortcut__modifier">K</kbd></span>
</button>
`);
</script></div>
<div class="sidebar-primary-item"><nav class="bd-links bd-docs-nav" aria-label="Main">
<div class="bd-toc-item navbar-nav active">
<ul class="nav bd-sidenav bd-sidenav__home-link">
<li class="toctree-l1">
<a class="reference internal" href="../intro.html">
Overview
</a>
</li>
</ul>
<p aria-level="2" class="caption" role="heading"><span class="caption-text">Tutorial</span></p>
<ul class="nav bd-sidenav">
<li class="toctree-l1"><a class="reference internal" href="../installation.html">Installation</a></li>
<li class="toctree-l1"><a class="reference internal" href="../training.html">Training</a></li>
<li class="toctree-l1"><a class="reference internal" href="../eval.html">Evaluation</a></li>
<li class="toctree-l1"><a class="reference internal" href="../troubleshooting.html">Troubleshooting</a></li>
</ul>
<p aria-level="2" class="caption" role="heading"><span class="caption-text">Developer Manual</span></p>
<ul class="current nav bd-sidenav">
<li class="toctree-l1"><a class="reference internal" href="exp_launch.html">Launching Procedure</a></li>
<li class="toctree-l1"><a class="reference internal" href="master_worker.html">Master Worker</a></li>
<li class="toctree-l1"><a class="reference internal" href="model_worker.html">Model Worker</a></li>
<li class="toctree-l1"><a class="reference internal" href="algo_interface.html">Algorithm, Interface &amp; Backends</a></li>
<li class="toctree-l1 current active"><a class="current reference internal" href="#">Allocation &amp; Parallelism</a></li>
</ul>
<p aria-level="2" class="caption" role="heading"><span class="caption-text">Contributing</span></p>
<ul class="nav bd-sidenav">
<li class="toctree-l1"><a class="reference internal" href="../contrib.html">Contribution Guide</a></li>
</ul>
</div>
</nav></div>
</div>
<div class="sidebar-primary-items__end sidebar-primary__section">
</div>
<div id="rtd-footer-container"></div>
</div>
<main id="main-content" class="bd-main" role="main">
<div class="sbt-scroll-pixel-helper"></div>
<div class="bd-content">
<div class="bd-article-container">
<div class="bd-header-article d-print-none">
<div class="header-article-items header-article__inner">
<div class="header-article-items__start">
<div class="header-article-item"><button class="sidebar-toggle primary-toggle btn btn-sm" title="Toggle primary sidebar" data-bs-placement="bottom" data-bs-toggle="tooltip">
<span class="fa-solid fa-bars"></span>
</button></div>
</div>
<div class="header-article-items__end">
<div class="header-article-item">
<div class="article-header-buttons">
<div class="dropdown dropdown-source-buttons">
<button class="btn dropdown-toggle" type="button" data-bs-toggle="dropdown" aria-expanded="false" aria-label="Source repositories">
<i class="fab fa-github"></i>
</button>
<ul class="dropdown-menu">
<li><a href="https://github.com/inclusionAI/AReaL" target="_blank"
class="btn btn-sm btn-source-repository-button dropdown-item"
title="Source repository"
data-bs-placement="left" data-bs-toggle="tooltip"
>
<span class="btn__icon-container">
<i class="fab fa-github"></i>
</span>
<span class="btn__text-container">Repository</span>
</a>
</li>
<li><a href="https://github.com/inclusionAI/AReaL/issues/new?title=Issue%20on%20page%20%2Fdeveloper/allocation_parallel.html&body=Your%20issue%20content%20here." target="_blank"
class="btn btn-sm btn-source-issues-button dropdown-item"
title="Open an issue"
data-bs-placement="left" data-bs-toggle="tooltip"
>
<span class="btn__icon-container">
<i class="fas fa-lightbulb"></i>
</span>
<span class="btn__text-container">Open issue</span>
</a>
</li>
</ul>
</div>
<div class="dropdown dropdown-download-buttons">
<button class="btn dropdown-toggle" type="button" data-bs-toggle="dropdown" aria-expanded="false" aria-label="Download this page">
<i class="fas fa-download"></i>
</button>
<ul class="dropdown-menu">
<li><a href="../_sources/developer/allocation_parallel.md" target="_blank"
class="btn btn-sm btn-download-source-button dropdown-item"
title="Download source file"
data-bs-placement="left" data-bs-toggle="tooltip"
>
<span class="btn__icon-container">
<i class="fas fa-file"></i>
</span>
<span class="btn__text-container">.md</span>
</a>
</li>
<li>
<button onclick="window.print()"
class="btn btn-sm btn-download-pdf-button dropdown-item"
title="Print to PDF"
data-bs-placement="left" data-bs-toggle="tooltip"
>
<span class="btn__icon-container">
<i class="fas fa-file-pdf"></i>
</span>
<span class="btn__text-container">.pdf</span>
</button>
</li>
</ul>
</div>
<button onclick="toggleFullScreen()"
class="btn btn-sm btn-fullscreen-button"
title="Fullscreen mode"
data-bs-placement="bottom" data-bs-toggle="tooltip"
>
<span class="btn__icon-container">
<i class="fas fa-expand"></i>
</span>
</button>
<script>
document.write(`
<button class="btn btn-sm nav-link pst-navbar-icon theme-switch-button" title="light/dark" aria-label="light/dark" data-bs-placement="bottom" data-bs-toggle="tooltip">
<i class="theme-switch fa-solid fa-sun fa-lg" data-mode="light"></i>
<i class="theme-switch fa-solid fa-moon fa-lg" data-mode="dark"></i>
<i class="theme-switch fa-solid fa-circle-half-stroke fa-lg" data-mode="auto"></i>
</button>
`);
</script>
<script>
document.write(`
<button class="btn btn-sm pst-navbar-icon search-button search-button__button" title="Search" aria-label="Search" data-bs-placement="bottom" data-bs-toggle="tooltip">
<i class="fa-solid fa-magnifying-glass fa-lg"></i>
</button>
`);
</script>
<button class="sidebar-toggle secondary-toggle btn btn-sm" title="Toggle secondary sidebar" data-bs-placement="bottom" data-bs-toggle="tooltip">
<span class="fa-solid fa-list"></span>
</button>
</div></div>
</div>
</div>
</div>
<div id="jb-print-docs-body" class="onlyprint">
<h1>Allocation & Parallelism</h1>
<!-- Table of contents -->
<div id="print-main-content">
<div id="jb-print-toc">
<div>
<h2> Contents </h2>
</div>
<nav aria-label="Page">
<ul class="visible nav section-nav flex-column">
<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#gpu-allocation">GPU Allocation</a></li>
<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#parallelism-strategies">Parallelism Strategies</a><ul class="nav section-nav flex-column">
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#training">Training</a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#inference">Inference</a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#parameter-partitioning">Parameter Partitioning</a></li>
</ul>
</li>
<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#torch-nccl-communication-groups">Torch NCCL Communication Groups</a></li>
<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#parallelism-ranks">Parallelism Ranks</a></li>
</ul>
</nav>
</div>
</div>
</div>
<div id="searchbox"></div>
<article class="bd-article">
<section class="tex2jax_ignore mathjax_ignore" id="allocation-parallelism">
<h1>Allocation &amp; Parallelism<a class="headerlink" href="#allocation-parallelism" title="Link to this heading">#</a></h1>
<section id="gpu-allocation">
<h2>GPU Allocation<a class="headerlink" href="#gpu-allocation" title="Link to this heading">#</a></h2>
<p>GPU allocation is controlled by the <code class="docutils literal notranslate"><span class="pre">allocation_mode</span></code> CLI parameter. The most common pattern looks like <code class="docutils literal notranslate"><span class="pre">&quot;sglang.d2t2p1+d1t4p1&quot;</span></code>, which means:</p>
<ul class="simple">
<li><p>The first 4 GPUs are allocated to SGLang for inference with:</p>
<ul>
<li><p>2-way tensor parallelism</p></li>
<li><p>2-way data parallelism</p></li>
</ul>
</li>
<li><p>The remaining GPUs are allocated for training with 4-way tensor parallelism</p></li>
</ul>
</section>
<section id="parallelism-strategies">
<h2>Parallelism Strategies<a class="headerlink" href="#parallelism-strategies" title="Link to this heading">#</a></h2>
<section id="training">
<h3>Training<a class="headerlink" href="#training" title="Link to this heading">#</a></h3>
<p>AReaL supports three parallelism strategies for dense models, similar to Megatron:</p>
<ul class="simple">
<li><p>Data Parallelism: Uses Megatrons DistributedDataParallel with AReaLs balanced DP partitioning algorithm (<code class="docutils literal notranslate"><span class="pre">SequenceSample.split</span></code>)</p></li>
<li><p>Tensor Parallelism: Fully replicates Megatrons <code class="docutils literal notranslate"><span class="pre">ColumnParallelLinear</span></code> and <code class="docutils literal notranslate"><span class="pre">RowParallelLinear</span></code></p></li>
<li><p>Pipeline Parallelism: Developed in-house with 1F1B scheduling (planned to be replaced with an open-source implementation due to maintenance challenges)</p></li>
</ul>
</section>
<section id="inference">
<h3>Inference<a class="headerlink" href="#inference" title="Link to this heading">#</a></h3>
<p>AReaL supports SGLang inference with intra-node tensor parallelism and customized data parallelism.</p>
</section>
<section id="parameter-partitioning">
<h3>Parameter Partitioning<a class="headerlink" href="#parameter-partitioning" title="Link to this heading">#</a></h3>
<p>Each model worker holds multiple model shards based on the allocation configuration.</p>
<p>Example: With 4 GPUs configured as:</p>
<ul class="simple">
<li><p>Actor model: First half GPUs with tensor parallelism</p></li>
<li><p>Critic model: Second half GPUs with pipeline parallelism</p></li>
<li><p>Reference model: All GPUs with tensor and pipeline parallelism</p></li>
</ul>
<p>The parameter distribution would be:</p>
<p><img alt="" src="../_images/param_shard.png" /></p>
</section>
</section>
<section id="torch-nccl-communication-groups">
<h2>Torch NCCL Communication Groups<a class="headerlink" href="#torch-nccl-communication-groups" title="Link to this heading">#</a></h2>
<p>During experiments, the following NCCL communication groups are created:</p>
<ol class="arabic simple">
<li><p>Global group: Includes all experiment GPUs (created in <code class="docutils literal notranslate"><span class="pre">global_comm.py</span></code>)</p></li>
<li><p>Parallelism Group: 3D parallel communication groups for a specific model (may match global group or be a subset, created in <code class="docutils literal notranslate"><span class="pre">topology.py</span></code>)</p></li>
<li><p>Data transfer groups: Groups between all data-parallel processes of any two models for data transfer (created in <code class="docutils literal notranslate"><span class="pre">data_manager.py</span></code>)</p></li>
</ol>
</section>
<section id="parallelism-ranks">
<h2>Parallelism Ranks<a class="headerlink" href="#parallelism-ranks" title="Link to this heading">#</a></h2>
<p>Each model worker has a unique GPU index, but may have different parallel strategy coordinates under different model names (actor, critic, etc.).</p>
<p>Example: GPU 2 might have:</p>
<ul class="simple">
<li><p>TP rank 1 for actor model</p></li>
<li><p>TP rank 0 for reference model</p></li>
</ul>
<p>Parallel strategy coordinates are maintained in <code class="docutils literal notranslate"><span class="pre">realhf.base.constants</span></code> and accessed via:</p>
<div class="highlight-bash notranslate"><div class="highlight"><pre><span></span>with<span class="w"> </span>constants.model_scope<span class="o">(</span>ModelName<span class="o">(</span><span class="s2">&quot;actor&quot;</span>,<span class="w"> </span><span class="m">0</span><span class="o">))</span>:
<span class="w"> </span><span class="nv">dp_rank1</span><span class="w"> </span><span class="o">=</span><span class="w"> </span>constants.data_parallel_rank<span class="o">()</span>
with<span class="w"> </span>constants.model_scope<span class="o">(</span>ModelName<span class="o">(</span><span class="s2">&quot;ref&quot;</span>,<span class="w"> </span><span class="m">0</span><span class="o">))</span>:
<span class="w"> </span><span class="nv">dp_rank2</span><span class="w"> </span><span class="o">=</span><span class="w"> </span>constants.data_parallel_rank<span class="o">()</span>
</pre></div>
</div>
<p>Note: Interface and backend methods are automatically called within a model scope, so the context manager can be omitted in those implementations.</p>
</section>
</section>
<script type="text/x-thebe-config">
{
requestKernel: true,
binderOptions: {
repo: "binder-examples/jupyter-stacks-datascience",
ref: "master",
},
codeMirrorConfig: {
theme: "abcdef",
mode: "python"
},
kernelOptions: {
name: "python3",
path: "./developer"
},
predefinedOutput: true
}
</script>
<script>kernelName = 'python3'</script>
</article>
<footer class="prev-next-footer d-print-none">
<div class="prev-next-area">
<a class="left-prev"
href="algo_interface.html"
title="previous page">
<i class="fa-solid fa-angle-left"></i>
<div class="prev-next-info">
<p class="prev-next-subtitle">previous</p>
<p class="prev-next-title">Algorithm, Interface &amp; Backends</p>
</div>
</a>
<a class="right-next"
href="../contrib.html"
title="next page">
<div class="prev-next-info">
<p class="prev-next-subtitle">next</p>
<p class="prev-next-title">Contribution Guide</p>
</div>
<i class="fa-solid fa-angle-right"></i>
</a>
</div>
</footer>
</div>
<div class="bd-sidebar-secondary bd-toc"><div class="sidebar-secondary-items sidebar-secondary__inner">
<div class="sidebar-secondary-item">
<div class="page-toc tocsection onthispage">
<i class="fa-solid fa-list"></i> Contents
</div>
<nav class="bd-toc-nav page-toc">
<ul class="visible nav section-nav flex-column">
<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#gpu-allocation">GPU Allocation</a></li>
<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#parallelism-strategies">Parallelism Strategies</a><ul class="nav section-nav flex-column">
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#training">Training</a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#inference">Inference</a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#parameter-partitioning">Parameter Partitioning</a></li>
</ul>
</li>
<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#torch-nccl-communication-groups">Torch NCCL Communication Groups</a></li>
<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#parallelism-ranks">Parallelism Ranks</a></li>
</ul>
</nav></div>
</div></div>
</div>
<footer class="bd-footer-content">
<div class="bd-footer-content__inner container">
<div class="footer-item">
<p class="component-author">
By Wei Fu
</p>
</div>
<div class="footer-item">
<p class="copyright">
© Copyright 2023.
<br/>
</p>
</div>
<div class="footer-item">
</div>
<div class="footer-item">
</div>
</div>
</footer>
</main>
</div>
</div>
<!-- Scripts loaded after <body> so the DOM is not blocked -->
<script src="../_static/scripts/bootstrap.js?digest=dfe6caa3a7d634c4db9b"></script>
<script src="../_static/scripts/pydata-sphinx-theme.js?digest=dfe6caa3a7d634c4db9b"></script>
<footer class="bd-footer">
</footer>
</body>
</html>