# AReaL/arealite/launcher/sglang_server.py

import os
import subprocess
import sys
import time
from pathlib import Path
from typing import Optional

import ray
import requests

from arealite.api.cli_args import (
    NameResolveConfig,
    SGLangConfig,
    parse_cli_args,
    to_structured_cfg,
)
from arealite.api.io_struct import AllocationMode, AllocationType
from arealite.utils.network import find_free_ports, gethostip
from realhf.base import logging, name_resolve, names, pkg_version

# Module-level logger shared by the helper functions and the wrapper class in this file.
logger = logging.getLogger("SGLangServer Wrapper")


def execute_shell_command(command: str) -> subprocess.Popen:
    """
    Spawn ``command`` as a child process and return its ``Popen`` handle.

    The command string is tokenised by plain whitespace splitting (no shell
    quoting rules are applied). Every backslash, including the one in a
    backslash-newline line continuation, is treated as a separator. The child
    writes its stdout to this process's ``sys.stdout`` and its stderr is
    merged into the same stream.
    """
    # Turning each backslash into a space and then splitting on whitespace
    # also consumes the newline of a "\<newline>" continuation, because the
    # newline is itself whitespace.
    argv = command.replace("\\", " ").split()
    return subprocess.Popen(
        argv,
        stdout=sys.stdout,
        stderr=subprocess.STDOUT,
        text=True,
    )


def apply_sglang_patch():
    """Apply the version-matched SGLang patch to an editable SGLang checkout.

    The patch file is resolved as
    ``<dir of this file>/../../patch/sglang/v<sglang version>.patch``, where
    the version comes from ``pkg_version.get_version('sglang')``.

    The target directory is the parent of the "Editable project location"
    reported by ``python3 -m pip show sglang``. If pip reports no such
    location, nothing is patched and the function returns silently.

    The patch is applied on a best-effort basis: a failing ``git apply`` is
    logged as a warning rather than raised.

    Raises:
        subprocess.CalledProcessError: If ``pip show sglang`` exits non-zero.
    """
    here = Path(os.path.dirname(__file__))
    patch_path = str(
        here.parent.parent
        / "patch"
        / "sglang"
        / f"v{pkg_version.get_version('sglang')}.patch"
    )

    # Package metadata (author names, summaries, ...) may contain non-ASCII
    # characters, so decode leniently instead of assuming pure ASCII.
    pip_output = subprocess.check_output(
        ["python3", "-m", "pip", "show", "sglang"]
    ).decode("utf-8", errors="replace")

    prefix = "Editable project location: "
    target_path = ""
    for raw_line in pip_output.splitlines():
        line = raw_line.strip()
        if line.startswith(prefix):
            # Slice off the prefix instead of splitting on ": " so that a
            # path containing ": " is not truncated.
            target_path = str(Path(line[len(prefix):]).parent)
            break

    if not target_path:
        return

    result = subprocess.run(
        ["git", "apply", patch_path],
        cwd=target_path,
        stderr=sys.stdout,
        stdout=sys.stdout,
    )
    if result.returncode == 0:
        logger.info(f"Applied SGLang patch at {target_path}")
    else:
        # Do not raise: keep the original best-effort behavior, but stop
        # claiming success when git rejected the patch.
        logger.warning(
            f"Failed to apply SGLang patch {patch_path} at {target_path} "
            f"(git apply exited with code {result.returncode})"
        )


def launch_server_cmd(command: str):
    """
    Start the server described by ``command`` and return its process handle.

    If Ray has not been initialized in this process, ``apply_sglang_patch``
    is called before the command is executed. The command itself is run
    through ``execute_shell_command``.
    """
    ray_is_up = ray.is_initialized()
    if not ray_is_up:
        apply_sglang_patch()
    return execute_shell_command(command)


def wait_for_server(base_url: str, timeout: Optional[int] = None) -> None:
    """Wait for the server to be ready by polling the /v1/models endpoint.

    The endpoint is polled roughly once per second until it answers with
    HTTP 200. Connection errors and non-200 responses are both treated as
    "not ready yet".

    Args:
        base_url: The base URL of the server
        timeout: Maximum time to wait in seconds. None (or 0) means wait
            forever.

    Raises:
        TimeoutError: If ``timeout`` is set and the server did not answer
            with HTTP 200 within that many seconds.
    """
    # Upper bound for a single HTTP request so that one hung connection
    # cannot block the polling loop indefinitely.
    request_timeout = 10
    models_url = f"{base_url}/v1/models"
    start_time = time.monotonic()

    while True:
        try:
            response = requests.get(
                models_url,
                headers={"Authorization": "Bearer None"},
                timeout=request_timeout,
            )
            if response.status_code == 200:
                # Short pause after the first successful response before
                # returning to the caller.
                time.sleep(5)
                return
        except requests.exceptions.RequestException:
            # Server not reachable yet (or the request timed out); retry.
            pass

        # The deadline check lives outside the try block so it is evaluated
        # after failed connections as well as after non-200 responses.
        if timeout and time.monotonic() - start_time > timeout:
            raise TimeoutError("Server did not become ready within timeout period")
        time.sleep(1)


class SGLangServerWrapper:
    """Launch one SGLang server subprocess and register its address.

    The wrapper picks free ports from a per-server slice of the port range,
    builds the launch command from the SGLang config, waits until the server
    answers HTTP requests, publishes ``host:port`` through ``name_resolve``,
    and then blocks until the server process exits.
    """

    def __init__(
        self,
        experiment_name: str,
        trial_name: str,
        sglang_config: SGLangConfig,
        tp_size: int,
        n_gpus_per_node: int,
    ):
        """Store the launch parameters; no process is started here.

        Args:
            experiment_name: Used with ``trial_name`` to build the
                name-resolve key under which the server address is published.
            trial_name: See ``experiment_name``.
            sglang_config: Config passed to ``SGLangConfig.build_cmd``.
            tp_size: Passed to ``SGLangConfig.build_cmd`` as the second
                positional argument.
            n_gpus_per_node: Number of GPUs per node, used to derive how many
                servers share a node and therefore how the port range is
                split.
        """
        self.experiment_name = experiment_name
        self.trial_name = trial_name
        self.config = sglang_config
        self.tp_size = tp_size
        self.server_process = None
        self.n_gpus_per_node = n_gpus_per_node

    def run(self):
        """Start the server, register it, and block until it exits.

        Raises:
            RuntimeError: If ``CUDA_VISIBLE_DEVICES`` is unset or empty.
        """
        visible_devices = os.getenv("CUDA_VISIBLE_DEVICES")
        if not visible_devices:
            raise RuntimeError(
                "CUDA_VISIBLE_DEVICES must be set to launch an SGLang server"
            )
        device_ids = visible_devices.split(",")
        gpus_per_server = len(device_ids)
        # Index of this server among the servers on the node, derived from
        # the first visible device id.
        server_local_idx = int(device_ids[0]) // gpus_per_server
        n_servers_per_node = max(1, self.n_gpus_per_node // gpus_per_server)

        # Split the 40000 ports starting at 10000 into one contiguous slice
        # per local server so that servers on the same node search disjoint
        # port ranges.
        ports_per_server = 40000 // n_servers_per_node
        port_range = (
            server_local_idx * ports_per_server + 10000,
            (server_local_idx + 1) * ports_per_server + 10000,
        )
        server_port, dist_init_port = find_free_ports(2, port_range)

        dist_init_addr = f"localhost:{dist_init_port}"
        host_ip = gethostip()

        # NOTE(review): the third positional argument (0) is presumably the
        # base GPU id expected by build_cmd -- confirm against its signature.
        cmd = SGLangConfig.build_cmd(
            self.config,
            self.tp_size,
            0,
            host_ip,
            server_port,
            dist_init_addr=dist_init_addr,
        )
        self.server_process = launch_server_cmd(cmd)
        wait_for_server(f"http://{host_ip}:{server_port}")

        # Publish the address only after the server responds.
        name = names.gen_servers(self.experiment_name, self.trial_name)
        name_resolve.add_subentry(name, f"{host_ip}:{server_port}")

        logger.info(f"SGLang server launched at: http://{host_ip}:{server_port}")
        return_code = self.server_process.wait()
        logger.info(
            f"SGLang server at http://{host_ip}:{server_port} exits, returncode={return_code}"
        )

    def __del__(self):
        """Terminate the server process if it is still running."""
        # getattr guards against __init__ having failed before the attribute
        # was assigned.
        server_process = getattr(self, "server_process", None)
        if server_process and server_process.poll() is None:
            logger.info("Terminating SGLang server process...")
            server_process.terminate()
            server_process.wait()
            logger.info("SGLang server process terminated.")


def main_sglang_server(argv):
    """Parse CLI arguments, configure name resolution, and run one server.

    Args:
        argv: Command-line arguments (without the program name) forwarded to
            ``parse_cli_args``.

    The allocation mode parsed from the config must be of type
    ``AllocationType.DECOUPLED_SGLANG``; this is checked with an ``assert``.
    The call blocks inside ``SGLangServerWrapper.run``.
    """
    config, _ = parse_cli_args(argv)

    # Replace the raw config sections with their structured counterparts.
    config.sglang = to_structured_cfg(config.sglang, SGLangConfig)
    config.cluster.name_resolve = to_structured_cfg(
        config.cluster.name_resolve, NameResolveConfig
    )
    name_resolve.reconfigure(config.cluster.name_resolve)

    parsed_mode = AllocationMode.from_str(config.allocation_mode)
    assert parsed_mode.type_ == AllocationType.DECOUPLED_SGLANG
    gen_tp_size = parsed_mode.gen_tp_size

    wrapper = SGLangServerWrapper(
        experiment_name=config.experiment_name,
        trial_name=config.trial_name,
        sglang_config=config.sglang,
        tp_size=gen_tp_size,
        n_gpus_per_node=config.n_gpus_per_node,
    )
    wrapper.run()


# Script entry point: forward the command-line arguments (without the program
# name) to main_sglang_server.
if __name__ == "__main__":
    main_sglang_server(sys.argv[1:])