fix(train): unsupported gloo device on win

2026-06-05 01:10:22 +08:00 · 2026-04-18 17:30:48 +08:00
parent cc50ede4fb
commit 3affc9415d
1 changed file with 102 additions and 39 deletions
--- a/infer/modules/train/train.py
+++ b/infer/modules/train/train.py
@@ -106,14 +106,19 @@ def main():
        # patch to unblock people without gpus. there is probably a better way.
        print("NO GPU DETECTED: falling back to CPU - this may take a while")
        n_gpus = 1
    os.environ["MASTER_ADDR"] = "localhost"
    os.environ["MASTER_PORT"] = str(randint(20000, 55555))
    children = []
    logger = utils.get_logger(hps.model_dir)
    if n_gpus == 1:
        # Single GPU: run directly without distributed to avoid gloo issues on Windows
        run(0, 1, hps, logger)
    else:
        master_port = str(randint(20000, 55555))
        os.environ["MASTER_ADDR"] = "127.0.0.1"
        os.environ["MASTER_PORT"] = master_port
        children = []
        for i in range(n_gpus):
            subproc = mp.Process(
                target=run,
-            args=(i, n_gpus, hps, logger),
+                args=(i, n_gpus, hps, logger, master_port),
            )
            children.append(subproc)
            subproc.start()
@@ -122,7 +127,7 @@ def main():
            children[i].join()
-def run(rank, n_gpus, hps: utils.HParams, logger: logging.Logger):
+def run(rank, n_gpus, hps: utils.HParams, logger: logging.Logger, master_port: str = "29500"):
    global global_step
    if rank == 0:
        # logger = utils.get_logger(hps.model_dir)
@@ -131,21 +136,78 @@ def run(rank, n_gpus, hps: utils.HParams, logger: logging.Logger):
        writer = SummaryWriter(log_dir=hps.model_dir)
        writer_eval = SummaryWriter(log_dir=os.path.join(hps.model_dir, "eval"))
    use_distributed = n_gpus > 1
    if use_distributed:
        if os.name == "nt" or not torch.cuda.is_available():
            # gloo's create_device(hostname=...) is gated to Linux only in the
            # C++ layer (makeDeviceForHostname), so on Windows we must use the
            # interface-based path instead: create_device(interface=...) calls
            # makeDeviceForInterface, which is not platform-gated. Note that this
            # branch is also taken on any host where CUDA is unavailable.
            import socket as _socket
            try:
                store = dist.TCPStore(
                    host_name="127.0.0.1",
                    port=int(master_port),
                    world_size=n_gpus,
                    is_master=(rank == 0),
                )
            except Exception:
                store = dist.TCPStore(
                    host_name="127.0.0.1",
                    port=int(master_port),
                    world_size=n_gpus,
                    is_master=(rank == 0),
                    use_libuv=False,
                )
            # Discover a working network interface for gloo device creation
            gloo_device = None
            try:
                for idx, ifname in _socket.if_nameindex():
                    try:
                        gloo_device = dist.ProcessGroupGloo.create_device(
                            interface=ifname
                        )
                        print("Try device", idx, "name", ifname)
                        break
                    except RuntimeError as e:
                        print("Try device", idx, "name", ifname, "err:", e)
                        continue
            except (OSError, AttributeError) as e:
                print(e.with_traceback(None))
            if gloo_device is None:
                raise RuntimeError(
                    "Cannot create gloo device on Windows. "
                    "No usable network interface found. "
                    "Try adding your hostname to "
                    "C:\\Windows\\System32\\drivers\\etc\\hosts "
                    "with: 127.0.0.1  " + _socket.gethostname()
                )
            pg_options = dist.ProcessGroupGloo._Options()
            pg_options._devices = [gloo_device]
            dist.init_process_group(
                backend="gloo",
                store=store,
                world_size=n_gpus,
                rank=rank,
                pg_options=pg_options,
            )
        else:
            init_url = f"tcp://127.0.0.1:{master_port}"
            try:
                dist.init_process_group(
-            backend=(
+                    backend="nccl",
-                "gloo" if os.name == "nt" or not torch.cuda.is_available() else "nccl"
+                    init_method=init_url,
            ),
            init_method="env://",
                    world_size=n_gpus,
                    rank=rank,
                )
            except:
                dist.init_process_group(
-            backend=(
+                    backend="nccl",
-                "gloo" if os.name == "nt" or not torch.cuda.is_available() else "nccl"
+                    init_method=init_url + "?use_libuv=False",
            ),
            init_method="env://?use_libuv=False",
                    world_size=n_gpus,
                    rank=rank,
                )
@@ -221,6 +283,7 @@ def run(rank, n_gpus, hps: utils.HParams, logger: logging.Logger):
    )
    # net_g = DDP(net_g, device_ids=[rank], find_unused_parameters=True)
    # net_d = DDP(net_d, device_ids=[rank], find_unused_parameters=True)
    if use_distributed:
        if hasattr(torch, "xpu") and torch.xpu.is_available():
            pass
        elif torch.cuda.is_available():