1
0
mirror of https://github.com/fumiama/Retrieval-based-Voice-Conversion-WebUI.git synced 2026-06-05 01:10:22 +08:00

fix(train): unsupported gloo device on win

This commit is contained in:
源文雨
2026-04-18 17:30:48 +08:00
parent cc50ede4fb
commit 3affc9415d

View File

@@ -106,14 +106,19 @@ def main():
# patch to unblock people without gpus. there is probably a better way. # patch to unblock people without gpus. there is probably a better way.
print("NO GPU DETECTED: falling back to CPU - this may take a while") print("NO GPU DETECTED: falling back to CPU - this may take a while")
n_gpus = 1 n_gpus = 1
os.environ["MASTER_ADDR"] = "localhost"
os.environ["MASTER_PORT"] = str(randint(20000, 55555))
children = []
logger = utils.get_logger(hps.model_dir) logger = utils.get_logger(hps.model_dir)
if n_gpus == 1:
# Single GPU: run directly without distributed to avoid gloo issues on Windows
run(0, 1, hps, logger)
else:
master_port = str(randint(20000, 55555))
os.environ["MASTER_ADDR"] = "127.0.0.1"
os.environ["MASTER_PORT"] = master_port
children = []
for i in range(n_gpus): for i in range(n_gpus):
subproc = mp.Process( subproc = mp.Process(
target=run, target=run,
args=(i, n_gpus, hps, logger), args=(i, n_gpus, hps, logger, master_port),
) )
children.append(subproc) children.append(subproc)
subproc.start() subproc.start()
@@ -122,7 +127,7 @@ def main():
children[i].join() children[i].join()
def run(rank, n_gpus, hps: utils.HParams, logger: logging.Logger): def run(rank, n_gpus, hps: utils.HParams, logger: logging.Logger, master_port: str = "29500"):
global global_step global global_step
if rank == 0: if rank == 0:
# logger = utils.get_logger(hps.model_dir) # logger = utils.get_logger(hps.model_dir)
@@ -131,21 +136,78 @@ def run(rank, n_gpus, hps: utils.HParams, logger: logging.Logger):
writer = SummaryWriter(log_dir=hps.model_dir) writer = SummaryWriter(log_dir=hps.model_dir)
writer_eval = SummaryWriter(log_dir=os.path.join(hps.model_dir, "eval")) writer_eval = SummaryWriter(log_dir=os.path.join(hps.model_dir, "eval"))
use_distributed = n_gpus > 1
if use_distributed:
if os.name == "nt" or not torch.cuda.is_available():
# On Windows, gloo's create_device(hostname=...) is gated to Linux only
# in the C++ layer (makeDeviceForHostname). We must use the interface-
# based path instead: create_device(interface=...) calls
# makeDeviceForInterface which is not platform-gated.
import socket as _socket
try:
store = dist.TCPStore(
host_name="127.0.0.1",
port=int(master_port),
world_size=n_gpus,
is_master=(rank == 0),
)
except Exception:
store = dist.TCPStore(
host_name="127.0.0.1",
port=int(master_port),
world_size=n_gpus,
is_master=(rank == 0),
use_libuv=False,
)
# Discover a working network interface for gloo device creation
gloo_device = None
try:
for idx, ifname in _socket.if_nameindex():
try:
gloo_device = dist.ProcessGroupGloo.create_device(
interface=ifname
)
print("Try device", idx, "name", ifname)
break
except RuntimeError as e:
print("Try device", idx, "name", ifname, "err:", e)
continue
except (OSError, AttributeError) as e:
print(e.with_traceback(None))
if gloo_device is None:
raise RuntimeError(
"Cannot create gloo device on Windows. "
"No usable network interface found. "
"Try adding your hostname to "
"C:\\Windows\\System32\\drivers\\etc\\hosts "
"with: 127.0.0.1 " + _socket.gethostname()
)
pg_options = dist.ProcessGroupGloo._Options()
pg_options._devices = [gloo_device]
dist.init_process_group(
backend="gloo",
store=store,
world_size=n_gpus,
rank=rank,
pg_options=pg_options,
)
else:
init_url = f"tcp://127.0.0.1:{master_port}"
try: try:
dist.init_process_group( dist.init_process_group(
backend=( backend="nccl",
"gloo" if os.name == "nt" or not torch.cuda.is_available() else "nccl" init_method=init_url,
),
init_method="env://",
world_size=n_gpus, world_size=n_gpus,
rank=rank, rank=rank,
) )
except: except:
dist.init_process_group( dist.init_process_group(
backend=( backend="nccl",
"gloo" if os.name == "nt" or not torch.cuda.is_available() else "nccl" init_method=init_url + "?use_libuv=False",
),
init_method="env://?use_libuv=False",
world_size=n_gpus, world_size=n_gpus,
rank=rank, rank=rank,
) )
@@ -221,6 +283,7 @@ def run(rank, n_gpus, hps: utils.HParams, logger: logging.Logger):
) )
# net_g = DDP(net_g, device_ids=[rank], find_unused_parameters=True) # net_g = DDP(net_g, device_ids=[rank], find_unused_parameters=True)
# net_d = DDP(net_d, device_ids=[rank], find_unused_parameters=True) # net_d = DDP(net_d, device_ids=[rank], find_unused_parameters=True)
if use_distributed:
if hasattr(torch, "xpu") and torch.xpu.is_available(): if hasattr(torch, "xpu") and torch.xpu.is_available():
pass pass
elif torch.cuda.is_available(): elif torch.cuda.is_available():