1
0
mirror of https://github.com/fumiama/Retrieval-based-Voice-Conversion-WebUI.git synced 2026-06-05 01:10:22 +08:00

fix(train): unsupported gloo device on win

This commit is contained in:
源文雨
2026-04-18 17:30:48 +08:00
parent cc50ede4fb
commit 3affc9415d

View File

@@ -106,23 +106,28 @@ def main():
# patch to unblock people without gpus. there is probably a better way.
print("NO GPU DETECTED: falling back to CPU - this may take a while")
n_gpus = 1
os.environ["MASTER_ADDR"] = "localhost"
os.environ["MASTER_PORT"] = str(randint(20000, 55555))
children = []
logger = utils.get_logger(hps.model_dir)
for i in range(n_gpus):
subproc = mp.Process(
target=run,
args=(i, n_gpus, hps, logger),
)
children.append(subproc)
subproc.start()
if n_gpus == 1:
# Single GPU: run directly without distributed to avoid gloo issues on Windows
run(0, 1, hps, logger)
else:
master_port = str(randint(20000, 55555))
os.environ["MASTER_ADDR"] = "127.0.0.1"
os.environ["MASTER_PORT"] = master_port
children = []
for i in range(n_gpus):
subproc = mp.Process(
target=run,
args=(i, n_gpus, hps, logger, master_port),
)
children.append(subproc)
subproc.start()
for i in range(n_gpus):
children[i].join()
for i in range(n_gpus):
children[i].join()
def run(rank, n_gpus, hps: utils.HParams, logger: logging.Logger):
def run(rank, n_gpus, hps: utils.HParams, logger: logging.Logger, master_port: str = "29500"):
global global_step
if rank == 0:
# logger = utils.get_logger(hps.model_dir)
@@ -131,24 +136,81 @@ def run(rank, n_gpus, hps: utils.HParams, logger: logging.Logger):
writer = SummaryWriter(log_dir=hps.model_dir)
writer_eval = SummaryWriter(log_dir=os.path.join(hps.model_dir, "eval"))
try:
dist.init_process_group(
backend=(
"gloo" if os.name == "nt" or not torch.cuda.is_available() else "nccl"
),
init_method="env://",
world_size=n_gpus,
rank=rank,
)
except:
dist.init_process_group(
backend=(
"gloo" if os.name == "nt" or not torch.cuda.is_available() else "nccl"
),
init_method="env://?use_libuv=False",
world_size=n_gpus,
rank=rank,
)
use_distributed = n_gpus > 1
if use_distributed:
if os.name == "nt" or not torch.cuda.is_available():
# On Windows, gloo's create_device(hostname=...) is gated to Linux only
# in the C++ layer (makeDeviceForHostname). We must use the interface-
# based path instead: create_device(interface=...) calls
# makeDeviceForInterface which is not platform-gated.
import socket as _socket
try:
store = dist.TCPStore(
host_name="127.0.0.1",
port=int(master_port),
world_size=n_gpus,
is_master=(rank == 0),
)
except Exception:
store = dist.TCPStore(
host_name="127.0.0.1",
port=int(master_port),
world_size=n_gpus,
is_master=(rank == 0),
use_libuv=False,
)
# Discover a working network interface for gloo device creation
gloo_device = None
try:
for idx, ifname in _socket.if_nameindex():
try:
gloo_device = dist.ProcessGroupGloo.create_device(
interface=ifname
)
print("Try device", idx, "name", ifname)
break
except RuntimeError as e:
print("Try device", idx, "name", ifname, "err:", e)
continue
except (OSError, AttributeError) as e:
print(e.with_traceback(None))
if gloo_device is None:
raise RuntimeError(
"Cannot create gloo device on Windows. "
"No usable network interface found. "
"Try adding your hostname to "
"C:\\Windows\\System32\\drivers\\etc\\hosts "
"with: 127.0.0.1 " + _socket.gethostname()
)
pg_options = dist.ProcessGroupGloo._Options()
pg_options._devices = [gloo_device]
dist.init_process_group(
backend="gloo",
store=store,
world_size=n_gpus,
rank=rank,
pg_options=pg_options,
)
else:
init_url = f"tcp://127.0.0.1:{master_port}"
try:
dist.init_process_group(
backend="nccl",
init_method=init_url,
world_size=n_gpus,
rank=rank,
)
except:
dist.init_process_group(
backend="nccl",
init_method=init_url + "?use_libuv=False",
world_size=n_gpus,
rank=rank,
)
torch.manual_seed(hps.train.seed)
if torch.cuda.is_available():
torch.cuda.set_device(rank)
@@ -221,14 +283,15 @@ def run(rank, n_gpus, hps: utils.HParams, logger: logging.Logger):
)
# net_g = DDP(net_g, device_ids=[rank], find_unused_parameters=True)
# net_d = DDP(net_d, device_ids=[rank], find_unused_parameters=True)
if hasattr(torch, "xpu") and torch.xpu.is_available():
pass
elif torch.cuda.is_available():
net_g = DDP(net_g, device_ids=[rank])
net_d = DDP(net_d, device_ids=[rank])
else:
net_g = DDP(net_g)
net_d = DDP(net_d)
if use_distributed:
if hasattr(torch, "xpu") and torch.xpu.is_available():
pass
elif torch.cuda.is_available():
net_g = DDP(net_g, device_ids=[rank])
net_d = DDP(net_d, device_ids=[rank])
else:
net_g = DDP(net_g)
net_d = DDP(net_d)
try: # 如果能加载自动resume
_, _, _, epoch_str = utils.load_checkpoint(