mirror of
https://github.com/fumiama/Retrieval-based-Voice-Conversion-WebUI.git
synced 2026-06-05 01:10:22 +08:00
fix(train): unsupported gloo device on win
This commit is contained in:
@@ -106,14 +106,19 @@ def main():
|
|||||||
# patch to unblock people without gpus. there is probably a better way.
|
# patch to unblock people without gpus. there is probably a better way.
|
||||||
print("NO GPU DETECTED: falling back to CPU - this may take a while")
|
print("NO GPU DETECTED: falling back to CPU - this may take a while")
|
||||||
n_gpus = 1
|
n_gpus = 1
|
||||||
os.environ["MASTER_ADDR"] = "localhost"
|
|
||||||
os.environ["MASTER_PORT"] = str(randint(20000, 55555))
|
|
||||||
children = []
|
|
||||||
logger = utils.get_logger(hps.model_dir)
|
logger = utils.get_logger(hps.model_dir)
|
||||||
|
if n_gpus == 1:
|
||||||
|
# Single GPU: run directly without distributed to avoid gloo issues on Windows
|
||||||
|
run(0, 1, hps, logger)
|
||||||
|
else:
|
||||||
|
master_port = str(randint(20000, 55555))
|
||||||
|
os.environ["MASTER_ADDR"] = "127.0.0.1"
|
||||||
|
os.environ["MASTER_PORT"] = master_port
|
||||||
|
children = []
|
||||||
for i in range(n_gpus):
|
for i in range(n_gpus):
|
||||||
subproc = mp.Process(
|
subproc = mp.Process(
|
||||||
target=run,
|
target=run,
|
||||||
args=(i, n_gpus, hps, logger),
|
args=(i, n_gpus, hps, logger, master_port),
|
||||||
)
|
)
|
||||||
children.append(subproc)
|
children.append(subproc)
|
||||||
subproc.start()
|
subproc.start()
|
||||||
@@ -122,7 +127,7 @@ def main():
|
|||||||
children[i].join()
|
children[i].join()
|
||||||
|
|
||||||
|
|
||||||
def run(rank, n_gpus, hps: utils.HParams, logger: logging.Logger):
|
def run(rank, n_gpus, hps: utils.HParams, logger: logging.Logger, master_port: str = "29500"):
|
||||||
global global_step
|
global global_step
|
||||||
if rank == 0:
|
if rank == 0:
|
||||||
# logger = utils.get_logger(hps.model_dir)
|
# logger = utils.get_logger(hps.model_dir)
|
||||||
@@ -131,21 +136,78 @@ def run(rank, n_gpus, hps: utils.HParams, logger: logging.Logger):
|
|||||||
writer = SummaryWriter(log_dir=hps.model_dir)
|
writer = SummaryWriter(log_dir=hps.model_dir)
|
||||||
writer_eval = SummaryWriter(log_dir=os.path.join(hps.model_dir, "eval"))
|
writer_eval = SummaryWriter(log_dir=os.path.join(hps.model_dir, "eval"))
|
||||||
|
|
||||||
|
use_distributed = n_gpus > 1
|
||||||
|
if use_distributed:
|
||||||
|
if os.name == "nt" or not torch.cuda.is_available():
|
||||||
|
# On Windows, gloo's create_device(hostname=...) is gated to Linux only
|
||||||
|
# in the C++ layer (makeDeviceForHostname). We must use the interface-
|
||||||
|
# based path instead: create_device(interface=...) calls
|
||||||
|
# makeDeviceForInterface which is not platform-gated.
|
||||||
|
import socket as _socket
|
||||||
|
|
||||||
|
try:
|
||||||
|
store = dist.TCPStore(
|
||||||
|
host_name="127.0.0.1",
|
||||||
|
port=int(master_port),
|
||||||
|
world_size=n_gpus,
|
||||||
|
is_master=(rank == 0),
|
||||||
|
)
|
||||||
|
except Exception:
|
||||||
|
store = dist.TCPStore(
|
||||||
|
host_name="127.0.0.1",
|
||||||
|
port=int(master_port),
|
||||||
|
world_size=n_gpus,
|
||||||
|
is_master=(rank == 0),
|
||||||
|
use_libuv=False,
|
||||||
|
)
|
||||||
|
|
||||||
|
# Discover a working network interface for gloo device creation
|
||||||
|
gloo_device = None
|
||||||
|
try:
|
||||||
|
for idx, ifname in _socket.if_nameindex():
|
||||||
|
try:
|
||||||
|
gloo_device = dist.ProcessGroupGloo.create_device(
|
||||||
|
interface=ifname
|
||||||
|
)
|
||||||
|
print("Try device", idx, "name", ifname)
|
||||||
|
break
|
||||||
|
except RuntimeError as e:
|
||||||
|
print("Try device", idx, "name", ifname, "err:", e)
|
||||||
|
continue
|
||||||
|
except (OSError, AttributeError) as e:
|
||||||
|
print(e.with_traceback(None))
|
||||||
|
|
||||||
|
if gloo_device is None:
|
||||||
|
raise RuntimeError(
|
||||||
|
"Cannot create gloo device on Windows. "
|
||||||
|
"No usable network interface found. "
|
||||||
|
"Try adding your hostname to "
|
||||||
|
"C:\\Windows\\System32\\drivers\\etc\\hosts "
|
||||||
|
"with: 127.0.0.1 " + _socket.gethostname()
|
||||||
|
)
|
||||||
|
|
||||||
|
pg_options = dist.ProcessGroupGloo._Options()
|
||||||
|
pg_options._devices = [gloo_device]
|
||||||
|
dist.init_process_group(
|
||||||
|
backend="gloo",
|
||||||
|
store=store,
|
||||||
|
world_size=n_gpus,
|
||||||
|
rank=rank,
|
||||||
|
pg_options=pg_options,
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
init_url = f"tcp://127.0.0.1:{master_port}"
|
||||||
try:
|
try:
|
||||||
dist.init_process_group(
|
dist.init_process_group(
|
||||||
backend=(
|
backend="nccl",
|
||||||
"gloo" if os.name == "nt" or not torch.cuda.is_available() else "nccl"
|
init_method=init_url,
|
||||||
),
|
|
||||||
init_method="env://",
|
|
||||||
world_size=n_gpus,
|
world_size=n_gpus,
|
||||||
rank=rank,
|
rank=rank,
|
||||||
)
|
)
|
||||||
except:
|
except:
|
||||||
dist.init_process_group(
|
dist.init_process_group(
|
||||||
backend=(
|
backend="nccl",
|
||||||
"gloo" if os.name == "nt" or not torch.cuda.is_available() else "nccl"
|
init_method=init_url + "?use_libuv=False",
|
||||||
),
|
|
||||||
init_method="env://?use_libuv=False",
|
|
||||||
world_size=n_gpus,
|
world_size=n_gpus,
|
||||||
rank=rank,
|
rank=rank,
|
||||||
)
|
)
|
||||||
@@ -221,6 +283,7 @@ def run(rank, n_gpus, hps: utils.HParams, logger: logging.Logger):
|
|||||||
)
|
)
|
||||||
# net_g = DDP(net_g, device_ids=[rank], find_unused_parameters=True)
|
# net_g = DDP(net_g, device_ids=[rank], find_unused_parameters=True)
|
||||||
# net_d = DDP(net_d, device_ids=[rank], find_unused_parameters=True)
|
# net_d = DDP(net_d, device_ids=[rank], find_unused_parameters=True)
|
||||||
|
if use_distributed:
|
||||||
if hasattr(torch, "xpu") and torch.xpu.is_available():
|
if hasattr(torch, "xpu") and torch.xpu.is_available():
|
||||||
pass
|
pass
|
||||||
elif torch.cuda.is_available():
|
elif torch.cuda.is_available():
|
||||||
|
|||||||
Reference in New Issue
Block a user