From 7c3ecf3ba709c3264e2ffcea616710b22003b0d0 Mon Sep 17 00:00:00 2001
From: "David M. Rogers"
Date: Mon, 23 Sep 2024 12:57:19 -0400
Subject: [PATCH] Moved Training out of NeuralNetwork in config.

---
 hydragnn/utils/config_utils.py | 24 ++++++++++----------
 utils/train.py                 | 41 +++++++++++++++++++++-------------
 2 files changed, 37 insertions(+), 28 deletions(-)

diff --git a/hydragnn/utils/config_utils.py b/hydragnn/utils/config_utils.py
index 80762cc9..db9cf2d4 100644
--- a/hydragnn/utils/config_utils.py
+++ b/hydragnn/utils/config_utils.py
@@ -92,11 +92,11 @@ def update_config(config, train_loader, val_loader, test_loader):
     if "initial_bias" not in config["NeuralNetwork"]["Architecture"]:
         config["NeuralNetwork"]["Architecture"]["initial_bias"] = None
 
-    if "Optimizer" not in config["NeuralNetwork"]["Training"]:
-        config["NeuralNetwork"]["Training"]["Optimizer"]["type"] = "AdamW"
+    if "Optimizer" not in config["Training"]:
+        config["Training"]["Optimizer"]["type"] = "AdamW"
 
-    if "loss_function_type" not in config["NeuralNetwork"]["Training"]:
-        config["NeuralNetwork"]["Training"]["loss_function_type"] = "mse"
+    if "loss_function_type" not in config["Training"]:
+        config["Training"]["loss_function_type"] = "mse"
 
     if "activation_function" not in config["NeuralNetwork"]["Architecture"]:
         config["NeuralNetwork"]["Architecture"]["activation_function"] = "relu"
@@ -104,11 +104,11 @@
     if "SyncBatchNorm" not in config["NeuralNetwork"]["Architecture"]:
         config["NeuralNetwork"]["Architecture"]["SyncBatchNorm"] = False
 
-    if "conv_checkpointing" not in config["NeuralNetwork"]["Training"]:
-        config["NeuralNetwork"]["Training"]["conv_checkpointing"] = False
+    if "conv_checkpointing" not in config["Training"]:
+        config["Training"]["conv_checkpointing"] = False
 
-    if "compute_grad_energy" not in config["NeuralNetwork"]["Training"]:
-        config["NeuralNetwork"]["Training"]["compute_grad_energy"] = False
+    if "compute_grad_energy" not in config["Training"]:
+        config["Training"]["compute_grad_energy"] = False
 
     return config
 
@@ -256,11 +256,11 @@ def get_log_name_config(config):
         + "-hd-"
         + str(config["NeuralNetwork"]["Architecture"]["hidden_dim"])
         + "-ne-"
-        + str(config["NeuralNetwork"]["Training"]["num_epoch"])
+        + str(config["Training"]["num_epoch"])
         + "-lr-"
-        + str(config["NeuralNetwork"]["Training"]["Optimizer"]["learning_rate"])
+        + str(config["Training"]["Optimizer"]["learning_rate"])
         + "-bs-"
-        + str(config["NeuralNetwork"]["Training"]["batch_size"])
+        + str(config["Training"]["batch_size"])
         + "-data-"
         + config["Dataset"]["name"][
             : (
@@ -301,7 +301,7 @@ def parse_deepspeed_config(config):
         ds_config = {}
 
     if "train_micro_batch_size_per_gpu" not in ds_config:
-        ds_config["train_micro_batch_size_per_gpu"] = config["NeuralNetwork"][
+        ds_config["train_micro_batch_size_per_gpu"] = config[
             "Training"
         ]["batch_size"]
         ds_config["gradient_accumulation_steps"] = 1
diff --git a/utils/train.py b/utils/train.py
index 65b4a041..8dc674fb 100755
--- a/utils/train.py
+++ b/utils/train.py
@@ -26,12 +26,13 @@
 import hydragnn
 from hydragnn.utils.print_utils import print_distributed, iterate_tqdm, log
 from hydragnn.utils.time_utils import Timer
-from hydragnn.utils.distdataset import DistDataset
-from hydragnn.utils.pickledataset import SimplePickleWriter, SimplePickleDataset
-from hydragnn.utils.smiles_utils import (
-    get_node_attribute_name,
-    generate_graphdata_from_smilestr,
+#from hydragnn.utils.distdataset import DistDataset
+from hydragnn.utils.distributed import (
+    setup_ddp,
+    get_distributed_model,
+    print_peak_memory,
 )
+
 from hydragnn.preprocess.utils import gather_deg
 from hydragnn.utils import nsplit
 import hydragnn.utils.tracer as tr
@@ -47,12 +48,16 @@
 import torch
 import torch.distributed as dist
 
+from debug_dict import DebugDict
 
 
 def run(argv):
     assert len(argv) == 3, f"Usage: {argv[0]} "
     cfgfile = argv[1]
     dataset = argv[2]
+    log_name = 'experiment'
+    (Path('logs')/log_name).mkdir(exist_ok=True, parents=True)
+
     verbosity = 1
     tr.initialize()
     tr.disable()
@@ -60,11 +65,12 @@ def run(argv):
     timer.start()
 
     config = json.loads( Path(cfgfile).read_text() )
-    print(config)
-
-    comm = MPI.COMM_WORLD
-    rank = comm.Get_rank()
-    comm_size = comm.Get_size()
+    #print(config)
+    config = DebugDict(config)
+    #world_size, world_rank = setup_ddp()
+    comm_size, rank = setup_ddp()
+    #rank = comm.Get_rank()
+    #comm_size = comm.Get_size()
 
     use_torch_backend = False  # Fix to MPI backend
     if True:  # fix to adios format
@@ -79,10 +85,11 @@
             os.environ["HYDRAGNN_USE_ddstore"] = "1"
 
         opt = {"preload": False, "shmem": shmem, "ddstore": ddstore}
+        comm = MPI.COMM_WORLD
         trainset = AdiosDataset(dataset, "trainset", comm, **opt)
         valset = AdiosDataset(dataset, "valset", comm)
         testset = AdiosDataset(dataset, "testset", comm)
-    comm.Barrier()
+    #comm.Barrier()
     print("Loaded dataset.")
 
     info(
@@ -98,13 +105,15 @@
     ) = hydragnn.preprocess.create_dataloaders(
         trainset, valset, testset, config["Training"]["Optimizer"]["batch_size"]
     )
-    return 0
-    comm.Barrier()
+    print("Created Dataloaders")
+    #comm.Barrier()
 
     config = hydragnn.utils.update_config(config, train_loader, val_loader, test_loader)
-    comm.Barrier()
+    #comm.Barrier()
+    print("Updated Config")
 
-    hydragnn.utils.save_config(config, log_name)
+    if rank == 0:
+        hydragnn.utils.save_config(config, log_name)
     comm.Barrier()
 
     timer.stop()
@@ -116,7 +125,7 @@
         verbosity=verbosity,
     )
     # tell pytorch to parallelize training over torch.distributed
-    model = hydragnn.utils.get_distributed_model(model, verbosity)
+    model = get_distributed_model(model, verbosity)
 
     learning_rate = config["Training"]["Optimizer"]["learning_rate"]
     optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)
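
For reference, a minimal sketch of the config layout this change expects: "Training" becomes a top-level section beside "NeuralNetwork" instead of being nested under it. Only keys that the hunks above actually read are listed; the concrete values and the dataset name are illustrative assumptions, not taken from the repository.

# Hypothetical example of the new config shape -- values are placeholders.
example_config = {
    "Dataset": {"name": "mydataset"},              # assumed dataset name
    "NeuralNetwork": {
        "Architecture": {"hidden_dim": 64},        # Architecture stays under NeuralNetwork
    },
    "Training": {                                  # formerly config["NeuralNetwork"]["Training"]
        "num_epoch": 10,
        "batch_size": 32,                          # read by get_log_name_config()
        "loss_function_type": "mse",
        "conv_checkpointing": False,
        "compute_grad_energy": False,
        "Optimizer": {
            "type": "AdamW",
            "learning_rate": 1.0e-3,
            "batch_size": 32,                      # utils/train.py reads batch_size from Optimizer
        },
    },
}

With this layout, update_config() fills its defaults (Optimizer type "AdamW", loss_function_type "mse", conv_checkpointing and compute_grad_energy False) directly under config["Training"], and utils/train.py reads the batch size and learning rate from config["Training"]["Optimizer"].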