Moved Training out of NeuralNetwork in config.
frobnitzem committed Sep 23, 2024
1 parent 0434886 commit 7c3ecf3
Showing 2 changed files with 37 additions and 28 deletions.
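
This commit promotes the "Training" block from config["NeuralNetwork"]["Training"] to a top-level config["Training"] section and updates the lookups in hydragnn/utils/config_utils.py to match. A minimal sketch of the relocation, written as Python dicts; the key names are taken from the diff below, while the values are purely illustrative:

# Old layout: Training nested under NeuralNetwork.
old_config = {
    "NeuralNetwork": {
        "Architecture": {"hidden_dim": 64},
        "Training": {
            "num_epoch": 10,
            "batch_size": 32,
            "Optimizer": {"type": "AdamW", "learning_rate": 1e-3},
        },
    },
    "Dataset": {"name": "example"},
}

# New layout after this commit: Training is its own top-level section.
new_config = {
    "NeuralNetwork": {"Architecture": {"hidden_dim": 64}},
    "Training": {
        "num_epoch": 10,
        "batch_size": 32,
        "Optimizer": {"type": "AdamW", "learning_rate": 1e-3},
    },
    "Dataset": {"name": "example"},
}
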
24 changes: 12 additions & 12 deletions hydragnn/utils/config_utils.py
@@ -92,23 +92,23 @@ def update_config(config, train_loader, val_loader, test_loader):
     if "initial_bias" not in config["NeuralNetwork"]["Architecture"]:
         config["NeuralNetwork"]["Architecture"]["initial_bias"] = None
 
-    if "Optimizer" not in config["NeuralNetwork"]["Training"]:
-        config["NeuralNetwork"]["Training"]["Optimizer"]["type"] = "AdamW"
+    if "Optimizer" not in config["Training"]:
+        config["Training"]["Optimizer"]["type"] = "AdamW"
 
-    if "loss_function_type" not in config["NeuralNetwork"]["Training"]:
-        config["NeuralNetwork"]["Training"]["loss_function_type"] = "mse"
+    if "loss_function_type" not in config["Training"]:
+        config["Training"]["loss_function_type"] = "mse"
 
     if "activation_function" not in config["NeuralNetwork"]["Architecture"]:
         config["NeuralNetwork"]["Architecture"]["activation_function"] = "relu"
 
     if "SyncBatchNorm" not in config["NeuralNetwork"]["Architecture"]:
         config["NeuralNetwork"]["Architecture"]["SyncBatchNorm"] = False
 
-    if "conv_checkpointing" not in config["NeuralNetwork"]["Training"]:
-        config["NeuralNetwork"]["Training"]["conv_checkpointing"] = False
+    if "conv_checkpointing" not in config["Training"]:
+        config["Training"]["conv_checkpointing"] = False
 
-    if "compute_grad_energy" not in config["NeuralNetwork"]["Training"]:
-        config["NeuralNetwork"]["Training"]["compute_grad_energy"] = False
+    if "compute_grad_energy" not in config["Training"]:
+        config["Training"]["compute_grad_energy"] = False
     return config


@@ -256,11 +256,11 @@ def get_log_name_config(config):
         + "-hd-"
         + str(config["NeuralNetwork"]["Architecture"]["hidden_dim"])
         + "-ne-"
-        + str(config["NeuralNetwork"]["Training"]["num_epoch"])
+        + str(config["Training"]["num_epoch"])
         + "-lr-"
-        + str(config["NeuralNetwork"]["Training"]["Optimizer"]["learning_rate"])
+        + str(config["Training"]["Optimizer"]["learning_rate"])
         + "-bs-"
-        + str(config["NeuralNetwork"]["Training"]["batch_size"])
+        + str(config["Training"]["batch_size"])
         + "-data-"
         + config["Dataset"]["name"][
             : (
@@ -301,7 +301,7 @@ def parse_deepspeed_config(config):
     ds_config = {}
 
     if "train_micro_batch_size_per_gpu" not in ds_config:
-        ds_config["train_micro_batch_size_per_gpu"] = config["NeuralNetwork"][
+        ds_config["train_micro_batch_size_per_gpu"] = config[
             "Training"
         ]["batch_size"]
         ds_config["gradient_accumulation_steps"] = 1
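
The update_config hunk above fills missing defaults with membership tests against the relocated config["Training"] section. The same intent can be written more compactly with setdefault; this is only a sketch, not the repository's code, and unlike the membership-test form it also creates the nested "Optimizer" dict before writing into it:

def fill_training_defaults(config):
    # Mirror update_config's defaults for the top-level "Training" section.
    training = config.setdefault("Training", {})
    training.setdefault("Optimizer", {}).setdefault("type", "AdamW")
    training.setdefault("loss_function_type", "mse")
    training.setdefault("conv_checkpointing", False)
    training.setdefault("compute_grad_energy", False)
    return config

parse_deepspeed_config then reads the DeepSpeed micro-batch size from the same relocated section, config["Training"]["batch_size"].
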
41 changes: 25 additions & 16 deletions utils/train.py
@@ -26,12 +26,13 @@
 import hydragnn
 from hydragnn.utils.print_utils import print_distributed, iterate_tqdm, log
 from hydragnn.utils.time_utils import Timer
-from hydragnn.utils.distdataset import DistDataset
-from hydragnn.utils.pickledataset import SimplePickleWriter, SimplePickleDataset
-from hydragnn.utils.smiles_utils import (
-    get_node_attribute_name,
-    generate_graphdata_from_smilestr,
+#from hydragnn.utils.distdataset import DistDataset
+from hydragnn.utils.distributed import (
+    setup_ddp,
+    get_distributed_model,
+    print_peak_memory,
 )
 
+from hydragnn.preprocess.utils import gather_deg
 from hydragnn.utils import nsplit
 import hydragnn.utils.tracer as tr
@@ -47,24 +48,29 @@
 import torch
 import torch.distributed as dist
 
+from debug_dict import DebugDict
+
 def run(argv):
     assert len(argv) == 3, f"Usage: {argv[0]} <config.json> <dataset.bp>"
 
     cfgfile = argv[1]
     dataset = argv[2]
+    log_name = 'experiment'
+    (Path('logs')/log_name).mkdir(exist_ok=True, parents=True)
     verbosity = 1
 
     tr.initialize()
     tr.disable()
     timer = Timer("load_data")
     timer.start()
 
     config = json.loads( Path(cfgfile).read_text() )
-    print(config)
-
-    comm = MPI.COMM_WORLD
-    rank = comm.Get_rank()
-    comm_size = comm.Get_size()
+    #print(config)
+    config = DebugDict(config)
+    #world_size, world_rank = setup_ddp()
+    comm_size, rank = setup_ddp()
+    #rank = comm.Get_rank()
+    #comm_size = comm.Get_size()
 
     use_torch_backend = False # Fix to MPI backend
     if True: # fix to adios format
@@ -79,10 +85,11 @@ def run(argv):
            os.environ["HYDRAGNN_USE_ddstore"] = "1"
 
        opt = {"preload": False, "shmem": shmem, "ddstore": ddstore}
+       comm = MPI.COMM_WORLD
        trainset = AdiosDataset(dataset, "trainset", comm, **opt)
        valset = AdiosDataset(dataset, "valset", comm)
        testset = AdiosDataset(dataset, "testset", comm)
-       comm.Barrier()
+       #comm.Barrier()
 
    print("Loaded dataset.")
    info(
@@ -98,13 +105,15 @@
     ) = hydragnn.preprocess.create_dataloaders(
         trainset, valset, testset, config["Training"]["Optimizer"]["batch_size"]
     )
-    return 0
-    comm.Barrier()
+    print("Created Dataloaders")
+    #comm.Barrier()
 
     config = hydragnn.utils.update_config(config, train_loader, val_loader, test_loader)
-    comm.Barrier()
+    #comm.Barrier()
+    print("Updated Config")
 
-    hydragnn.utils.save_config(config, log_name)
+    if rank == 0:
+        hydragnn.utils.save_config(config, log_name)
     comm.Barrier()
 
     timer.stop()
Expand All @@ -116,7 +125,7 @@ def run(argv):
verbosity=verbosity,
)
# tell pytorch to parallelize training over torch.distributed
model = hydragnn.utils.get_distributed_model(model, verbosity)
model = get_distributed_model(model, verbosity)

learning_rate = config["Training"]["Optimizer"]["learning_rate"]
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)
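
Taken together, the utils/train.py changes replace manual MPI rank bookkeeping with HydraGNN's distributed helpers: setup_ddp initializes torch.distributed and returns the process count and rank, get_distributed_model later wraps the model for distributed training, and only rank 0 writes the config copy before a barrier. A condensed sketch of the revised startup, using only calls that appear in the diff; the wrapper function and its signature are illustrative:

import json
from pathlib import Path

from mpi4py import MPI
import hydragnn
from hydragnn.utils.distributed import setup_ddp

def startup(cfgfile, log_name="experiment"):
    (Path("logs") / log_name).mkdir(exist_ok=True, parents=True)  # as in the diff
    config = json.loads(Path(cfgfile).read_text())
    comm_size, rank = setup_ddp()   # replaces comm.Get_size()/comm.Get_rank()
    comm = MPI.COMM_WORLD           # still passed to the AdiosDataset readers
    if rank == 0:                   # only one rank writes the config copy
        hydragnn.utils.save_config(config, log_name)
    comm.Barrier()
    return config, comm, comm_size, rank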
