-
Notifications
You must be signed in to change notification settings - Fork 28
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
added checkpointing to support LLMs #114
Changes from 15 commits
557afed
8dd835e
0905c36
c50f6fd
146ce85
381880c
db73681
5cb6d0e
0564dfe
1b0d42e
52011b4
6221b33
03796ad
900cb53
7ade86b
96eefa2
b73cea5
fc42c86
0c058ce
3f28662
7fdead2
189b2e4
8209980
baf023f
63ff8c4
a397d11
3727e5a
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,34 @@ | ||
model: dlrm | ||
|
||
framework: pytorch | ||
|
||
workflow: | ||
generate_data: False | ||
train: True | ||
do_eval: True | ||
|
||
dataset: | ||
data_folder: data/dlrm | ||
format: indexed_binary | ||
num_files_train: 1 | ||
num_files_eval: 1 | ||
num_samples_per_file: 4195198976 | ||
record_length: 327680 | ||
keep_files: True | ||
eval_num_samples_per_file: 91681240 | ||
|
||
reader: | ||
data_loader: pytorch | ||
batch_size: 2048 | ||
batch_size_eval: 16384 | ||
sample_shuffle: random | ||
|
||
train: | ||
epochs: 1 | ||
computation_time: 0.064296 | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Is the computation time from running the real workload? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This is based on the configuration used in the PR #88 There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. We have to validate this after merging the PR. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Agreed |
||
total_training_steps: 32768 | ||
total_eval_steps: 2048 | ||
|
||
evaluation: | ||
eval_time: 0.0843 | ||
steps_between_evals: 16384 |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,35 @@ | ||
model: megatron_deepspeed | ||
|
||
framework: pytorch | ||
|
||
workflow: | ||
generate_data: False | ||
train: True | ||
checkpoint: True | ||
|
||
dataset: | ||
data_folder: dataset/megatron-deepspeed/ | ||
format: mmap_indexed_binary | ||
num_files_train: 1 | ||
num_samples_per_file: 277203535 | ||
record_length: 2048 | ||
|
||
reader: | ||
data_loader: pytorch | ||
batch_size: 1024 | ||
read_threads: 1 | ||
file_shuffle: seed | ||
sample_shuffle: seed | ||
|
||
train: | ||
epochs: 311541 | ||
computation_time: 8.99 | ||
|
||
checkpoint: | ||
checkpoint_folder: checkpoints/megatron-deepspeed | ||
steps_between_checkpoints: 1000 | ||
model_size: 30102 | ||
type: independent | ||
optimization_groups: [1009254400, 865075200, 793600] | ||
num_layers: 44 | ||
layer_parameters: [129761280, 20971520] |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,97 @@ | ||
""" | ||
Copyright (c) 2022, UChicago Argonne, LLC | ||
All Rights Reserved | ||
|
||
Licensed under the Apache License, Version 2.0 (the "License"); | ||
you may not use this file except in compliance with the License. | ||
You may obtain a copy of the License at | ||
|
||
http://www.apache.org/licenses/LICENSE-2.0 | ||
|
||
Unless required by applicable law or agreed to in writing, software | ||
distributed under the License is distributed on an "AS IS" BASIS, | ||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
See the License for the specific language governing permissions and | ||
limitations under the License. | ||
""" | ||
|
||
from dlio_benchmark.common.enumerations import Compression | ||
from dlio_benchmark.data_generator.data_generator import DataGenerator | ||
|
||
import logging | ||
import numpy as np | ||
|
||
from dlio_benchmark.utils.utility import progress, utcnow | ||
from dlio_profiler.logger import fn_interceptor as Profile | ||
from shutil import copyfile | ||
from dlio_benchmark.common.constants import MODULE_DATA_GENERATOR | ||
import struct | ||
|
||
dlp = Profile(MODULE_DATA_GENERATOR) | ||
|
||
""" | ||
Generator for creating data in indexed binary format. | ||
""" | ||
class IndexedBinaryGenerator(DataGenerator):
    """
    Generator for creating data in indexed binary format.

    For each data file it also produces two index files:
      <file>.off.idx  -- absolute byte offset of each sample in the data file
      <file>.sz.idx   -- byte size of each sample
    """

    def __init__(self):
        super().__init__()

    def index_file_path_off(self, prefix_path):
        """Return the path of the offset-index file for *prefix_path*."""
        return prefix_path + '.off.idx'

    def index_file_path_size(self, prefix_path):
        """Return the path of the size-index file for *prefix_path*."""
        return prefix_path + '.sz.idx'

    @dlp.log
    def generate(self):
        """
        Generate the indexed binary dataset files assigned to this rank.

        Each file holds ``self.num_samples`` fixed-size samples of random
        uint8 data. Data is written in chunks of at most MEMORY_SIZE bytes so
        that very large files do not require the whole payload in memory.
        """
        super().generate()
        # Fixed seed so every rank generates reproducible content.
        np.random.seed(10)
        GB = 1024 ** 3
        # Files are distributed round-robin across ranks.
        for i in dlp.iter(range(self.my_rank, int(self.total_files_to_generate), self.comm_size)):
            dim1, dim2 = self.get_dimension()
            sample_size = dim1 * dim2
            total_size = sample_size * self.num_samples
            # NOTE(review): per PR discussion this buffer size should become a
            # dataset-level option (e.g. generation_buffer_size); 2 GB default.
            MEMORY_SIZE = 2 * GB
            write_size = total_size
            if total_size > MEMORY_SIZE:
                # Largest whole number of samples that fits in the buffer.
                write_size = MEMORY_SIZE - (MEMORY_SIZE % sample_size)
            out_path_spec = self.storage.get_uri(self._file_list[i])
            out_path_spec_off_idx = self.index_file_path_off(out_path_spec)
            out_path_spec_sz_idx = self.index_file_path_size(out_path_spec)
            progress(i + 1, self.total_files_to_generate, "Generating Indexed Binary Data")
            written_bytes = 0
            # One random buffer is reused for every chunk of this file.
            records = np.random.randint(255, size=write_size, dtype=np.uint8)
            # Context managers guarantee all three files are closed even if a
            # write fails part-way through (original leaked handles on error).
            with open(out_path_spec, "wb") as data_file, \
                 open(out_path_spec_off_idx, "wb") as off_file, \
                 open(out_path_spec_sz_idx, "wb") as sz_file:
                while written_bytes < total_size:
                    remaining = total_size - written_bytes
                    data_to_write = write_size if write_size <= remaining else remaining
                    samples_to_write = data_to_write // sample_size

                    # Write data. tobytes() emits the same byte stream as
                    # struct.pack('B'*n, *records[:n]) without materializing
                    # up to 2G Python ints.
                    data_file.write(records[:data_to_write].tobytes())

                    # Write offsets. BUG FIX: offsets are absolute positions
                    # in the data file; the original restarted at 0 for every
                    # chunk after the first, corrupting the index of any file
                    # larger than MEMORY_SIZE. Single-chunk files (the common
                    # case) are byte-identical to before.
                    myfmt = 'Q' * samples_to_write
                    offsets = range(written_bytes, written_bytes + data_to_write, sample_size)[:samples_to_write]
                    off_file.write(struct.pack(myfmt, *offsets))

                    # Write per-sample sizes (all samples are equal-sized).
                    sz_file.write(struct.pack(myfmt, *([sample_size] * samples_to_write)))

                    written_bytes += data_to_write
        # Re-seed from entropy so later consumers of np.random are not stuck
        # on the fixed generation seed.
        np.random.seed()
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Maybe rename this is IOType instead of CheckpointType?
Check point looks like more different kinds of checkpoint? We can use it as for example, only checkpoint model, optimization state, etc.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
How about CheckpointIOType Just IOType might confuse with Reading.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I named it CheckpointLocationType, with values RANK_ZERO or ALL_RANKS.