Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Azure checkpointing support #2893

Merged
merged 39 commits into from
Jan 23, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
39 commits
Select commit Hold shift + click to select a range
5d2c3ab
v1
mvpatel2000 Jan 22, 2024
4fbbf92
fix
mvpatel2000 Jan 22, 2024
76bdeff
fix
mvpatel2000 Jan 22, 2024
426ca22
logs
mvpatel2000 Jan 22, 2024
aea68a3
dump env
mvpatel2000 Jan 22, 2024
420bc26
fix
mvpatel2000 Jan 22, 2024
4955ee8
logs
mvpatel2000 Jan 22, 2024
9c73807
force logs
mvpatel2000 Jan 22, 2024
46bd6b8
bucket support
mvpatel2000 Jan 22, 2024
8c4e8d2
typo
mvpatel2000 Jan 22, 2024
f71b076
more logs
mvpatel2000 Jan 22, 2024
3d918fd
logs
mvpatel2000 Jan 22, 2024
8031951
more logs
mvpatel2000 Jan 22, 2024
c031828
fix autoresume
mvpatel2000 Jan 22, 2024
abaf0c1
logs
mvpatel2000 Jan 22, 2024
24bde72
fix
mvpatel2000 Jan 22, 2024
d5517a6
fix
mvpatel2000 Jan 22, 2024
b597478
lint
mvpatel2000 Jan 22, 2024
31b8958
morelogs
mvpatel2000 Jan 22, 2024
aa63a3f
logs
mvpatel2000 Jan 22, 2024
6dd56d8
fix autoresume
mvpatel2000 Jan 22, 2024
49abf41
fix
mvpatel2000 Jan 22, 2024
6991777
lint
mvpatel2000 Jan 22, 2024
00cf484
fix
mvpatel2000 Jan 22, 2024
90f72ae
fix lstirp
mvpatel2000 Jan 23, 2024
e794996
strip prefix
mvpatel2000 Jan 23, 2024
87818a2
muck around
mvpatel2000 Jan 23, 2024
4a59a8a
logs
mvpatel2000 Jan 23, 2024
4c79e7e
azure
mvpatel2000 Jan 23, 2024
98bef9b
timestamp
mvpatel2000 Jan 23, 2024
d647da5
fix
mvpatel2000 Jan 23, 2024
583c4f6
state
mvpatel2000 Jan 23, 2024
57b9162
logs
mvpatel2000 Jan 23, 2024
e1721b6
Merge branch 'dev' into mvpatel2000/azure
mvpatel2000 Jan 23, 2024
22e5c4d
logs
mvpatel2000 Jan 23, 2024
80723f9
remove
mvpatel2000 Jan 23, 2024
6bea411
game
mvpatel2000 Jan 23, 2024
77b1760
fix
mvpatel2000 Jan 23, 2024
4cc7521
lint
mvpatel2000 Jan 23, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion composer/loggers/remote_uploader_downloader.py
Original file line number Diff line number Diff line change
Expand Up @@ -103,7 +103,7 @@ class RemoteUploaderDownloader(LoggerDestination):
backend_kwargs={
'provider': 's3',
'container': 'my-bucket',
'provider_kwargs=': {
'provider_kwargs': {
'key': 'AKIA...',
'secret': '*********',
'region': 'ap-northeast-1',
Expand Down
2 changes: 1 addition & 1 deletion composer/utils/checkpoint.py
Original file line number Diff line number Diff line change
Expand Up @@ -641,7 +641,7 @@ def download_checkpoint(path: str,
raise FileNotFoundError(
(f'Checkpoint {_format_path_with_current_rank(path)} does not exist, '
f'but is required for sharded checkpointing on rank {dist.get_global_rank()}. '
'Please ensure that the checkpoint exists and your load_path was specified as a format string'
'Please ensure that the checkpoint exists and your load_path was specified as a format string '
'with the {rank} argument.')) from e

if extracted_checkpoint_folder is not None:
Expand Down
25 changes: 20 additions & 5 deletions composer/utils/file_helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,8 +21,8 @@
from composer.utils import dist
from composer.utils.iter_helpers import iterate_with_callback
from composer.utils.misc import partial_format
from composer.utils.object_store import (GCSObjectStore, MLFlowObjectStore, ObjectStore, OCIObjectStore, S3ObjectStore,
UCObjectStore)
from composer.utils.object_store import (GCSObjectStore, LibcloudObjectStore, MLFlowObjectStore, ObjectStore,
OCIObjectStore, S3ObjectStore, UCObjectStore)
from composer.utils.object_store.mlflow_object_store import MLFLOW_DBFS_PATH_PREFIX

if TYPE_CHECKING:
Expand Down Expand Up @@ -319,6 +319,7 @@ def parse_uri(uri: str) -> Tuple[str, str, str]:
Tuple[str, str, str]: A tuple containing the backend (e.g. s3), bucket name, and path.
The backend name will be an empty string if the input is a local path.
"""
uri = uri.replace('AZURE_BLOBS', 'azure') # urlparse does not support _ in scheme
parse_result = urlparse(uri)
backend, net_loc, path = parse_result.scheme, parse_result.netloc, parse_result.path
bucket_name = net_loc if '@' not in net_loc else net_loc.split('@')[0]
Expand Down Expand Up @@ -354,6 +355,13 @@ def maybe_create_object_store_from_uri(uri: str) -> Optional[ObjectStore]:
return GCSObjectStore(bucket=bucket_name)
elif backend == 'oci':
return OCIObjectStore(bucket=bucket_name)
elif backend == 'azure':
return LibcloudObjectStore(
provider='AZURE_BLOBS',
container=bucket_name,
key_environ='AZURE_ACCOUNT_NAME',
secret_environ='AZURE_ACCOUNT_ACCESS_KEY',
)
elif backend == 'dbfs':
if path.startswith(MLFLOW_DBFS_PATH_PREFIX):
store = None
Expand Down Expand Up @@ -411,14 +419,21 @@ def maybe_create_remote_uploader_downloader_from_uri(
return None
if backend in ['s3', 'oci', 'gs']:
return RemoteUploaderDownloader(bucket_uri=f'{backend}://{bucket_name}')

elif backend == 'azure':
return RemoteUploaderDownloader(
bucket_uri=f'libcloud://{bucket_name}',
backend_kwargs={
'provider': 'AZURE_BLOBS',
'container': bucket_name,
'key_environ': 'AZURE_ACCOUNT_NAME',
'secret_environ': 'AZURE_ACCOUNT_ACCESS_KEY',
},
)
elif backend == 'dbfs':
return RemoteUploaderDownloader(bucket_uri=uri, backend_kwargs={'path': path})

elif backend == 'wandb':
raise NotImplementedError(f'There is no implementation for WandB via URI. Please use '
'WandBLogger with log_artifacts set to True')

else:
raise NotImplementedError(f'There is no implementation for the cloud backend {backend} via URI. Please use '
'one of the supported RemoteUploaderDownloader object stores')
Expand Down
Loading