Merge pull request #251 from romeokienzler/main
simplify cos connection into single variable, support target path rewrite
romeokienzler authored Feb 21, 2024
2 parents a6bf09e + 8d7bcd6 commit d4674a7
Showing 1 changed file with 43 additions and 56 deletions.
99 changes: 43 additions & 56 deletions component-library/output/upload-to-cos.ipynb
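At a glance, this change replaces the component's separate COS connection variables (access_key_id, secret_access_key, endpoint, bucket_name) and the single source_file/destination_file pair with one `target` URL, a glob-based `source_file_pattern`, and an optional `process_target_file_pattern` regex for rewriting target paths. A minimal sketch of how the new interface might be driven; all values below are placeholders, not taken from the diff:

```python
import os

# Single connection string in the documented format:
# cos://access_key_id:secret_access_key@endpoint/bucket/path
os.environ['target'] = 'cos://MYKEY:MYSECRET@s3.example.cloud/my-bucket/results/'  # placeholder

# Glob pattern selecting the local files to upload
os.environ['source_file_pattern'] = 'data/**/*.csv'  # placeholder

# Optional regex removed from each source path when building the target path
os.environ['process_target_file_pattern'] = '^data/'  # placeholder
```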
@@ -1,13 +1,5 @@
{
"cells": [
{
"cell_type": "markdown",
"id": "af0a5a86-e19a-44e4-963d-0f3c63ba92d6",
"metadata": {},
"source": [
"# output-upload-to-cos"
]
},
{
"cell_type": "markdown",
"id": "53af08cf-0149-4369-93d6-fba2203ec6cc",
@@ -21,6 +13,14 @@
},
"tags": []
},
"source": [
"# output-upload-to-cos"
]
},
{
"cell_type": "markdown",
"id": "360d1180",
"metadata": {},
"source": [
"Uploads a file to any S3 compliant Cloud Object Storage"
]
@@ -41,7 +41,7 @@
},
"outputs": [],
"source": [
"!pip install aiobotocore botocore s3fs"
"#!pip install aiobotocore botocore s3fs"
]
},
{
@@ -64,7 +64,8 @@
"import os\n",
"import sys\n",
"import re\n",
"import s3fs"
"import s3fs\n",
"import glob"
]
},
{
@@ -83,60 +84,39 @@
},
"outputs": [],
"source": [
"# access key id\n",
"access_key_id = os.environ.get('access_key_id')\n",
"# target in format: cos://access_key_id:secret_access_key@endpoint/bucket/path\n",
"target = os.environ.get('target')\n",
"\n",
"# secret access key\n",
"secret_access_key = os.environ.get('secret_access_key')\n",
"# source folder and file pattern (glob)\n",
"source_file_pattern = os.environ.get('source_file_pattern')\n",
"\n",
"# cos/s3 endpoint\n",
"endpoint = os.environ.get('endpoint')\n",
"# find_recursive, if True, will search for files in subfolders specified in source_file_pattern. Default is True\n",
"find_recursive = bool(os.environ.get('find_recursive','True'))\n",
"\n",
"# cos bucket name\n",
"bucket_name = os.environ.get('bucket_name')\n",
"\n",
"# source file to be uploaded\n",
"source_file = os.environ.get('source_file')\n",
"\n",
"# destination file name\n",
"destination_file = os.environ.get('destination_file')\n",
"\n",
"# temporary data folder\n",
"data_dir = os.environ.get('data_dir', '../../data/')\n",
"\n",
"# dummy_output (to be fixed once C3 supports < 1 outputs)\n",
"output_dummy = os.environ.get('output_dummy', 'output_dummy')"
"# process source file path on target using regex. Default is None\n",
"process_target_file_pattern = os.environ.get('process_target_file_pattern',None)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "ad8bf902-7582-445b-aea1-a9ce869532ee",
"metadata": {
"papermill": {
"duration": 0.016182,
"end_time": "2022-10-26T08:38:13.987113",
"exception": false,
"start_time": "2022-10-26T08:38:13.970931",
"status": "completed"
},
"tags": []
},
"id": "41952b78",
"metadata": {},
"outputs": [],
"source": [
"parameters = list(\n",
" map(lambda s: re.sub('$', '\"', s),\n",
" map(\n",
" lambda s: s.replace('=', '=\"'),\n",
" filter(\n",
" lambda s: s.find('=') > -1 and bool(re.match(r'[A-Za-z0-9_]*=[.\\/A-Za-z0-9]*', s)),\n",
" sys.argv\n",
" )\n",
" )))\n",
"# TODO externalie into library\n",
"if target.startswith('cos') or target.startswith('s3'):\n",
" buffer=target.split('://')[1]\n",
" access_key_id=buffer.split('@')[0].split(':')[0]\n",
" secret_access_key=buffer.split('@')[0].split(':')[1]\n",
" endpoint=buffer.split('@')[1].split('/')[0]\n",
" target='/'.join(buffer.split('@')[1].split('/')[1:])\n",
"\n",
"\n",
"for parameter in parameters:\n",
" logging.warning('Parameter: ' + parameter)\n",
" exec(parameter)"
" logging.debug(f'access_key_id={access_key_id}')\n",
" logging.debug(f'secret_access_key={secret_access_key}')\n",
" logging.debug(f'endpoint={endpoint}')\n",
" logging.debug(f'target={target}') "
]
},
{
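The parsing cell above decomposes the `cos://` (or `s3://`) URL into credentials, endpoint, and the remaining bucket/path. A standalone sketch of the same splitting logic, run against a made-up URL to show what each variable ends up holding (it assumes the secret contains no ':' or '@'):

```python
# Illustration only: mirror the decomposition done in the cell above on a sample URL.
sample = 'cos://MYKEY:MYSECRET@s3.example.cloud/my-bucket/results/'

buffer = sample.split('://')[1]                          # 'MYKEY:MYSECRET@s3.example.cloud/my-bucket/results/'
access_key_id = buffer.split('@')[0].split(':')[0]       # 'MYKEY'
secret_access_key = buffer.split('@')[0].split(':')[1]   # 'MYSECRET'
endpoint = buffer.split('@')[1].split('/')[0]            # 's3.example.cloud'
target = '/'.join(buffer.split('@')[1].split('/')[1:])   # 'my-bucket/results/'

print(access_key_id, endpoint, target)
```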
@@ -159,7 +139,7 @@
" anon=False,\n",
" key=access_key_id,\n",
" secret=secret_access_key,\n",
" client_kwargs={'endpoint_url': endpoint}\n",
" client_kwargs={'endpoint_url': f'https://{endpoint}'}\n",
")"
]
},
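With the parsed credentials, the cell above builds the s3fs handle, now prefixing the bare hostname with https://. A quick, optional sanity check one might run before the upload loop (the listing call is illustrative and not part of the component):

```python
# Assumes `s3` and `target` were created by the cells above; lists the
# bucket/path prefix to confirm credentials and endpoint are usable.
print(s3.ls(target))
```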
@@ -179,7 +159,14 @@
},
"outputs": [],
"source": [
"s3.put(data_dir + source_file, bucket_name + '/' + destination_file)"
"for source_file in glob.glob(source_file_pattern, recursive=find_recursive):\n",
" target_path = source_file \n",
" if process_target_file_pattern is not None:\n",
" target_path = re.sub(re.compile(process_target_file_pattern), \"\", source_file)\n",
" logging.debug(f'Adjusting target path {source_file} to: {target_path}')\n",
" final_destination = f'{target}{target_path}'.replace('//','/')\n",
" logging.debug(f'Uploading {source_file} to {final_destination}')\n",
" s3.put(source_file, final_destination)"
]
}
],
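The new upload loop walks every file matched by the glob pattern and derives each destination from the source path; when `process_target_file_pattern` is set, the matching portion is stripped first. A worked example under assumed values (nothing here is executed by the component itself):

```python
import re

# Assumed values, for illustration only
target = 'my-bucket/results/'             # bucket/path remainder parsed from the target URL
process_target_file_pattern = '^data/'    # regex stripped from the source path
source_file = 'data/2024/report.csv'      # one file matched by the glob pattern

target_path = re.sub(re.compile(process_target_file_pattern), '', source_file)
final_destination = f'{target}{target_path}'.replace('//', '/')

print(target_path)        # 2024/report.csv
print(final_destination)  # my-bucket/results/2024/report.csv
```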
@@ -199,7 +186,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.6"
"version": "3.11.7"
},
"papermill": {
"default_parameters": {},