Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Implemented follow-symlinks feature for S3. #854

Merged
merged 3 commits on
Jul 31, 2014
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 12 additions & 0 deletions awscli/customizations/s3/description.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,18 @@ def add_param_descriptions(params_dict):
params_dict['delete']['documents'] = "Files that exist in the " \
"destination but not in the source are deleted during sync."

# Help text for the --follow-symlinks / --no-follow-symlinks pair.
# These strings are rendered in the generated CLI documentation, so
# wording and spacing here are user-facing.
params_dict['follow-symlinks']['documents'] = "Symbolic links are " \
    "followed only when uploading to S3 from the local filesystem. Note" \
    " that S3 does not support symbolic links, so the contents of the link" \
    " target are uploaded under the name of the link. When neither " \
    "``--follow-symlinks`` nor ``--no-follow-symlinks`` is specified, " \
    "the default is to follow symlinks."

params_dict['no-follow-symlinks']['documents'] = "Symbolic links are " \
    "ignored when uploading to S3 from the local filesystem. When neither " \
    "``--follow-symlinks`` nor ``--no-follow-symlinks`` is specified, " \
    "the default is to follow symlinks."

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Re-wording suggestion:

Symbolic links are followed only when uploading to S3 from the local filesystem. Note that S3 does not support symbolic links, so the contents of the link target are uploaded under the name of the link.

params_dict['exclude']['documents'] = "Exclude all files or objects" \
" from the command that matches the specified pattern."

Expand Down
88 changes: 55 additions & 33 deletions awscli/customizations/s3/filegenerator.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ class FileDecodingError(Exception):
ADVICE = (
"Please check your locale settings. The filename was decoded as: %s\n"
"On posix platforms, check the LC_CTYPE environment variable."
% (sys.getfilesystemencoding())
% (sys.getfilesystemencoding())
)

def __init__(self, directory, filename):
Expand All @@ -58,6 +58,7 @@ def __init__(self, service, endpoint, operation_name, parameters):
self._service = service
self._endpoint = endpoint
self.operation_name = operation_name
self.parameters = parameters

def call(self, files):
"""
Expand Down Expand Up @@ -103,38 +104,42 @@ def list_files(self, path, dir_op):
"""
join, isdir, isfile = os.path.join, os.path.isdir, os.path.isfile
error, listdir = os.error, os.listdir
if not dir_op:
size, last_update = get_file_stat(path)
yield path, size, last_update
else:
# We need to list files in byte order based on the full
# expanded path of the key: 'test/1/2/3.txt' However, listdir()
# will only give us contents a single directory at a time, so we'll
# get 'test'. At the same time we don't want to load the entire
# list of files into memory. This is handled by first going
# through the current directory contents and adding the directory
# separator to any directories. We can then sort the contents,
# and ensure byte order.
names = listdir(path)
self._check_paths_decoded(path, names)
for i, name in enumerate(names):
file_path = join(path, name)
if isdir(file_path):
names[i] = name + os.path.sep
names.sort()
for name in names:
file_path = join(path, name)
if isdir(file_path):
# Anything in a directory will have a prefix of this
# current directory and will come before the
# remaining contents in this directory. This means we need
# to recurse into this sub directory before yielding the
# rest of this directory's contents.
for x in self.list_files(file_path, dir_op):
yield x
else:
size, last_update = get_file_stat(file_path)
yield file_path, size, last_update
if not self.check_ignore_file(path):
if not dir_op:
size, last_update = get_file_stat(path)
yield path, size, last_update
else:
# We need to list files in byte order based on the full
# expanded path of the key: 'test/1/2/3.txt' However,
# listdir() will only give us contents a single directory
# at a time, so we'll get 'test'. At the same time we don't
# want to load the entire list of files into memory. This
# is handled by first going through the current directory
# contents and adding the directory separator to any
# directories. We can then sort the contents,
# and ensure byte order.
names = listdir(path)
self._check_paths_decoded(path, names)
for i, name in enumerate(names):
file_path = join(path, name)
if isdir(file_path):
names[i] = name + os.path.sep
names.sort()
for name in names:
file_path = join(path, name)
if not self.check_ignore_file(file_path):
if isdir(file_path):
# Anything in a directory will have a prefix of
# this current directory and will come before the
# remaining contents in this directory. This
# means we need to recurse into this sub directory
# before yielding the rest of this directory's
# contents.
for x in self.list_files(file_path, dir_op):
yield x
else:
size, last_update = get_file_stat(file_path)
yield file_path, size, last_update

def _check_paths_decoded(self, path, names):
# We can get a UnicodeDecodeError if we try to listdir(<unicode>) and
Expand All @@ -147,6 +152,23 @@ def _check_paths_decoded(self, path, names):
if not isinstance(name, six.text_type):
raise FileDecodingError(path, name)

def check_ignore_file(self, path):
    """
    Check whether a file should be ignored during file generation.

    Returns ``True`` for paths that should be skipped. This includes
    files that do not exist (i.e. broken symlinks) and, when
    ``follow_symlinks`` is disabled in ``self.parameters``, any path
    that is a symbolic link.

    :param path: Local filesystem path; directories may or may not
        carry a trailing ``os.sep``.
    :return: ``True`` if the path should be ignored, ``False`` otherwise.
    """
    if not os.path.exists(path):
        # Broken symlinks (and vanished files) cannot be uploaded;
        # skip them so the whole run does not fail.
        # NOTE(review): silently skipping hides the problem from the
        # user — consider warning and a non-zero RC for bad symlinks.
        return True
    # Following symlinks is the default when the flag is absent.
    follow_symlinks = self.parameters.get('follow_symlinks', True)
    if not follow_symlinks:
        if os.path.isdir(path) and path.endswith(os.sep):
            # os.path.islink() returns False for 'link/', so strip the
            # trailing separator — but only when one is actually there,
            # otherwise we would chop a character off the real name.
            path = path[:-1]
        if os.path.islink(path):
            return True
    return False

def list_objects(self, s3_path, dir_op):
"""
This function yields the appropriate object or objects under a
Expand Down
77 changes: 44 additions & 33 deletions awscli/customizations/s3/s3.py
Original file line number Diff line number Diff line change
Expand Up @@ -179,9 +179,9 @@ def _create_subcommand_table(self):
self._session.emit('building-operation-table.%s' % self._name,
operation_table=subcommand_table,
session=self._session)
subcommand_table['help'] = S3HelpCommand(self._session, self,
command_table=subcommand_table,
arg_table=None)
subcommand_table['help'] = \
S3HelpCommand(self._session, self,
command_table=subcommand_table, arg_table=None)
return subcommand_table

def _get_command_usage(self, cmd_class):
Expand Down Expand Up @@ -312,9 +312,9 @@ def _create_parameter_table(self):
def _populate_parameter_table(self):
parameter_table = {}
for param in CMD_DICT[self._name]['params']:
parameter_table[param] = S3Parameter(param,
PARAMS_DICT[param]['options'],
PARAMS_DICT[param]['documents'])
parameter_table[param] = \
S3Parameter(param, PARAMS_DICT[param]['options'],
PARAMS_DICT[param]['documents'])
return parameter_table

def _build_call_parameters(self, args, service_params):
Expand Down Expand Up @@ -414,7 +414,8 @@ def _make_last_mod_str(self, last_mod):
last_mod = parse(last_mod)
last_mod = last_mod.astimezone(tzlocal())
last_mod_tup = (str(last_mod.year), str(last_mod.month).zfill(2),
str(last_mod.day).zfill(2), str(last_mod.hour).zfill(2),
str(last_mod.day).zfill(2),
str(last_mod.hour).zfill(2),
str(last_mod.minute).zfill(2),
str(last_mod.second).zfill(2))
last_mod_str = "%s-%s-%s %s:%s:%s" % last_mod_tup
Expand Down Expand Up @@ -445,9 +446,11 @@ def _do_command(self, parsed_args, parsed_globals):
def _build_website_configuration(self, parsed_args):
website_config = {}
if parsed_args.index_document is not None:
website_config['IndexDocument'] = {'Suffix': parsed_args.index_document}
website_config['IndexDocument'] = \
{'Suffix': parsed_args.index_document}
if parsed_args.error_document is not None:
website_config['ErrorDocument'] = {'Key': parsed_args.error_document}
website_config['ErrorDocument'] = \
{'Key': parsed_args.error_document}
return website_config

def _get_bucket_name(self, path):
Expand Down Expand Up @@ -575,36 +578,35 @@ def run(self):
command_dict = {}
if self.cmd == 'sync':
command_dict = {'setup': [files, rev_files],
'file_generator': [file_generator,
rev_generator],
'filters': [create_filter(self.parameters),
create_filter(self.parameters)],
'comparator': [Comparator(self.parameters)],
's3_handler': [s3handler]}
'file_generator': [file_generator,
rev_generator],
'filters': [create_filter(self.parameters),
create_filter(self.parameters)],
'comparator': [Comparator(self.parameters)],
's3_handler': [s3handler]}
elif self.cmd == 'cp':
command_dict = {'setup': [files],
'file_generator': [file_generator],
'filters': [create_filter(self.parameters)],
's3_handler': [s3handler]}
'file_generator': [file_generator],
'filters': [create_filter(self.parameters)],
's3_handler': [s3handler]}
elif self.cmd == 'rm':
command_dict = {'setup': [files],
'file_generator': [file_generator],
'filters': [create_filter(self.parameters)],
's3_handler': [s3handler]}
'file_generator': [file_generator],
'filters': [create_filter(self.parameters)],
's3_handler': [s3handler]}
elif self.cmd == 'mv':
command_dict = {'setup': [files],
'file_generator': [file_generator],
'filters': [create_filter(self.parameters)],
's3_handler': [s3handler]}
'file_generator': [file_generator],
'filters': [create_filter(self.parameters)],
's3_handler': [s3handler]}
elif self.cmd == 'mb':
command_dict = {'setup': [taskinfo],
's3_handler': [s3handler]}
's3_handler': [s3handler]}
elif self.cmd == 'rb':
command_dict = {'setup': [taskinfo],
's3_handler': [s3handler]}
's3_handler': [s3handler]}

files = command_dict['setup']

while self.instructions:
instruction = self.instructions.pop(0)
file_list = []
Expand Down Expand Up @@ -761,7 +763,8 @@ def check_force(self, parsed_globals):
bucket = find_bucket_key(self.parameters['src'][5:])[0]
path = 's3://' + bucket
try:
del_objects = S3SubCommand('rm', self.session, {'nargs': 1})
del_objects = S3SubCommand('rm', self.session,
{'nargs': 1})
del_objects([path, '--recursive'], parsed_globals)
except:
pass
Expand All @@ -774,7 +777,8 @@ def add_endpoint_url(self, parsed_globals):
Adds endpoint_url to the parameters.
"""
if 'endpoint_url' in parsed_globals:
self.parameters['endpoint_url'] = getattr(parsed_globals, 'endpoint_url')
self.parameters['endpoint_url'] = getattr(parsed_globals,
'endpoint_url')
else:
self.parameters['endpoint_url'] = None

Expand All @@ -788,16 +792,17 @@ def add_verify_ssl(self, parsed_globals):
# keys for help command and doc generation.
CMD_DICT = {'cp': {'options': {'nargs': 2},
'params': ['dryrun', 'quiet', 'recursive',
'include', 'exclude', 'acl',
'no-guess-mime-type',
'include', 'exclude', 'acl', 'follow-symlinks',
'no-follow-symlinks', 'no-guess-mime-type',
'sse', 'storage-class', 'grants',
'website-redirect', 'content-type',
'cache-control', 'content-disposition',
'content-encoding', 'content-language',
'expires']},
'mv': {'options': {'nargs': 2},
'params': ['dryrun', 'quiet', 'recursive',
'include', 'exclude', 'acl',
'include', 'exclude', 'acl', 'follow-symlinks',
'no-follow-symlinks',
'sse', 'storage-class', 'grants',
'website-redirect', 'content-type',
'cache-control', 'content-disposition',
Expand All @@ -809,7 +814,8 @@ def add_verify_ssl(self, parsed_globals):
'sync': {'options': {'nargs': 2},
'params': ['dryrun', 'delete', 'exclude',
'include', 'quiet', 'acl', 'grants',
'no-guess-mime-type',
'no-guess-mime-type', 'follow-symlinks',
'no-follow-symlinks',
'sse', 'storage-class', 'content-type',
'cache-control', 'content-disposition',
'content-encoding', 'content-language',
Expand All @@ -836,6 +842,11 @@ def add_verify_ssl(self, parsed_globals):
'delete': {'options': {'action': 'store_true'}},
'quiet': {'options': {'action': 'store_true'}},
'force': {'options': {'action': 'store_true'}},
'follow-symlinks': {'options': {'action': 'store_true',
'default': True}},
'no-follow-symlinks': {'options': {'action': 'store_false',
'dest': 'follow_symlinks',
'default': True}},
'no-guess-mime-type': {'options': {'action': 'store_false',
'dest': 'guess_mime_type',
'default': True}},
Expand Down
Loading