-
Notifications
You must be signed in to change notification settings - Fork 4.2k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Implemented follow-symlinks feature for S3. #854
Changes from 2 commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -31,7 +31,7 @@ class FileDecodingError(Exception): | |
ADVICE = ( | ||
"Please check your locale settings. The filename was decoded as: %s\n" | ||
"On posix platforms, check the LC_CTYPE environment variable." | ||
% (sys.getfilesystemencoding()) | ||
% (sys.getfilesystemencoding()) | ||
) | ||
|
||
def __init__(self, directory, filename): | ||
|
@@ -58,6 +58,7 @@ def __init__(self, service, endpoint, operation_name, parameters): | |
self._service = service | ||
self._endpoint = endpoint | ||
self.operation_name = operation_name | ||
self.parameters = parameters | ||
|
||
def call(self, files): | ||
""" | ||
|
@@ -103,38 +104,42 @@ def list_files(self, path, dir_op): | |
""" | ||
join, isdir, isfile = os.path.join, os.path.isdir, os.path.isfile | ||
error, listdir = os.error, os.listdir | ||
if not dir_op: | ||
size, last_update = get_file_stat(path) | ||
yield path, size, last_update | ||
else: | ||
# We need to list files in byte order based on the full | ||
# expanded path of the key: 'test/1/2/3.txt' However, listdir() | ||
# will only give us contents a single directory at a time, so we'll | ||
# get 'test'. At the same time we don't want to load the entire | ||
# list of files into memory. This is handled by first going | ||
# through the current directory contents and adding the directory | ||
# separator to any directories. We can then sort the contents, | ||
# and ensure byte order. | ||
names = listdir(path) | ||
self._check_paths_decoded(path, names) | ||
for i, name in enumerate(names): | ||
file_path = join(path, name) | ||
if isdir(file_path): | ||
names[i] = name + os.path.sep | ||
names.sort() | ||
for name in names: | ||
file_path = join(path, name) | ||
if isdir(file_path): | ||
# Anything in a directory will have a prefix of this | ||
# current directory and will come before the | ||
# remaining contents in this directory. This means we need | ||
# to recurse into this sub directory before yielding the | ||
# rest of this directory's contents. | ||
for x in self.list_files(file_path, dir_op): | ||
yield x | ||
else: | ||
size, last_update = get_file_stat(file_path) | ||
yield file_path, size, last_update | ||
if not self.check_ignore_file(path): | ||
if not dir_op: | ||
size, last_update = get_file_stat(path) | ||
yield path, size, last_update | ||
else: | ||
# We need to list files in byte order based on the full | ||
# expanded path of the key: 'test/1/2/3.txt' However, | ||
# listdir() will only give us contents a single directory | ||
# at a time, so we'll get 'test'. At the same time we don't | ||
# want to load the entire list of files into memory. This | ||
# is handled by first going through the current directory | ||
# contents and adding the directory separator to any | ||
# directories. We can then sort the contents, | ||
# and ensure byte order. | ||
names = listdir(path) | ||
self._check_paths_decoded(path, names) | ||
for i, name in enumerate(names): | ||
file_path = join(path, name) | ||
if isdir(file_path): | ||
names[i] = name + os.path.sep | ||
names.sort() | ||
for name in names: | ||
file_path = join(path, name) | ||
if not self.check_ignore_file(file_path): | ||
if isdir(file_path): | ||
# Anything in a directory will have a prefix of | ||
# this current directory and will come before the | ||
# remaining contents in this directory. This | ||
# means we need to recurse into this sub directory | ||
# before yielding the rest of this directory's | ||
# contents. | ||
for x in self.list_files(file_path, dir_op): | ||
yield x | ||
else: | ||
size, last_update = get_file_stat(file_path) | ||
yield file_path, size, last_update | ||
|
||
def _check_paths_decoded(self, path, names): | ||
# We can get a UnicodeDecodeError if we try to listdir(<unicode>) and | ||
|
@@ -147,6 +152,23 @@ def _check_paths_decoded(self, path, names): | |
if not isinstance(name, six.text_type): | ||
raise FileDecodingError(path, name) | ||
|
||
def check_ignore_file(self, path): | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Given this is a function that returns true/false, a name like |
||
""" | ||
This function checks whether a file should be ignored in the | ||
file generation process. This includes files that do not exist | ||
(i.e. broken symlinks) and symlinks that are not to be followed. | ||
""" | ||
if not os.path.exists(path): | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. This is an interesting case. I would expect that if a user asks us to follow symlinks and we encounter a symlink that's bad, we should be warning the user about this and return with a non-zero RC because not everything the customer asked for was synced. In the simplest scenario, trying to copy a single file that is a bad symlink seems counterintuitive:
What do you think? There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Yeah, I have been thinking about this as well. The current implementation skips broken symbolic links via the |
||
return True | ||
follow_symlinks = self.parameters.get('follow_symlinks', True) | ||
if not follow_symlinks: | ||
if os.path.isdir(path): | ||
# Trailing slash must be removed to check if it is a symlink. | ||
path = path[:-1] | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Not sure if we can always count on the invariant that a path for which os.path.isdir(path) is true will always end with a trailing slash. I think it would be safer to also check this condition: |
||
if os.path.islink(path): | ||
return True | ||
return False | ||
|
||
def list_objects(self, s3_path, dir_op): | ||
""" | ||
This function yields the appropriate object or objects under a | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Re-wording suggestion: