From 181c1c8eb970ae11a707b6b6c3d1e4d546504ccf Mon Sep 17 00:00:00 2001 From: mefyl Date: Fri, 16 Feb 2018 11:03:35 +0100 Subject: [PATCH 1/5] Revert "Correctly support absolute paths in .dockerignore" This reverts commit 34d50483e20e86cb7ab22700e036a5c4d319268a. Signed-off-by: mefyl --- docker/utils/build.py | 20 +++++++------------- tests/unit/utils_test.py | 27 ++++++++++----------------- 2 files changed, 17 insertions(+), 30 deletions(-) diff --git a/docker/utils/build.py b/docker/utils/build.py index a2188734..d4223e74 100644 --- a/docker/utils/build.py +++ b/docker/utils/build.py @@ -46,7 +46,7 @@ def exclude_paths(root, patterns, dockerfile=None): ) -def should_include(path, exclude_patterns, include_patterns, root): +def should_include(path, exclude_patterns, include_patterns): """ Given a path, a list of exclude patterns, and a list of inclusion patterns: @@ -61,15 +61,11 @@ def should_include(path, exclude_patterns, include_patterns, root): for pattern in include_patterns: if match_path(path, pattern): return True - if os.path.isabs(pattern) and match_path( - os.path.join(root, path), pattern): - return True return False return True -def should_check_directory(directory_path, exclude_patterns, include_patterns, - root): +def should_check_directory(directory_path, exclude_patterns, include_patterns): """ Given a directory path, a list of exclude patterns, and a list of inclusion patterns: @@ -95,7 +91,7 @@ def should_check_directory(directory_path, exclude_patterns, include_patterns, if (pattern + '/').startswith(path_with_slash) ] directory_included = should_include( - directory_path, exclude_patterns, include_patterns, root + directory_path, exclude_patterns, include_patterns ) return directory_included or len(possible_child_patterns) > 0 @@ -114,28 +110,26 @@ def get_paths(root, exclude_patterns, include_patterns, has_exceptions=False): # traversal. See https://docs.python.org/2/library/os.html#os.walk dirs[:] = [ d for d in dirs if should_check_directory( - os.path.join(parent, d), exclude_patterns, include_patterns, - root + os.path.join(parent, d), exclude_patterns, include_patterns ) ] for path in dirs: if should_include(os.path.join(parent, path), - exclude_patterns, include_patterns, root): + exclude_patterns, include_patterns): paths.append(os.path.join(parent, path)) for path in files: if should_include(os.path.join(parent, path), - exclude_patterns, include_patterns, root): + exclude_patterns, include_patterns): paths.append(os.path.join(parent, path)) return paths def match_path(path, pattern): - pattern = pattern.rstrip('/' + os.path.sep) - if pattern and not os.path.isabs(pattern): + if pattern: pattern = os.path.relpath(pattern) pattern_components = pattern.split(os.path.sep) diff --git a/tests/unit/utils_test.py b/tests/unit/utils_test.py index e144b7b1..eedcf71a 100644 --- a/tests/unit/utils_test.py +++ b/tests/unit/utils_test.py @@ -876,13 +876,6 @@ class ExcludePathsTest(unittest.TestCase): ) ) - def test_exclude_include_absolute_path(self): - base = make_tree([], ['a.py', 'b.py']) - assert exclude_paths( - base, - ['/*', '!' + os.path.join(base, '*.py')] - ) == set(['a.py', 'b.py']) - class TarTest(unittest.TestCase): def test_tar_with_excludes(self): @@ -1033,52 +1026,52 @@ class ShouldCheckDirectoryTest(unittest.TestCase): def test_should_check_directory_not_excluded(self): assert should_check_directory( - 'not_excluded', self.exclude_patterns, self.include_patterns, '.' + 'not_excluded', self.exclude_patterns, self.include_patterns ) assert should_check_directory( convert_path('dir/with'), self.exclude_patterns, - self.include_patterns, '.' + self.include_patterns ) def test_shoud_check_parent_directories_of_excluded(self): assert should_check_directory( - 'dir', self.exclude_patterns, self.include_patterns, '.' + 'dir', self.exclude_patterns, self.include_patterns ) assert should_check_directory( convert_path('dir/with'), self.exclude_patterns, - self.include_patterns, '.' + self.include_patterns ) def test_should_not_check_excluded_directories_with_no_exceptions(self): assert not should_check_directory( 'exclude_rather_large_directory', self.exclude_patterns, - self.include_patterns, '.' + self.include_patterns ) assert not should_check_directory( convert_path('dir/with/subdir_excluded'), self.exclude_patterns, - self.include_patterns, '.' + self.include_patterns ) def test_should_check_excluded_directory_with_exceptions(self): assert should_check_directory( convert_path('dir/with/exceptions'), self.exclude_patterns, - self.include_patterns, '.' + self.include_patterns ) assert should_check_directory( convert_path('dir/with/exceptions/in'), self.exclude_patterns, - self.include_patterns, '.' + self.include_patterns ) def test_should_not_check_siblings_of_exceptions(self): assert not should_check_directory( convert_path('dir/with/exceptions/but_not_here'), - self.exclude_patterns, self.include_patterns, '.' + self.exclude_patterns, self.include_patterns ) def test_should_check_subdirectories_of_exceptions(self): assert should_check_directory( convert_path('dir/with/exceptions/like_this_one/subdir'), - self.exclude_patterns, self.include_patterns, '.' + self.exclude_patterns, self.include_patterns ) From c8f5a5ad4040560ce62d53002ecec12485b531f7 Mon Sep 17 00:00:00 2001 From: mefyl Date: Fri, 16 Feb 2018 11:22:29 +0100 Subject: [PATCH 2/5] Fix dockerignore handling of absolute path exceptions. Signed-off-by: mefyl --- docker/utils/build.py | 6 +++--- tests/unit/utils_test.py | 7 +++++++ 2 files changed, 10 insertions(+), 3 deletions(-) diff --git a/docker/utils/build.py b/docker/utils/build.py index d4223e74..e86a04e6 100644 --- a/docker/utils/build.py +++ b/docker/utils/build.py @@ -26,13 +26,13 @@ def exclude_paths(root, patterns, dockerfile=None): if dockerfile is None: dockerfile = 'Dockerfile' - patterns = [p.lstrip('/') for p in patterns] exceptions = [p for p in patterns if p.startswith('!')] - include_patterns = [p[1:] for p in exceptions] + include_patterns = [p[1:].lstrip('/') for p in exceptions] include_patterns += [dockerfile, '.dockerignore'] - exclude_patterns = list(set(patterns) - set(exceptions)) + exclude_patterns = [ + p.lstrip('/') for p in list(set(patterns) - set(exceptions))] paths = get_paths(root, exclude_patterns, include_patterns, has_exceptions=len(exceptions) > 0) diff --git a/tests/unit/utils_test.py b/tests/unit/utils_test.py index eedcf71a..0ee041ae 100644 --- a/tests/unit/utils_test.py +++ b/tests/unit/utils_test.py @@ -758,6 +758,13 @@ class ExcludePathsTest(unittest.TestCase): self.all_paths - set(['foo/a.py']) ) + def test_exclude_include_absolute_path(self): + base = make_tree([], ['a.py', 'b.py']) + assert exclude_paths( + base, + ['/*', '!/*.py'] + ) == set(['a.py', 'b.py']) + def test_single_subdir_with_path_traversal(self): assert self.exclude(['foo/whoops/../a.py']) == convert_paths( self.all_paths - set(['foo/a.py']) From bb3ad64060b1f1136a6a42ed4d2018f71ebd371d Mon Sep 17 00:00:00 2001 From: mefyl Date: Fri, 16 Feb 2018 16:08:11 +0100 Subject: [PATCH 3/5] Fix .dockerignore: accept wildcard in inclusion pattern, honor last line precedence. Signed-off-by: mefyl --- docker/utils/build.py | 180 +++++++++++++++------------------------ tests/unit/utils_test.py | 84 +++++------------- 2 files changed, 90 insertions(+), 174 deletions(-) diff --git a/docker/utils/build.py b/docker/utils/build.py index e86a04e6..bfdb87b2 100644 --- a/docker/utils/build.py +++ b/docker/utils/build.py @@ -1,20 +1,24 @@ import os +import re from ..constants import IS_WINDOWS_PLATFORM -from .fnmatch import fnmatch +from fnmatch import fnmatch +from itertools import chain from .utils import create_archive def tar(path, exclude=None, dockerfile=None, fileobj=None, gzip=False): root = os.path.abspath(path) exclude = exclude or [] - return create_archive( files=sorted(exclude_paths(root, exclude, dockerfile=dockerfile)), root=root, fileobj=fileobj, gzip=gzip ) +_SEP = re.compile('/|\\\\') if IS_WINDOWS_PLATFORM else re.compile('/') + + def exclude_paths(root, patterns, dockerfile=None): """ Given a root directory path and a list of .dockerignore patterns, return @@ -23,121 +27,77 @@ def exclude_paths(root, patterns, dockerfile=None): All paths returned are relative to the root. """ + if dockerfile is None: dockerfile = 'Dockerfile' - exceptions = [p for p in patterns if p.startswith('!')] + def normalize(p): + # Leading and trailing slashes are not relevant. Yes, + # "foo.py/" must exclude the "foo.py" regular file. "." + # components are not relevant either, even if the whole + # pattern is only ".", as the Docker reference states: "For + # historical reasons, the pattern . is ignored." + split = [pt for pt in re.split(_SEP, p) if pt and pt != '.'] + # ".." component must be cleared with the potential previous + # component, regardless of whether it exists: "A preprocessing + # step [...] eliminates . and .. elements using Go's + # filepath.". + i = 0 + while i < len(split): + if split[i] == '..': + del split[i] + if i > 0: + del split[i - 1] + i -= 1 + else: + i += 1 + return split - include_patterns = [p[1:].lstrip('/') for p in exceptions] - include_patterns += [dockerfile, '.dockerignore'] - - exclude_patterns = [ - p.lstrip('/') for p in list(set(patterns) - set(exceptions))] - - paths = get_paths(root, exclude_patterns, include_patterns, - has_exceptions=len(exceptions) > 0) - - return set(paths).union( - # If the Dockerfile is in a subdirectory that is excluded, get_paths - # will not descend into it and the file will be skipped. This ensures - # it doesn't happen. - set([dockerfile.replace('/', os.path.sep)]) - if os.path.exists(os.path.join(root, dockerfile)) else set() - ) + patterns = ( + (True, normalize(p[1:])) + if p.startswith('!') else + (False, normalize(p)) + for p in patterns) + patterns = list(reversed(list(chain( + # Exclude empty patterns such as "." or the empty string. + filter(lambda p: p[1], patterns), + # Always include the Dockerfile and .dockerignore + [(True, dockerfile.split('/')), (True, ['.dockerignore'])])))) + return set(walk(root, patterns)) -def should_include(path, exclude_patterns, include_patterns): +def walk(root, patterns, default=True): """ - Given a path, a list of exclude patterns, and a list of inclusion patterns: - - 1. Returns True if the path doesn't match any exclusion pattern - 2. Returns False if the path matches an exclusion pattern and doesn't match - an inclusion pattern - 3. Returns true if the path matches an exclusion pattern and matches an - inclusion pattern - """ - for pattern in exclude_patterns: - if match_path(path, pattern): - for pattern in include_patterns: - if match_path(path, pattern): - return True - return False - return True - - -def should_check_directory(directory_path, exclude_patterns, include_patterns): - """ - Given a directory path, a list of exclude patterns, and a list of inclusion - patterns: - - 1. Returns True if the directory path should be included according to - should_include. - 2. Returns True if the directory path is the prefix for an inclusion - pattern - 3. Returns False otherwise + A collection of file lying below root that should be included according to + patterns. """ - # To account for exception rules, check directories if their path is a - # a prefix to an inclusion pattern. This logic conforms with the current - # docker logic (2016-10-27): - # https://github.com/docker/docker/blob/bc52939b0455116ab8e0da67869ec81c1a1c3e2c/pkg/archive/archive.go#L640-L671 + def match(p): + if p[1][0] == '**': + rec = (p[0], p[1][1:]) + return [p] + (match(rec) if rec[1] else [rec]) + elif fnmatch(f, p[1][0]): + return [(p[0], p[1][1:])] + else: + return [] - def normalize_path(path): - return path.replace(os.path.sep, '/') - - path_with_slash = normalize_path(directory_path) + '/' - possible_child_patterns = [ - pattern for pattern in map(normalize_path, include_patterns) - if (pattern + '/').startswith(path_with_slash) - ] - directory_included = should_include( - directory_path, exclude_patterns, include_patterns - ) - return directory_included or len(possible_child_patterns) > 0 - - -def get_paths(root, exclude_patterns, include_patterns, has_exceptions=False): - paths = [] - - for parent, dirs, files in os.walk(root, topdown=True, followlinks=False): - parent = os.path.relpath(parent, root) - if parent == '.': - parent = '' - - # Remove excluded patterns from the list of directories to traverse - # by mutating the dirs we're iterating over. - # This looks strange, but is considered the correct way to skip - # traversal. See https://docs.python.org/2/library/os.html#os.walk - dirs[:] = [ - d for d in dirs if should_check_directory( - os.path.join(parent, d), exclude_patterns, include_patterns - ) - ] - - for path in dirs: - if should_include(os.path.join(parent, path), - exclude_patterns, include_patterns): - paths.append(os.path.join(parent, path)) - - for path in files: - if should_include(os.path.join(parent, path), - exclude_patterns, include_patterns): - paths.append(os.path.join(parent, path)) - - return paths - - -def match_path(path, pattern): - pattern = pattern.rstrip('/' + os.path.sep) - if pattern: - pattern = os.path.relpath(pattern) - - pattern_components = pattern.split(os.path.sep) - if len(pattern_components) == 1 and IS_WINDOWS_PLATFORM: - pattern_components = pattern.split('/') - - if '**' not in pattern: - path_components = path.split(os.path.sep)[:len(pattern_components)] - else: - path_components = path.split(os.path.sep) - return fnmatch('/'.join(path_components), '/'.join(pattern_components)) + for f in os.listdir(root): + cur = os.path.join(root, f) + # The patterns if recursing in that directory. + sub = list(chain(*(match(p) for p in patterns))) + # Whether this file is explicitely included / excluded. + hit = next((p[0] for p in sub if not p[1]), None) + # Whether this file is implicitely included / excluded. + matched = default if hit is None else hit + sub = list(filter(lambda p: p[1], sub)) + if os.path.isdir(cur): + children = False + for r in (os.path.join(f, p) for p in walk(cur, sub, matched)): + yield r + children = True + # The current unit tests expect directories only under those + # conditions. It might be simplifiable though. + if (not sub or not children) and hit or hit is None and default: + yield f + elif matched: + yield f diff --git a/tests/unit/utils_test.py b/tests/unit/utils_test.py index 0ee041ae..8a4b1937 100644 --- a/tests/unit/utils_test.py +++ b/tests/unit/utils_test.py @@ -23,7 +23,6 @@ from docker.utils import ( decode_json_header, tar, split_command, parse_devices, update_headers, ) -from docker.utils.build import should_check_directory from docker.utils.ports import build_port_bindings, split_port from docker.utils.utils import format_environment @@ -883,6 +882,26 @@ class ExcludePathsTest(unittest.TestCase): ) ) + def test_include_wildcard(self): + base = make_tree(['a'], ['a/b.py']) + assert exclude_paths( + base, + ['*', '!*/b.py'] + ) == convert_paths(['a/b.py']) + + def test_last_line_precedence(self): + base = make_tree( + [], + ['garbage.md', + 'thrash.md', + 'README.md', + 'README-bis.md', + 'README-secret.md']) + assert exclude_paths( + base, + ['*.md', '!README*.md', 'README-secret.md'] + ) == set(['README.md', 'README-bis.md']) + class TarTest(unittest.TestCase): def test_tar_with_excludes(self): @@ -1019,69 +1038,6 @@ class TarTest(unittest.TestCase): assert tar_data.getmember('th.txt').mtime == -3600 -class ShouldCheckDirectoryTest(unittest.TestCase): - exclude_patterns = [ - 'exclude_rather_large_directory', - 'dir/with/subdir_excluded', - 'dir/with/exceptions' - ] - - include_patterns = [ - 'dir/with/exceptions/like_this_one', - 'dir/with/exceptions/in/descendents' - ] - - def test_should_check_directory_not_excluded(self): - assert should_check_directory( - 'not_excluded', self.exclude_patterns, self.include_patterns - ) - assert should_check_directory( - convert_path('dir/with'), self.exclude_patterns, - self.include_patterns - ) - - def test_shoud_check_parent_directories_of_excluded(self): - assert should_check_directory( - 'dir', self.exclude_patterns, self.include_patterns - ) - assert should_check_directory( - convert_path('dir/with'), self.exclude_patterns, - self.include_patterns - ) - - def test_should_not_check_excluded_directories_with_no_exceptions(self): - assert not should_check_directory( - 'exclude_rather_large_directory', self.exclude_patterns, - self.include_patterns - ) - assert not should_check_directory( - convert_path('dir/with/subdir_excluded'), self.exclude_patterns, - self.include_patterns - ) - - def test_should_check_excluded_directory_with_exceptions(self): - assert should_check_directory( - convert_path('dir/with/exceptions'), self.exclude_patterns, - self.include_patterns - ) - assert should_check_directory( - convert_path('dir/with/exceptions/in'), self.exclude_patterns, - self.include_patterns - ) - - def test_should_not_check_siblings_of_exceptions(self): - assert not should_check_directory( - convert_path('dir/with/exceptions/but_not_here'), - self.exclude_patterns, self.include_patterns - ) - - def test_should_check_subdirectories_of_exceptions(self): - assert should_check_directory( - convert_path('dir/with/exceptions/like_this_one/subdir'), - self.exclude_patterns, self.include_patterns - ) - - class FormatEnvironmentTest(unittest.TestCase): def test_format_env_binary_unicode_value(self): env_dict = { From 3b464f983e77cf85d9bc91e6f725cd7773bc0871 Mon Sep 17 00:00:00 2001 From: mefyl Date: Mon, 19 Feb 2018 12:48:17 +0100 Subject: [PATCH 4/5] Skip entirely excluded directories when handling dockerignore. This is pure optimization to not recurse into directories when there are no chances any file will be included. Signed-off-by: mefyl --- docker/utils/build.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/docker/utils/build.py b/docker/utils/build.py index bfdb87b2..09b20717 100644 --- a/docker/utils/build.py +++ b/docker/utils/build.py @@ -91,6 +91,10 @@ def walk(root, patterns, default=True): matched = default if hit is None else hit sub = list(filter(lambda p: p[1], sub)) if os.path.isdir(cur): + # Entirely skip directories if there are no chance any subfile will + # be included. + if all(not p[0] for p in sub) and not matched: + continue children = False for r in (os.path.join(f, p) for p in walk(cur, sub, matched)): yield r From 0c948c7df65b0d7378c3c0c8d966c38171f1ef21 Mon Sep 17 00:00:00 2001 From: mefyl Date: Mon, 19 Feb 2018 12:54:04 +0100 Subject: [PATCH 5/5] Add note about potential dockerignore optimization. Signed-off-by: mefyl --- docker/utils/build.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/docker/utils/build.py b/docker/utils/build.py index 09b20717..1da56fbc 100644 --- a/docker/utils/build.py +++ b/docker/utils/build.py @@ -95,6 +95,15 @@ def walk(root, patterns, default=True): # be included. if all(not p[0] for p in sub) and not matched: continue + # I think this would greatly speed up dockerignore handling by not + # recursing into directories we are sure would be entirely + # included, and only yielding the directory itself, which will be + # recursively archived anyway. However the current unit test expect + # the full list of subfiles and I'm not 100% sure it would make no + # difference yet. + # if all(p[0] for p in sub) and matched: + # yield f + # continue children = False for r in (os.path.join(f, p) for p in walk(cur, sub, matched)): yield r