From 55eb815a6620d98c21ea0df089c7f0c3e72dc748 Mon Sep 17 00:00:00 2001 From: Hugues Bruant Date: Thu, 20 Jan 2022 10:27:33 -0800 Subject: [PATCH] FindModuleCache: optionally leverage BuildSourceSet Gated behind a command line flag to assuage concerns about subtle issues in module lookup being introduced by this fast path. --- mypy/build.py | 34 ++----------- mypy/main.py | 4 ++ mypy/modulefinder.py | 111 ++++++++++++++++++++++++++++++++++++++++++- mypy/options.py | 2 + 4 files changed, 119 insertions(+), 32 deletions(-) diff --git a/mypy/build.py b/mypy/build.py index f7a9e9e05e1d3..ba62932a0a8f9 100644 --- a/mypy/build.py +++ b/mypy/build.py @@ -42,8 +42,8 @@ from mypy.report import Reports # Avoid unconditional slow import from mypy.fixup import fixup_module from mypy.modulefinder import ( - BuildSource, compute_search_paths, FindModuleCache, SearchPaths, ModuleSearchResult, - ModuleNotFoundReason + BuildSource, BuildSourceSet, compute_search_paths, FindModuleCache, SearchPaths, + ModuleSearchResult, ModuleNotFoundReason ) from mypy.nodes import Expression from mypy.options import Options @@ -106,33 +106,6 @@ def __init__(self, manager: 'BuildManager', graph: Graph) -> None: self.errors: List[str] = [] # Filled in by build if desired -class BuildSourceSet: - """Efficiently test a file's membership in the set of build sources.""" - - def __init__(self, sources: List[BuildSource]) -> None: - self.source_text_present = False - self.source_modules: Set[str] = set() - self.source_paths: Set[str] = set() - - for source in sources: - if source.text is not None: - self.source_text_present = True - elif source.path: - self.source_paths.add(source.path) - else: - self.source_modules.add(source.module) - - def is_source(self, file: MypyFile) -> bool: - if file.path and file.path in self.source_paths: - return True - elif file._fullname in self.source_modules: - return True - elif self.source_text_present: - return True - else: - return False - - def build(sources: List[BuildSource], options: Options, alt_lib_path: Optional[str] = None, @@ -627,7 +600,8 @@ def __init__(self, data_dir: str, or options.use_fine_grained_cache) and not has_reporters) self.fscache = fscache - self.find_module_cache = FindModuleCache(self.search_paths, self.fscache, self.options) + self.find_module_cache = FindModuleCache(self.search_paths, self.fscache, self.options, + source_set=self.source_set) self.metastore = create_metastore(options) # a mapping from source files to their corresponding shadow files diff --git a/mypy/main.py b/mypy/main.py index c4548ea9b6250..bb1c331abe8ee 100644 --- a/mypy/main.py +++ b/mypy/main.py @@ -878,6 +878,10 @@ def add_invertible_flag(flag: str, '--explicit-package-bases', default=False, help="Use current directory and MYPYPATH to determine module names of files passed", group=code_group) + add_invertible_flag( + '--fast-module-lookup', default=False, + help="Enable fast path for finding modules within input sources", + group=code_group) code_group.add_argument( "--exclude", action="append", diff --git a/mypy/modulefinder.py b/mypy/modulefinder.py index 94d2dd34c16e7..52448dad76cf9 100644 --- a/mypy/modulefinder.py +++ b/mypy/modulefinder.py @@ -21,6 +21,7 @@ from typing_extensions import Final, TypeAlias as _TypeAlias from mypy.fscache import FileSystemCache +from mypy.nodes import MypyFile from mypy.options import Options from mypy.stubinfo import is_legacy_bundled_package from mypy import pyinfo @@ -120,6 +121,33 @@ def __repr__(self) -> str: self.base_dir) +class BuildSourceSet: + """Helper to efficiently test a file's membership in a set of build sources.""" + + def __init__(self, sources: List[BuildSource]) -> None: + self.source_text_present = False + self.source_modules = {} # type: Dict[str, str] + self.source_paths = set() # type: Set[str] + + for source in sources: + if source.text is not None: + self.source_text_present = True + if source.path: + self.source_paths.add(source.path) + if source.module: + self.source_modules[source.module] = source.path or '' + + def is_source(self, file: MypyFile) -> bool: + if file.path and file.path in self.source_paths: + return True + elif file._fullname in self.source_modules: + return True + elif self.source_text_present: + return True + else: + return False + + class FindModuleCache: """Module finder with integrated cache. @@ -135,8 +163,10 @@ def __init__(self, search_paths: SearchPaths, fscache: Optional[FileSystemCache], options: Optional[Options], - stdlib_py_versions: Optional[StdlibVersions] = None) -> None: + stdlib_py_versions: Optional[StdlibVersions] = None, + source_set: Optional[BuildSourceSet] = None) -> None: self.search_paths = search_paths + self.source_set = source_set self.fscache = fscache or FileSystemCache() # Cache for get_toplevel_possibilities: # search_paths -> (toplevel_id -> list(package_dirs)) @@ -158,6 +188,50 @@ def clear(self) -> None: self.initial_components.clear() self.ns_ancestors.clear() + def find_module_via_source_set(self, id: str) -> Optional[ModuleSearchResult]: + if not self.source_set: + return None + + p = self.source_set.source_modules.get(id, None) + if p and self.fscache.isfile(p): + # We need to make sure we still have __init__.py all the way up + # otherwise we might have false positives compared to slow path + # in case of deletion of init files, which is covered by some tests. + # TODO: are there some combination of flags in which this check should be skipped? + d = os.path.dirname(p) + for _ in range(id.count('.')): + if not any(self.fscache.isfile(os.path.join(d, '__init__' + x)) + for x in PYTHON_EXTENSIONS): + return None + d = os.path.dirname(d) + return p + + idx = id.rfind('.') + if idx != -1: + # When we're looking for foo.bar.baz and can't find a matching module + # in the source set, look up for a foo.bar module. + parent = self.find_module_via_source_set(id[:idx]) + if parent is None or not isinstance(parent, str): + return None + + basename, ext = os.path.splitext(parent) + if (not any(parent.endswith('__init__' + x) for x in PYTHON_EXTENSIONS) + and (ext in PYTHON_EXTENSIONS and not self.fscache.isdir(basename))): + # If we do find such a *module* (and crucially, we don't want a package, + # hence the filtering out of __init__ files, and checking for the presence + # of a folder with a matching name), then we can be pretty confident that + # 'baz' will either be a top-level variable in foo.bar, or will not exist. + # + # Either way, spelunking in other search paths for another 'foo.bar.baz' + # module should be avoided because: + # 1. in the unlikely event that one were found, it's highly likely that + # it would be unrelated to the source being typechecked and therefore + # more likely to lead to erroneous results + # 2. as described in _find_module, in some cases the search itself could + # potentially waste significant amounts of time + return ModuleNotFoundReason.NOT_FOUND + return None + def find_lib_path_dirs(self, id: str, lib_path: Tuple[str, ...]) -> PackageDirs: """Find which elements of a lib_path have the directory a module needs to exist. @@ -223,7 +297,7 @@ def find_module(self, id: str, *, fast_path: bool = False) -> ModuleSearchResult elif top_level in self.stdlib_py_versions: use_typeshed = self._typeshed_has_version(top_level) self.results[id] = self._find_module(id, use_typeshed) - if (not fast_path + if (not (fast_path or (self.options is not None and self.options.fast_module_lookup)) and self.results[id] is ModuleNotFoundReason.NOT_FOUND and self._can_find_module_in_parent_dir(id)): self.results[id] = ModuleNotFoundReason.WRONG_WORKING_DIRECTORY @@ -289,6 +363,39 @@ def _can_find_module_in_parent_dir(self, id: str) -> bool: def _find_module(self, id: str, use_typeshed: bool) -> ModuleSearchResult: fscache = self.fscache + # Fast path for any modules in the current source set. + # This is particularly important when there are a large number of search + # paths which share the first (few) component(s) due to the use of namespace + # packages, for instance: + # foo/ + # company/ + # __init__.py + # foo/ + # bar/ + # company/ + # __init__.py + # bar/ + # baz/ + # company/ + # __init__.py + # baz/ + # + # mypy gets [foo/company/foo, bar/company/bar, baz/company/baz, ...] as input + # and computes [foo, bar, baz, ...] as the module search path. + # + # This would result in O(n) search for every import of company.*, leading to + # O(n**2) behavior in load_graph as such imports are unsurprisingly present + # at least once, and usually many more times than that, in each and every file + # being parsed. + # + # Thankfully, such cases are efficiently handled by looking up the module path + # via BuildSourceSet. + p = (self.find_module_via_source_set(id) + if (self.options is not None and self.options.fast_module_lookup) + else None) + if p: + return p + # If we're looking for a module like 'foo.bar.baz', it's likely that most of the # many elements of lib_path don't even have a subdirectory 'foo/bar'. Discover # that only once and cache it for when we look for modules like 'foo.bar.blah' diff --git a/mypy/options.py b/mypy/options.py index 8e56d67bbeb8b..29a72bebf186c 100644 --- a/mypy/options.py +++ b/mypy/options.py @@ -292,6 +292,8 @@ def __init__(self) -> None: self.cache_map: Dict[str, Tuple[str, str]] = {} # Don't properly free objects on exit, just kill the current process. self.fast_exit = True + # fast path for finding modules from source set + self.fast_module_lookup = False # Used to transform source code before parsing if not None # TODO: Make the type precise (AnyStr -> AnyStr) self.transform_source: Optional[Callable[[Any], Any]] = None