Skip to content

Commit

Permalink
FindModuleCache: optionally leverage BuildSourceSet
Browse files Browse the repository at this point in the history
Gated behind a command line flag to assuage concerns about subtle
issues in module lookup being introduced by this fast path.
  • Loading branch information
hugues-aff committed Apr 18, 2022
1 parent f501cf6 commit 55eb815
Show file tree
Hide file tree
Showing 4 changed files with 119 additions and 32 deletions.
34 changes: 4 additions & 30 deletions mypy/build.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,8 +42,8 @@
from mypy.report import Reports # Avoid unconditional slow import
from mypy.fixup import fixup_module
from mypy.modulefinder import (
BuildSource, compute_search_paths, FindModuleCache, SearchPaths, ModuleSearchResult,
ModuleNotFoundReason
BuildSource, BuildSourceSet, compute_search_paths, FindModuleCache, SearchPaths,
ModuleSearchResult, ModuleNotFoundReason
)
from mypy.nodes import Expression
from mypy.options import Options
Expand Down Expand Up @@ -106,33 +106,6 @@ def __init__(self, manager: 'BuildManager', graph: Graph) -> None:
self.errors: List[str] = [] # Filled in by build if desired


class BuildSourceSet:
"""Efficiently test a file's membership in the set of build sources."""

def __init__(self, sources: List[BuildSource]) -> None:
self.source_text_present = False
self.source_modules: Set[str] = set()
self.source_paths: Set[str] = set()

for source in sources:
if source.text is not None:
self.source_text_present = True
elif source.path:
self.source_paths.add(source.path)
else:
self.source_modules.add(source.module)

def is_source(self, file: MypyFile) -> bool:
if file.path and file.path in self.source_paths:
return True
elif file._fullname in self.source_modules:
return True
elif self.source_text_present:
return True
else:
return False


def build(sources: List[BuildSource],
options: Options,
alt_lib_path: Optional[str] = None,
Expand Down Expand Up @@ -627,7 +600,8 @@ def __init__(self, data_dir: str,
or options.use_fine_grained_cache)
and not has_reporters)
self.fscache = fscache
self.find_module_cache = FindModuleCache(self.search_paths, self.fscache, self.options)
self.find_module_cache = FindModuleCache(self.search_paths, self.fscache, self.options,
source_set=self.source_set)
self.metastore = create_metastore(options)

# a mapping from source files to their corresponding shadow files
Expand Down
4 changes: 4 additions & 0 deletions mypy/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -878,6 +878,10 @@ def add_invertible_flag(flag: str,
'--explicit-package-bases', default=False,
help="Use current directory and MYPYPATH to determine module names of files passed",
group=code_group)
add_invertible_flag(
'--fast-module-lookup', default=False,
help="Enable fast path for finding modules within input sources",
group=code_group)
code_group.add_argument(
"--exclude",
action="append",
Expand Down
111 changes: 109 additions & 2 deletions mypy/modulefinder.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
from typing_extensions import Final, TypeAlias as _TypeAlias

from mypy.fscache import FileSystemCache
from mypy.nodes import MypyFile
from mypy.options import Options
from mypy.stubinfo import is_legacy_bundled_package
from mypy import pyinfo
Expand Down Expand Up @@ -120,6 +121,33 @@ def __repr__(self) -> str:
self.base_dir)


class BuildSourceSet:
"""Helper to efficiently test a file's membership in a set of build sources."""

def __init__(self, sources: List[BuildSource]) -> None:
self.source_text_present = False
self.source_modules = {} # type: Dict[str, str]
self.source_paths = set() # type: Set[str]

for source in sources:
if source.text is not None:
self.source_text_present = True
if source.path:
self.source_paths.add(source.path)
if source.module:
self.source_modules[source.module] = source.path or ''

def is_source(self, file: MypyFile) -> bool:
if file.path and file.path in self.source_paths:
return True
elif file._fullname in self.source_modules:
return True
elif self.source_text_present:
return True
else:
return False


class FindModuleCache:
"""Module finder with integrated cache.
Expand All @@ -135,8 +163,10 @@ def __init__(self,
search_paths: SearchPaths,
fscache: Optional[FileSystemCache],
options: Optional[Options],
stdlib_py_versions: Optional[StdlibVersions] = None) -> None:
stdlib_py_versions: Optional[StdlibVersions] = None,
source_set: Optional[BuildSourceSet] = None) -> None:
self.search_paths = search_paths
self.source_set = source_set
self.fscache = fscache or FileSystemCache()
# Cache for get_toplevel_possibilities:
# search_paths -> (toplevel_id -> list(package_dirs))
Expand All @@ -158,6 +188,50 @@ def clear(self) -> None:
self.initial_components.clear()
self.ns_ancestors.clear()

def find_module_via_source_set(self, id: str) -> Optional[ModuleSearchResult]:
if not self.source_set:
return None

p = self.source_set.source_modules.get(id, None)
if p and self.fscache.isfile(p):
# We need to make sure we still have __init__.py all the way up
# otherwise we might have false positives compared to slow path
# in case of deletion of init files, which is covered by some tests.
# TODO: are there some combination of flags in which this check should be skipped?
d = os.path.dirname(p)
for _ in range(id.count('.')):
if not any(self.fscache.isfile(os.path.join(d, '__init__' + x))
for x in PYTHON_EXTENSIONS):
return None
d = os.path.dirname(d)
return p

idx = id.rfind('.')
if idx != -1:
# When we're looking for foo.bar.baz and can't find a matching module
# in the source set, look up for a foo.bar module.
parent = self.find_module_via_source_set(id[:idx])
if parent is None or not isinstance(parent, str):
return None

basename, ext = os.path.splitext(parent)
if (not any(parent.endswith('__init__' + x) for x in PYTHON_EXTENSIONS)
and (ext in PYTHON_EXTENSIONS and not self.fscache.isdir(basename))):
# If we do find such a *module* (and crucially, we don't want a package,
# hence the filtering out of __init__ files, and checking for the presence
# of a folder with a matching name), then we can be pretty confident that
# 'baz' will either be a top-level variable in foo.bar, or will not exist.
#
# Either way, spelunking in other search paths for another 'foo.bar.baz'
# module should be avoided because:
# 1. in the unlikely event that one were found, it's highly likely that
# it would be unrelated to the source being typechecked and therefore
# more likely to lead to erroneous results
# 2. as described in _find_module, in some cases the search itself could
# potentially waste significant amounts of time
return ModuleNotFoundReason.NOT_FOUND
return None

def find_lib_path_dirs(self, id: str, lib_path: Tuple[str, ...]) -> PackageDirs:
"""Find which elements of a lib_path have the directory a module needs to exist.
Expand Down Expand Up @@ -223,7 +297,7 @@ def find_module(self, id: str, *, fast_path: bool = False) -> ModuleSearchResult
elif top_level in self.stdlib_py_versions:
use_typeshed = self._typeshed_has_version(top_level)
self.results[id] = self._find_module(id, use_typeshed)
if (not fast_path
if (not (fast_path or (self.options is not None and self.options.fast_module_lookup))
and self.results[id] is ModuleNotFoundReason.NOT_FOUND
and self._can_find_module_in_parent_dir(id)):
self.results[id] = ModuleNotFoundReason.WRONG_WORKING_DIRECTORY
Expand Down Expand Up @@ -289,6 +363,39 @@ def _can_find_module_in_parent_dir(self, id: str) -> bool:
def _find_module(self, id: str, use_typeshed: bool) -> ModuleSearchResult:
fscache = self.fscache

# Fast path for any modules in the current source set.
# This is particularly important when there are a large number of search
# paths which share the first (few) component(s) due to the use of namespace
# packages, for instance:
# foo/
# company/
# __init__.py
# foo/
# bar/
# company/
# __init__.py
# bar/
# baz/
# company/
# __init__.py
# baz/
#
# mypy gets [foo/company/foo, bar/company/bar, baz/company/baz, ...] as input
# and computes [foo, bar, baz, ...] as the module search path.
#
# This would result in O(n) search for every import of company.*, leading to
# O(n**2) behavior in load_graph as such imports are unsurprisingly present
# at least once, and usually many more times than that, in each and every file
# being parsed.
#
# Thankfully, such cases are efficiently handled by looking up the module path
# via BuildSourceSet.
p = (self.find_module_via_source_set(id)
if (self.options is not None and self.options.fast_module_lookup)
else None)
if p:
return p

# If we're looking for a module like 'foo.bar.baz', it's likely that most of the
# many elements of lib_path don't even have a subdirectory 'foo/bar'. Discover
# that only once and cache it for when we look for modules like 'foo.bar.blah'
Expand Down
2 changes: 2 additions & 0 deletions mypy/options.py
Original file line number Diff line number Diff line change
Expand Up @@ -292,6 +292,8 @@ def __init__(self) -> None:
self.cache_map: Dict[str, Tuple[str, str]] = {}
# Don't properly free objects on exit, just kill the current process.
self.fast_exit = True
# fast path for finding modules from source set
self.fast_module_lookup = False
# Used to transform source code before parsing if not None
# TODO: Make the type precise (AnyStr -> AnyStr)
self.transform_source: Optional[Callable[[Any], Any]] = None
Expand Down

0 comments on commit 55eb815

Please sign in to comment.