Speed up processing of new files in daemon by caching ASTs (#10128)
Processing newly installed stub files, in particular, could be quite slow
in the mypy daemon. This is because adding N files results in N steps
internally, each of which adds one file. However, each step parses all
remaining files, so the overall algorithm is O(n**2).

For example, processing `six` stubs could take about 40s (when not using a 
compiled mypy).

Partially address the issue by caching parsed ASTs during a single increment.
This speeds up the `import six` use case by about 3x when not using a compiled
mypy. It's still about 3x slower when using the daemon, however.
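
For intuition, here is a toy model of the quadratic behavior (illustrative
Python only, not mypy's actual code; the function names are made up):

def parses_without_cache(n: int) -> int:
    # Step k adds one file to the graph but re-parses all n - k files
    # that are still waiting, so the total is n + (n - 1) + ... + 1.
    return sum(n - k for k in range(n))

def parses_with_cache(n: int) -> int:
    # With the AST cache, each file is parsed once and reused afterwards.
    return n

print(parses_without_cache(30))  # 465 parses for 30 new files
print(parses_with_cache(30))     # 30 parses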
JukkaL authored Feb 22, 2021
1 parent 9cbf4c0 commit 4827f3a
Showing 5 changed files with 58 additions and 13 deletions.
50 changes: 40 additions & 10 deletions mypy/build.py
@@ -562,6 +562,7 @@ class BuildManager:
       not only for debugging, but also required for correctness,
       in particular to check consistency of the fine-grained dependency cache.
       fscache: A file system cacher
+      ast_cache: AST cache to speed up mypy daemon
     """

     def __init__(self, data_dir: str,
@@ -645,6 +646,14 @@ def __init__(self, data_dir: str,
         self.processed_targets = []  # type: List[str]
         # Missing stub packages encountered.
         self.missing_stub_packages = set()  # type: Set[str]
+        # Cache for mypy ASTs that have completed semantic analysis
+        # pass 1. When multiple files are added to the build in a
+        # single daemon increment, only one of the files gets added
+        # per step and the others are discarded. This gets repeated
+        # until all the files have been added. This means that a
+        # new file can be processed O(n**2) times. This cache
+        # avoids most of this redundant work.
+        self.ast_cache = {}  # type: Dict[str, Tuple[MypyFile, List[ErrorInfo]]]

     def dump_stats(self) -> None:
         if self.options.dump_build_stats:
@@ -1994,8 +2003,14 @@ def parse_file(self) -> None:
             return

         manager = self.manager
+
+        # Can we reuse a previously parsed AST? This avoids redundant work in daemon.
+        cached = self.id in manager.ast_cache
         modules = manager.modules
-        manager.log("Parsing %s (%s)" % (self.xpath, self.id))
+        if not cached:
+            manager.log("Parsing %s (%s)" % (self.xpath, self.id))
+        else:
+            manager.log("Using cached AST for %s (%s)" % (self.xpath, self.id))

         with self.wrap_context():
             source = self.source
@@ -2026,21 +2041,36 @@ def parse_file(self) -> None:
             self.source_hash = compute_hash(source)

             self.parse_inline_configuration(source)
-            self.tree = manager.parse_file(self.id, self.xpath, source,
-                                           self.ignore_all or self.options.ignore_errors,
-                                           self.options)
+            if not cached:
+                self.tree = manager.parse_file(self.id, self.xpath, source,
+                                               self.ignore_all or self.options.ignore_errors,
+                                               self.options)

-        modules[self.id] = self.tree
+            else:
+                # Reuse a cached AST
+                self.tree = manager.ast_cache[self.id][0]
+                manager.errors.set_file_ignored_lines(
+                    self.xpath,
+                    self.tree.ignored_lines,
+                    self.ignore_all or self.options.ignore_errors)

+        if not cached:
+            # Make a copy of any errors produced during parse time so that
+            # fine-grained mode can repeat them when the module is
+            # reprocessed.
+            self.early_errors = list(manager.errors.error_info_map.get(self.xpath, []))
+        else:
+            self.early_errors = manager.ast_cache[self.id][1]

-        # Make a copy of any errors produced during parse time so that
-        # fine-grained mode can repeat them when the module is
-        # reprocessed.
-        self.early_errors = list(manager.errors.error_info_map.get(self.xpath, []))
+        modules[self.id] = self.tree

-        self.semantic_analysis_pass1()
+        if not cached:
+            self.semantic_analysis_pass1()

         self.check_blockers()

+        manager.ast_cache[self.id] = (self.tree, self.early_errors)
+
     def parse_inline_configuration(self, source: str) -> None:
         """Check for inline mypy: options directive and parse them."""
         flags = get_mypy_comments(source)
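Condensed from the diff above, the cache logic in State.parse_file boils down
to the following (a simplified sketch, not a drop-in excerpt; source decoding,
inline configuration, and error contexts are omitted):

def parse_file(self) -> None:
    manager = self.manager
    cached = self.id in manager.ast_cache  # keyed by module id

    if not cached:
        # Cache miss: parse the source and snapshot parse-time errors so
        # that fine-grained mode can replay them on reprocessing.
        self.tree = manager.parse_file(self.id, self.xpath, self.source,
                                       self.ignore_all, self.options)
        self.early_errors = list(
            manager.errors.error_info_map.get(self.xpath, []))
    else:
        # Cache hit: reuse the AST and recorded errors, but re-register
        # the tree's ignored lines with the error reporter.
        self.tree, self.early_errors = manager.ast_cache[self.id]
        manager.errors.set_file_ignored_lines(
            self.xpath, self.tree.ignored_lines, self.ignore_all)

    manager.modules[self.id] = self.tree
    if not cached:
        # Cached trees have already been through semantic analysis pass 1.
        self.semantic_analysis_pass1()
    manager.ast_cache[self.id] = (self.tree, self.early_errors)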
11 changes: 8 additions & 3 deletions mypy/dmypy_server.py
@@ -373,7 +373,7 @@ def cmd_recheck(self,
             assert remove is None and update is None
             messages = self.fine_grained_increment_follow_imports(sources)
         res = self.increment_output(messages, sources, is_tty, terminal_width)
-        self.fscache.flush()
+        self.flush_caches()
         self.update_stats(res)
         return res

@@ -392,10 +392,15 @@ def check(self, sources: List[BuildSource],
         else:
             messages = self.fine_grained_increment_follow_imports(sources)
         res = self.increment_output(messages, sources, is_tty, terminal_width)
-        self.fscache.flush()
+        self.flush_caches()
         self.update_stats(res)
         return res

+    def flush_caches(self) -> None:
+        self.fscache.flush()
+        if self.fine_grained_manager:
+            self.fine_grained_manager.flush_cache()
+
     def update_stats(self, res: Dict[str, Any]) -> None:
         if self.fine_grained_manager:
             manager = self.fine_grained_manager.manager
@@ -852,7 +857,7 @@ def cmd_suggest(self,
                 out += "\n"
             return {'out': out, 'err': "", 'status': 0}
         finally:
-            self.fscache.flush()
+            self.flush_caches()

     def cmd_hang(self) -> Dict[str, object]:
         """Hang for 100 seconds, as a debug hack."""
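The server-side pattern is consistent: every request that runs an increment
ends by calling flush_caches(), even on error paths (note the finally in
cmd_suggest). A hypothetical sketch of that lifecycle (run_increment is made
up; flush_caches mirrors the method added above):

def run_increment(server, sources):
    try:
        messages = server.fine_grained_increment_follow_imports(sources)
        return {'out': '\n'.join(messages), 'err': '', 'status': 0}
    finally:
        # Flush both the file-system metadata cache and, when the
        # fine-grained manager exists, the per-increment AST cache, so
        # the next request sees file changes and re-parses edited files.
        server.flush_caches()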
8 changes: 8 additions & 0 deletions mypy/server/update.py
@@ -288,6 +288,14 @@ def trigger(self, target: str) -> List[str]:
         self.previous_messages = self.manager.errors.new_messages()[:]
         return self.update(changed_modules, [])

+    def flush_cache(self) -> None:
+        """Flush AST cache.
+
+        This needs to be called after each increment, or file changes won't
+        be detected reliably.
+        """
+        self.manager.ast_cache.clear()
+
     def update_one(self,
                    changed_modules: List[Tuple[str, str]],
                    initial_set: Set[str],
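To see why the docstring insists on flushing after each increment, consider
this contrived illustration (plain Python, not the mypy API): a stale entry
surviving into the next increment would make the daemon reuse an outdated AST
for an edited file.

ast_cache = {'m': ('<AST parsed from the old m.py>', [])}  # from increment 1

# ... the user edits m.py here ...

def flush_cache() -> None:
    # Clearing between increments forces a fresh parse of edited files.
    ast_cache.clear()

flush_cache()
assert 'm' not in ast_cache  # increment 2 re-parses m.py from source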
1 change: 1 addition & 0 deletions mypy/suggestions.py
@@ -640,6 +640,7 @@ def reload(self, state: State, check_errors: bool = False) -> List[str]:
         If check_errors is true, raise an exception if there are errors.
         """
         assert state.path is not None
+        self.fgmanager.flush_cache()
         return self.fgmanager.update([(state.id, state.path)], [])

     def ensure_loaded(self, state: State, force: bool = False) -> MypyFile:
1 change: 1 addition & 0 deletions mypy/test/testmerge.py
@@ -124,6 +124,7 @@ def build(self, source: str, testcase: DataDrivenTestCase) -> Optional[BuildResu
     def build_increment(self, manager: FineGrainedBuildManager,
                         module_id: str, path: str) -> Tuple[MypyFile,
                                                             Dict[Expression, Type]]:
+        manager.flush_cache()
         manager.update([(module_id, path)], [])
         module = manager.manager.modules[module_id]
         type_map = manager.graph[module_id].type_map()
