Skip to content

Commit

Permalink
Add pip resolver (#1697)
Browse files Browse the repository at this point in the history
## Changes
Add a pip resolver for later use by the `ImportResolver`. The resolver
makes a library available to the path look up by i) installing the
library and ii) augmenting the path lookup.

### Linked issues
Part of #1642
Follow up on #1694 

### Functionality 

- [ ] added relevant user documentation
- [ ] added new CLI command
- [ ] modified existing command: `databricks labs ucx ...`
- [ ] added a new workflow
- [x] modified existing workflow: `...`
- [ ] added a new table
- [ ] modified existing table: `...`

### Tests
<!-- How is this tested? Please see the checklist below and also
describe any other relevant tests -->

- [ ] manually tested
- [x] added unit tests
- [ ] added integration tests
- [ ] verified on staging environment (screenshot attached)
  • Loading branch information
JCZuurmond authored May 16, 2024
1 parent fa722f2 commit 08aed28
Show file tree
Hide file tree
Showing 17 changed files with 377 additions and 64 deletions.
9 changes: 7 additions & 2 deletions src/databricks/labs/ucx/contexts/application.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@
from databricks.labs.ucx.source_code.path_lookup import PathLookup
from databricks.labs.ucx.source_code.graph import DependencyResolver
from databricks.labs.ucx.source_code.whitelist import WhitelistResolver, Whitelist
from databricks.labs.ucx.source_code.site_packages import SitePackages
from databricks.labs.ucx.source_code.site_packages import PipResolver, SitePackages
from databricks.labs.ucx.source_code.languages import Languages
from databricks.labs.ucx.source_code.redash import Redash
from databricks.labs.ucx.workspace_access import generic, redash
Expand Down Expand Up @@ -350,6 +350,10 @@ def workspace_info(self):
def verify_has_metastore(self):
return VerifyHasMetastore(self.workspace_client)

@cached_property
def pip_resolver(self):
return PipResolver()

@cached_property
def notebook_loader(self) -> NotebookLoader:
return NotebookLoader()
Expand Down Expand Up @@ -392,7 +396,8 @@ def file_resolver(self):
@cached_property
def dependency_resolver(self):
import_resolvers = [self.file_resolver, self.whitelist_resolver]
return DependencyResolver(self.notebook_resolver, import_resolvers, self.path_lookup)
library_resolvers = [self.pip_resolver]
return DependencyResolver(library_resolvers, self.notebook_resolver, import_resolvers, self.path_lookup)

@cached_property
def workflow_linter(self):
Expand Down
52 changes: 50 additions & 2 deletions src/databricks/labs/ucx/source_code/graph.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,12 +45,16 @@ def dependency(self):
def path(self):
return self._dependency.path

def register_library(self, name: str) -> MaybeGraph:
def register_library(self, library: str) -> list[DependencyProblem]:
# TODO: use DistInfoResolver to load wheel/egg/pypi dependencies
# TODO: https://github.com/databrickslabs/ucx/issues/1642
# TODO: https://github.com/databrickslabs/ucx/issues/1643
# TODO: https://github.com/databrickslabs/ucx/issues/1640
return MaybeGraph(None, [DependencyProblem('not-yet-implemented', f'Library dependency: {name}')])
maybe = self._resolver.resolve_library(self.path_lookup, library)
if not maybe.dependency:
return maybe.problems
maybe_graph = self.register_dependency(maybe.dependency)
return maybe_graph.problems

def register_notebook(self, path: Path) -> list[DependencyProblem]:
maybe = self._resolver.resolve_notebook(self.path_lookup, path)
Expand Down Expand Up @@ -80,6 +84,7 @@ def register_dependency(self, dependency: Dependency) -> MaybeGraph:
child_graph = DependencyGraph(dependency, self, self._resolver, self._path_lookup)
self._dependencies[dependency] = child_graph
container = dependency.load(self.path_lookup)
# TODO: Return either (child) graph OR problems
if not container:
problem = DependencyProblem('dependency-register-failed', 'Failed to register dependency', dependency.path)
return MaybeGraph(child_graph, [problem])
Expand Down Expand Up @@ -247,6 +252,20 @@ def __repr__(self):
return f"<WrappingLoader source_container={self._source_container}>"


class BaseLibraryResolver(abc.ABC):

def __init__(self, next_resolver: BaseLibraryResolver | None):
self._next_resolver = next_resolver

@abc.abstractmethod
def with_next_resolver(self, resolver: BaseLibraryResolver) -> BaseLibraryResolver:
"""required to create a linked list of resolvers"""

def resolve_library(self, path_lookup: PathLookup, library: str) -> MaybeDependency:
assert self._next_resolver is not None
return self._next_resolver.resolve_library(path_lookup, library)


class BaseNotebookResolver(abc.ABC):

@abc.abstractmethod
Expand Down Expand Up @@ -284,6 +303,22 @@ def resolve_local_file(self, path_lookup, path: Path) -> MaybeDependency:
"""locates a file"""


class StubLibraryResolver(BaseLibraryResolver):

def __init__(self):
super().__init__(None)

def with_next_resolver(self, resolver: BaseLibraryResolver) -> BaseLibraryResolver:
raise NotImplementedError("Should never happen!")

def resolve_library(self, path_lookup: PathLookup, library: str) -> MaybeDependency:
return self._fail('library-not-found', f"Could not resolve library: {library}")

@staticmethod
def _fail(code: str, message: str):
return MaybeDependency(None, [DependencyProblem(code, message)])


class StubImportResolver(BaseImportResolver):

def __init__(self):
Expand Down Expand Up @@ -316,14 +351,24 @@ class DependencyResolver:

def __init__(
self,
library_resolvers: list[BaseLibraryResolver],
notebook_resolver: BaseNotebookResolver,
import_resolvers: list[BaseImportResolver],
path_lookup: PathLookup,
):
self._library_resolver = self._chain_library_resolvers(library_resolvers)
self._notebook_resolver = notebook_resolver
self._import_resolver = self._chain_import_resolvers(import_resolvers)
self._path_lookup = path_lookup

@staticmethod
def _chain_library_resolvers(library_resolvers: list[BaseLibraryResolver]) -> BaseLibraryResolver:
previous: BaseLibraryResolver = StubLibraryResolver()
for resolver in library_resolvers:
resolver = resolver.with_next_resolver(previous)
previous = resolver
return previous

@staticmethod
def _chain_import_resolvers(import_resolvers: list[BaseImportResolver]) -> BaseImportResolver:
previous: BaseImportResolver = StubImportResolver()
Expand All @@ -338,6 +383,9 @@ def resolve_notebook(self, path_lookup: PathLookup, path: Path) -> MaybeDependen
def resolve_import(self, path_lookup: PathLookup, name: str) -> MaybeDependency:
return self._import_resolver.resolve_import(path_lookup, name)

def resolve_library(self, path_lookup: PathLookup, library: str) -> MaybeDependency:
return self._library_resolver.resolve_library(path_lookup, library)

def build_local_file_dependency_graph(self, path: Path) -> MaybeGraph:
"""Builds a dependency graph starting from a file. This method is mainly intended for testing purposes.
In case of problems, the paths in the problems will be relative to the starting path lookup."""
Expand Down
27 changes: 12 additions & 15 deletions src/databricks/labs/ucx/source_code/jobs.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,12 +15,12 @@
from databricks.labs.ucx.source_code.base import CurrentSessionState
from databricks.labs.ucx.source_code.files import LocalFile
from databricks.labs.ucx.source_code.graph import (
Dependency,
DependencyGraph,
SourceContainer,
DependencyProblem,
Dependency,
WrappingLoader,
DependencyResolver,
SourceContainer,
WrappingLoader,
)
from databricks.labs.ucx.source_code.languages import Languages
from databricks.labs.ucx.source_code.notebooks.sources import Notebook, NotebookLinter, FileLinter
Expand Down Expand Up @@ -76,32 +76,29 @@ def _register_task_dependencies(self, graph: DependencyGraph) -> Iterable[Depend
yield from self._register_existing_cluster_id(graph)
yield from self._register_spark_submit_task(graph)

def _register_libraries(self, graph: DependencyGraph):
def _register_libraries(self, graph: DependencyGraph) -> Iterable[DependencyProblem]:
if not self._task.libraries:
return
for library in self._task.libraries:
yield from self._lint_library(library, graph)
yield from self._register_library(graph, library)

def _lint_library(self, library: compute.Library, graph: DependencyGraph) -> Iterable[DependencyProblem]:
@staticmethod
def _register_library(graph: DependencyGraph, library: compute.Library) -> Iterable[DependencyProblem]:
if library.pypi:
# TODO: https://github.com/databrickslabs/ucx/issues/1642
maybe = graph.register_library(library.pypi.package)
if maybe.problems:
yield from maybe.problems
problems = graph.register_library(library.pypi.package)
if problems:
yield from problems
if library.jar:
# TODO: https://github.com/databrickslabs/ucx/issues/1641
yield DependencyProblem('not-yet-implemented', 'Jar library is not yet implemented')
if library.egg:
# TODO: https://github.com/databrickslabs/ucx/issues/1643
maybe = graph.register_library(library.egg)
if maybe.problems:
yield from maybe.problems
yield DependencyProblem("not-yet-implemented", "Egg library is not yet implemented")
if library.whl:
# TODO: download the wheel somewhere local and add it to "virtual sys.path" via graph.path_lookup.push_path
# TODO: https://github.com/databrickslabs/ucx/issues/1640
maybe = graph.register_library(library.whl)
if maybe.problems:
yield from maybe.problems
yield DependencyProblem("not-yet-implemented", "Wheel library is not yet implemented")
if library.requirements:
# TODO: download and add every entry via graph.register_library
# TODO: https://github.com/databrickslabs/ucx/issues/1644
Expand Down
13 changes: 11 additions & 2 deletions src/databricks/labs/ucx/source_code/notebooks/cells.py
Original file line number Diff line number Diff line change
Expand Up @@ -182,9 +182,18 @@ def language(self):
def is_runnable(self) -> bool:
return True # TODO

def build_dependency_graph(self, _: DependencyGraph) -> list[DependencyProblem]:
def build_dependency_graph(self, graph: DependencyGraph) -> list[DependencyProblem]:
# TODO: https://github.com/databrickslabs/ucx/issues/1642
return []
# TODO: this is very basic code, we need to improve it
splits = self.original_code.split(' ')
if len(splits) < 3:
return [DependencyProblem("library-install-failed", f"Missing arguments in '{self.original_code}'")]
if splits[1] != "install":
return [DependencyProblem("library-install-failed", f"Unsupported %pip command: {splits[1]}")]
# TODO: we need to support different formats of the library name and etc
library = splits[2]

return graph.register_library(library)


class CellLanguage(Enum):
Expand Down
45 changes: 45 additions & 0 deletions src/databricks/labs/ucx/source_code/site_packages.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,50 @@
from __future__ import annotations

import os
import subprocess
import tempfile
from functools import cached_property
from pathlib import Path
from subprocess import CalledProcessError

from databricks.labs.ucx.source_code.graph import BaseLibraryResolver, DependencyProblem, MaybeDependency
from databricks.labs.ucx.source_code.path_lookup import PathLookup


class PipResolver(BaseLibraryResolver):
# TODO: use DistInfoResolver to load wheel/egg/pypi dependencies
# TODO: https://github.com/databrickslabs/ucx/issues/1642
# TODO: https://github.com/databrickslabs/ucx/issues/1643
# TODO: https://github.com/databrickslabs/ucx/issues/1640

def __init__(self, next_resolver: BaseLibraryResolver | None = None) -> None:
super().__init__(next_resolver)

def with_next_resolver(self, resolver: BaseLibraryResolver) -> PipResolver:
return PipResolver(resolver)

def resolve_library(self, path_lookup: PathLookup, library: str) -> MaybeDependency:
"""Pip install library and augment path look-up to resolve the library at import"""
# invoke pip install via subprocess to install this library into lib_install_folder
pip_install_arguments = ["pip", "install", library, "-t", self._temporary_virtual_environment.as_posix()]
try:
subprocess.run(pip_install_arguments, check=True)
except CalledProcessError as e:
problem = DependencyProblem("library-install-failed", f"Failed to install {library}: {e}")
return MaybeDependency(None, [problem])

path_lookup.append_path(self._temporary_virtual_environment)
return MaybeDependency(None, [])

@cached_property
def _temporary_virtual_environment(self) -> Path:
# TODO: for `databricks labs ucx lint-local-code`, detect if we already have a virtual environment
# and use that one. See Databricks CLI code for the labs command to see how to detect the virtual
# environment. If we don't have a virtual environment, create a temporary one.
# simulate notebook-scoped virtual environment
lib_install_folder = tempfile.mkdtemp(prefix="ucx-")
return Path(lib_install_folder)


COMMENTED_OUT_FOR_PR_1685 = """
class SitePackageContainer(SourceContainer):
Expand Down Expand Up @@ -42,6 +85,8 @@ def __init__(self, packages: list[SitePackage]):
self._packages[top_level] = package

def __getitem__(self, item: str) -> SitePackage | None:
# TODO: Replace hyphen with underscores
# TODO: Don't use get, raise KeyError
return self._packages.get(item, None)


Expand Down
9 changes: 0 additions & 9 deletions tests/integration/contexts/test_global_context.py

This file was deleted.

File renamed without changes.
12 changes: 12 additions & 0 deletions tests/unit/contexts/test_application.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
import pytest

from databricks.labs.ucx.contexts.application import GlobalContext


@pytest.mark.parametrize("attribute", ["dependency_resolver", "pip_resolver", "site_packages", "site_packages_path"])
def test_global_context_attributes_not_none(attribute: str):
"""Attributes should be not None"""
# Goal is to improve test coverage
ctx = GlobalContext()
assert hasattr(ctx, attribute)
assert getattr(ctx, attribute) is not None
Empty file.
88 changes: 88 additions & 0 deletions tests/unit/source_code/notebooks/test_cells.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,88 @@
from pathlib import Path
from unittest.mock import create_autospec

from databricks.labs.ucx.source_code.graph import Dependency, DependencyGraph, DependencyResolver
from databricks.labs.ucx.source_code.files import FileLoader
from databricks.labs.ucx.source_code.notebooks.cells import CellLanguage, PipCell
from databricks.labs.ucx.source_code.notebooks.loaders import (
NotebookResolver,
NotebookLoader,
)
from databricks.labs.ucx.source_code.site_packages import PipResolver


def test_pip_cell_language_is_pip():
assert PipCell("code").language == CellLanguage.PIP


def test_pip_cell_build_dependency_graph_invokes_register_library():
graph = create_autospec(DependencyGraph)

code = "%pip install databricks"
cell = PipCell(code)

problems = cell.build_dependency_graph(graph)

assert len(problems) == 0
graph.register_library.assert_called_once_with("databricks")


def test_pip_cell_build_dependency_graph_pip_registers_missing_library():
graph = create_autospec(DependencyGraph)

code = "%pip install"
cell = PipCell(code)

problems = cell.build_dependency_graph(graph)

assert len(problems) == 1
assert problems[0].code == "library-install-failed"
assert problems[0].message == "Missing arguments in '%pip install'"
graph.register_library.assert_not_called()


def test_pip_cell_build_dependency_graph_reports_incorrect_syntax():
graph = create_autospec(DependencyGraph)

code = "%pip installl pytest" # typo on purpose
cell = PipCell(code)

problems = cell.build_dependency_graph(graph)

assert len(problems) == 1
assert problems[0].code == "library-install-failed"
assert problems[0].message == "Unsupported %pip command: installl"
graph.register_library.assert_not_called()


def test_pip_cell_build_dependency_graph_reports_unknown_library(mock_path_lookup):
dependency = Dependency(FileLoader(), Path("test"))
notebook_loader = NotebookLoader()
notebook_resolver = NotebookResolver(notebook_loader)
dependency_resolver = DependencyResolver([PipResolver()], notebook_resolver, [], mock_path_lookup)
graph = DependencyGraph(dependency, None, dependency_resolver, mock_path_lookup)

code = "%pip install unknown-library-name"
cell = PipCell(code)

problems = cell.build_dependency_graph(graph)

assert len(problems) == 1
assert problems[0].code == "library-install-failed"
assert problems[0].message.startswith("Failed to install unknown-library-name")


def test_pip_cell_build_dependency_graph_resolves_installed_library(mock_path_lookup):
dependency = Dependency(FileLoader(), Path("test"))
notebook_loader = NotebookLoader()
notebook_resolver = NotebookResolver(notebook_loader)
dependency_resolver = DependencyResolver([PipResolver()], notebook_resolver, [], mock_path_lookup)
graph = DependencyGraph(dependency, None, dependency_resolver, mock_path_lookup)

code = "%pip install pytest"
cell = PipCell(code)

problems = cell.build_dependency_graph(graph)

assert len(problems) == 0
assert graph.path_lookup.resolve(Path("pytest")).exists()
Loading

0 comments on commit 08aed28

Please sign in to comment.