Added upstream wheel uploads for Databricks Workspaces without Public Internet access (#99)

The output of the build-wheel function is now a list of all the dependent wheel packages. Upload functions get a new flag as input to either include the dependencies in the download or limit it to just the main wheel package.
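Sketched usage of the two upload paths (illustrative only; it mirrors the README example added below, and the prefix value is just a placeholder):

```python
from databricks.sdk import WorkspaceClient
from databricks.labs.blueprint.wheels import ProductInfo

ws = WorkspaceClient()
product_info = ProductInfo(__file__)

with product_info.wheels(ws) as wheels:
    # unchanged behaviour: upload only the main wheel
    main_wheel = wheels.upload_to_wsfs()
    # new behaviour: upload only the dependency wheels whose names match the prefixes
    dependency_wheels = wheels.upload_wheel_dependencies(["databricks_sdk"])
```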

Relates to databrickslabs/ucx#573

---------

Co-authored-by: Serge Smertin <259697+nfx@users.noreply.github.com>
aminmovahed-db and nfx authored May 12, 2024
1 parent c959367 commit 50b5474
Showing 4 changed files with 71 additions and 9 deletions.
27 changes: 27 additions & 0 deletions README.md
@@ -933,6 +933,33 @@ This will print something like:

You can also do `wheels.upload_to_dbfs()`, though you're not able to set any access control over it.
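For reference, a minimal sketch of the DBFS variant (same `WorkspaceClient`/`ProductInfo` setup as the workspace example below; the exact remote path layout is up to the installation):

```python
from databricks.sdk import WorkspaceClient
from databricks.labs.blueprint.wheels import ProductInfo

ws = WorkspaceClient()
product_info = ProductInfo(__file__)

with product_info.wheels(ws) as wheels:
    # builds the wheel and uploads it to DBFS; no access control can be set on it
    remote_wheel = wheels.upload_to_dbfs()
    print(f'Uploaded wheel to {remote_wheel}')
```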

### Publishing upstream dependencies to Databricks Workspace without Public Internet access

A Python wheel may have dependencies that are not included in the wheel itself. These are usually other Python packages that your wheel relies on. During installation on regular Databricks Workspaces, these dependencies are automatically fetched from the [Python Package Index](https://pypi.org/).

Some Databricks Workspaces are configured with extra layers of network security that block all access to the public Internet, including the [Python Package Index](https://pypi.org/). For installations to work on these kinds of workspaces, developers need to explicitly upload all upstream dependencies of their applications.

The `upload_wheel_dependencies(prefixes)` method uploads these dependencies to the Databricks Workspace. It takes a list of prefixes as an argument and uploads every dependency of the wheel whose name starts with any of the provided prefixes.

Here is an example of how you can use this method:

```python
from databricks.sdk import WorkspaceClient
from databricks.labs.blueprint.wheels import ProductInfo

ws = WorkspaceClient()
product_info = ProductInfo(__file__)
installation = product_info.current_installation(ws)

with product_info.wheels(ws) as wheels:
wheel_paths = wheels.upload_wheel_dependencies(['databricks_sdk', 'pandas'])
for path in wheel_paths:
print(f'Uploaded dependency to {path}')
```

In this example, the `upload_wheel_dependencies(['databricks_sdk', 'pandas'])` call uploads all dependencies of the wheel whose names start with `databricks_sdk` or `pandas`. Platform-specific dependencies (i.e. wheels that do not end with `-none-any.whl`) are skipped, and the main wheel file itself is not uploaded, as that is handled by `upload_to_wsfs()`. The method returns a list of paths to the uploaded dependencies on WorkspaceFS.
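For illustration only (this helper is not part of the library), the selection boils down to a prefix match plus the pure-Python wheel naming convention, roughly:

```python
from pathlib import Path

def matching_dependency_wheels(wheel_dir: str, prefixes: list[str], main_wheel: str) -> list[Path]:
    """Illustrative filter: pick pure-Python dependency wheels by name prefix."""
    selected = []
    for wheel in Path(wheel_dir).glob("*.whl"):
        if not wheel.name.endswith("-none-any.whl"):
            continue  # platform-specific wheel, skipped
        if wheel.name == main_wheel:
            continue  # the main wheel is uploaded separately via upload_to_wsfs()
        if any(wheel.name.startswith(prefix) for prefix in prefixes):
            selected.append(wheel)
    return selected

# e.g. matching_dependency_wheels("/tmp/wheels", ["databricks_sdk", "pandas"], "myapp-1.0-py3-none-any.whl")
```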


[[back to top](#databricks-labs-blueprint)]

## Databricks CLI's `databricks labs ...` Router
39 changes: 30 additions & 9 deletions src/databricks/labs/blueprint/wheels.py
@@ -232,6 +232,24 @@ def upload_to_wsfs(self) -> str:
        self._installation.save(Version(self._product_info.version(), remote_wheel, self._now_iso()))
        return remote_wheel

    def upload_wheel_dependencies(self, prefixes: list[str]) -> list[str]:
        """Uploads the wheel dependencies to WSFS location of installation and returns the remote paths.
        :param prefixes: A list of prefixes to match against the wheel names. If a prefix matches, the wheel is uploaded.
        """
        remote_paths = []
        for wheel in self._build_wheel(self._tmp_dir.name, verbose=self._verbose, no_deps=False, dirs_exist_ok=True):
            if not wheel.name.endswith("-none-any.whl"):
                # skip platform-specific wheels; only pure-Python dependency wheels are uploaded
                continue
            # main wheel is uploaded with upload_to_wsfs() method.
            if wheel.name == self._local_wheel.name:
                continue
            for prefix in prefixes:
                if not wheel.name.startswith(prefix):
                    continue
                remote_wheel = self._installation.upload(f"wheels/{wheel.name}", wheel.read_bytes())
                remote_paths.append(remote_wheel)
        return remote_paths

    @staticmethod
    def _now_iso():
        """Returns the current time in ISO format."""
@@ -240,20 +258,21 @@ def _now_iso():
    def __enter__(self) -> "WheelsV2":
        """Builds the wheel and returns the instance. Use it as a context manager."""
        self._tmp_dir = tempfile.TemporaryDirectory()
        self._local_wheel = self._build_wheel(self._tmp_dir.name, verbose=self._verbose)
        self._local_wheel = next(self._build_wheel(self._tmp_dir.name, verbose=self._verbose, no_deps=True))
        return self

    def __exit__(self, __exc_type, __exc_value, __traceback):
        """Cleans up the temporary directory. Use it as a context manager."""
        self._tmp_dir.cleanup()

    def _build_wheel(self, tmp_dir: str, *, verbose: bool = False):
    def _build_wheel(self, tmp_dir: str, *, verbose: bool = False, no_deps: bool = True, dirs_exist_ok: bool = False):
        """Helper to build the wheel package
        :param tmp_dir: str:
        :param *:
        :param verbose: bool: (Default value = False)
        :param no_deps: bool: (Default value = True)
        :param dirs_exist_ok: bool: (Default value = False)
        """
        stdout = subprocess.STDOUT
        stderr = subprocess.STDOUT
@@ -263,18 +282,20 @@ def _build_wheel(self, tmp_dir: str, *, verbose: bool = False):
        checkout_root = self._product_info.checkout_root()
        if self._product_info.is_git_checkout() and self._product_info.is_unreleased_version():
            # working copy becomes project root for building a wheel
            checkout_root = self._copy_root_to(tmp_dir)
            checkout_root = self._copy_root_to(tmp_dir, dirs_exist_ok)
            # and override the version file
            self._override_version_to_unreleased(checkout_root)
        args = [sys.executable, "-m", "pip", "wheel", "--wheel-dir", tmp_dir, checkout_root.as_posix()]
        logger.debug(f"Building wheel for {checkout_root} in {tmp_dir}")
        if no_deps:
            # --no-deps builds only this project's wheel; without it, pip also places all dependency wheels in tmp_dir
            args.append("--no-deps")
        subprocess.run(
            [sys.executable, "-m", "pip", "wheel", "--no-deps", "--wheel-dir", tmp_dir, checkout_root.as_posix()],
            args,
            check=True,
            stdout=stdout,
            stderr=stderr,
        )
        # get wheel name as first file in the temp directory
        return next(Path(tmp_dir).glob("*.whl"))
        return Path(tmp_dir).glob("*.whl")

    def _override_version_to_unreleased(self, tmp_dir_path: Path):
        """Overrides the version file to unreleased version."""
@@ -284,7 +305,7 @@ def _override_version_to_unreleased(self, tmp_dir_path: Path):
        with version_file.open("w") as f:
            f.write(f'__version__ = "{self._product_info.version()}"')

    def _copy_root_to(self, tmp_dir: str | Path):
    def _copy_root_to(self, tmp_dir: str | Path, dirs_exist_ok: bool = False):
        """Copies the root to a temporary directory."""
        checkout_root = self._product_info.checkout_root()
        tmp_dir_path = Path(tmp_dir) / "working-copy"
@@ -299,7 +320,7 @@ def copy_ignore(_, names: list[str]):
                ignored_names.append(name)
            return ignored_names

        shutil.copytree(checkout_root, tmp_dir_path, ignore=copy_ignore)
        shutil.copytree(checkout_root, tmp_dir_path, ignore=copy_ignore, dirs_exist_ok=dirs_exist_ok)
        return tmp_dir_path


3 changes: 3 additions & 0 deletions tests/integration/test_wheels.py
@@ -17,3 +17,6 @@ def test_upload_dbfs(ws, new_installation):
    with WheelsV2(new_installation, product_info) as whl:
        remote_wheel = whl.upload_to_dbfs()
        ws.dbfs.get_status(remote_wheel)


# TODO: add an integration test for upload_wheel_dependencies (currently blocked by an access issue with the test environment)
11 changes: 11 additions & 0 deletions tests/unit/test_wheels.py
@@ -39,6 +39,17 @@ def test_build_and_upload_wheel():
    assert not os.path.exists(wheels._local_wheel)


def test_build_and_dependencies_upload_wheel():
    installation = MockInstallation()
    product_info = ProductInfo.from_class(MockInstallation)

    wheels = WheelsV2(installation, product_info)
    with wheels:
        wheel_paths = wheels.upload_wheel_dependencies(["databricks_sdk"])
        assert len(wheel_paths) == 1
        installation.assert_file_uploaded(re.compile("wheels/databricks_sdk-*"))


def test_unreleased_version(tmp_path):
    if not is_in_debug():
        pytest.skip("fails without `git fetch --prune --unshallow` configured")
