Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Move ExternalLocations and Mounts to locations module #692

Merged
merged 1 commit
Dec 14, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions src/databricks/labs/ucx/hive_metastore/__init__.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
from databricks.labs.ucx.hive_metastore.grants import GrantsCrawler
from databricks.labs.ucx.hive_metastore.mounts import Mounts
from databricks.labs.ucx.hive_metastore.locations import ExternalLocations, Mounts
from databricks.labs.ucx.hive_metastore.tables import TablesCrawler

__all__ = ["TablesCrawler", "GrantsCrawler", "Mounts"]
__all__ = ["TablesCrawler", "GrantsCrawler", "Mounts", "ExternalLocations"]
14 changes: 11 additions & 3 deletions src/databricks/labs/ucx/hive_metastore/hms_lineage.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,9 +4,17 @@
from databricks.sdk import WorkspaceClient
from databricks.sdk.service.compute import GlobalInitScriptDetailsWithContent

from databricks.labs.ucx.hive_metastore.hms_lineage_global_init_script import (
global_init_script,
)
# Bash global init script executed on cluster nodes: on the driver only
# (guarded by $DB_IS_DRIVER), append the config flag that enables Databricks
# data-lineage collection to the driver's spark-branch.conf, creating the
# file first if it does not exist.
# NOTE(review): runtime string installed verbatim as a workspace global init
# script — do not reformat or re-indent its contents.
global_init_script = """if [[ $DB_IS_DRIVER = "TRUE" ]]; then
driver_conf=${DB_HOME}/driver/conf/spark-branch.conf
if [ ! -e $driver_conf ] ; then
touch $driver_conf
fi
cat << EOF >> $driver_conf
[driver] {
"spark.databricks.dataLineage.enabled" = true
}
EOF
fi"""


class HiveMetastoreLineageEnabler:
Expand Down

This file was deleted.

Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,6 @@
from databricks.sdk import WorkspaceClient

from databricks.labs.ucx.framework.crawlers import CrawlerBase, SqlBackend
from databricks.labs.ucx.hive_metastore.mounts import Mounts
from databricks.labs.ucx.mixins.sql import Row

logger = logging.getLogger(__name__)
Expand All @@ -18,7 +17,13 @@ class ExternalLocation:
location: str


class ExternalLocationCrawler(CrawlerBase):
@dataclass
class Mount:
    """A single workspace mount point and the storage location backing it."""

    # Mount point path, e.g. "/mnt/foo".
    name: str
    # URI of the backing storage, e.g. "s3://bar".
    source: str


class ExternalLocations(CrawlerBase):
_prefix_size: typing.ClassVar[list[int]] = [1, 12]

def __init__(self, ws: WorkspaceClient, sbe: SqlBackend, schema):
Expand Down Expand Up @@ -106,3 +111,36 @@ def snapshot(self) -> list[ExternalLocation]:
def _try_fetch(self) -> list[ExternalLocation]:
for row in self._fetch(f"SELECT * FROM {self._schema}.{self._table}"):
yield ExternalLocation(*row)


class Mounts(CrawlerBase):
    """Crawler that inventories workspace mount points.

    Mounts are listed through ``dbutils.fs.mounts()`` and persisted to the
    ``mounts`` table of the given inventory database.
    """

    def __init__(self, backend: SqlBackend, ws: WorkspaceClient, inventory_database: str):
        super().__init__(backend, "hive_metastore", inventory_database, "mounts", Mount)
        self._dbutils = ws.dbutils

    def _deduplicate_mounts(self, mounts: list) -> list:
        # Keep only the first occurrence of each (name, source) pair;
        # dict insertion order preserves the original ordering.
        unique: dict = {}
        for mount in mounts:
            unique.setdefault((mount.name, mount.source), mount)
        return list(unique.values())

    def inventorize_mounts(self):
        """Crawl the workspace mounts and append them to the inventory table."""
        self._append_records(self._list_mounts())

    def _list_mounts(self):
        # Each entry unpacks into (mount_point, source, <third field>);
        # the third field is deliberately ignored.
        discovered = [Mount(mount_point, source) for mount_point, source, _ in self._dbutils.fs.mounts()]
        return self._deduplicate_mounts(discovered)

    def snapshot(self) -> list[Mount]:
        """Return the cached mounts, crawling the workspace on a cache miss."""
        return self._snapshot(self._try_fetch, self._list_mounts)

    def _try_fetch(self) -> list[Mount]:
        query = f"SELECT * FROM {self._schema}.{self._table}"
        for row in self._fetch(query):
            yield Mount(*row)
47 changes: 0 additions & 47 deletions src/databricks/labs/ucx/hive_metastore/mounts.py

This file was deleted.

3 changes: 1 addition & 2 deletions src/databricks/labs/ucx/install.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,10 +44,9 @@
from databricks.labs.ucx.framework.dashboards import DashboardFromFiles
from databricks.labs.ucx.framework.install_state import InstallState
from databricks.labs.ucx.framework.tasks import _TASKS, Task
from databricks.labs.ucx.hive_metastore.data_objects import ExternalLocation
from databricks.labs.ucx.hive_metastore.grants import Grant
from databricks.labs.ucx.hive_metastore.hms_lineage import HiveMetastoreLineageEnabler
from databricks.labs.ucx.hive_metastore.mounts import Mount
from databricks.labs.ucx.hive_metastore.locations import ExternalLocation, Mount
from databricks.labs.ucx.hive_metastore.tables import Table, TableError
from databricks.labs.ucx.runtime import main
from databricks.labs.ucx.workspace_access.base import Permissions
Expand Down
11 changes: 7 additions & 4 deletions src/databricks/labs/ucx/runtime.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,9 +14,12 @@
from databricks.labs.ucx.config import WorkspaceConfig
from databricks.labs.ucx.framework.crawlers import RuntimeBackend
from databricks.labs.ucx.framework.tasks import task, trigger
from databricks.labs.ucx.hive_metastore import GrantsCrawler, TablesCrawler
from databricks.labs.ucx.hive_metastore.data_objects import ExternalLocationCrawler
from databricks.labs.ucx.hive_metastore.mounts import Mounts
from databricks.labs.ucx.hive_metastore import (
ExternalLocations,
GrantsCrawler,
Mounts,
TablesCrawler,
)
from databricks.labs.ucx.workspace_access.generic import WorkspaceListing
from databricks.labs.ucx.workspace_access.groups import GroupManager
from databricks.labs.ucx.workspace_access.manager import PermissionManager
Expand Down Expand Up @@ -78,7 +81,7 @@ def guess_external_locations(cfg: WorkspaceConfig):
- Scanning all these locations to identify folders that can act as shared path prefixes
- These identified external locations will be created subsequently prior to the actual table migration"""
ws = WorkspaceClient(config=cfg.to_databricks_config())
crawler = ExternalLocationCrawler(ws, RuntimeBackend(), cfg.inventory_database)
crawler = ExternalLocations(ws, RuntimeBackend(), cfg.inventory_database)
crawler.snapshot()


Expand Down
6 changes: 3 additions & 3 deletions tests/integration/hive_metastore/test_external_locations.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import logging

from databricks.labs.ucx.hive_metastore.data_objects import ExternalLocationCrawler
from databricks.labs.ucx.hive_metastore.mounts import Mount
from databricks.labs.ucx.hive_metastore.data_objects import ExternalLocations
from databricks.labs.ucx.hive_metastore.locations import Mount
from databricks.labs.ucx.hive_metastore.tables import Table

logger = logging.getLogger(__name__)
Expand Down Expand Up @@ -59,7 +59,7 @@ def test_external_locations(ws, sql_backend, inventory_schema, env_or_skip):
sql_backend.save_table(f"{inventory_schema}.tables", tables, Table)
sql_backend.save_table(f"{inventory_schema}.mounts", [Mount("/mnt/foo", "s3://bar")], Mount)

crawler = ExternalLocationCrawler(ws, sql_backend, inventory_schema)
crawler = ExternalLocations(ws, sql_backend, inventory_schema)
results = crawler.snapshot()
assert len(results) == 6
assert results[1].location == "s3://bar/test3/"
Expand Down
2 changes: 1 addition & 1 deletion tests/integration/hive_metastore/test_mounts.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import pytest

from databricks.labs.ucx.hive_metastore.mounts import Mount
from databricks.labs.ucx.hive_metastore.locations import Mount
from databricks.labs.ucx.mixins.compute import CommandExecutor


Expand Down
59 changes: 0 additions & 59 deletions tests/unit/assessment/test_assessment.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,9 +38,6 @@
PipelinesCrawler,
spark_version_compatibility,
)
from databricks.labs.ucx.hive_metastore.data_objects import ExternalLocationCrawler
from databricks.labs.ucx.hive_metastore.mounts import Mount
from databricks.labs.ucx.mixins.sql import Row
from tests.unit.framework.mocks import MockBackend

_SECRET_PATTERN = r"{{(secrets.*?)}}"
Expand All @@ -63,62 +60,6 @@ def test_spark_version_compatibility():
assert "unsupported" == spark_version_compatibility("x14.1.x-photon-scala2.12")


def test_external_locations():
crawler = ExternalLocationCrawler(Mock(), MockBackend(), "test")
row_factory = type("Row", (Row,), {"__columns__": ["location", "storage_properties"]})
sample_locations = [
row_factory(["s3://us-east-1-dev-account-staging-uc-ext-loc-bucket-1/Location/Table", ""]),
row_factory(["s3://us-east-1-dev-account-staging-uc-ext-loc-bucket-1/Location/Table2", ""]),
row_factory(["s3://us-east-1-dev-account-staging-uc-ext-loc-bucket-23/testloc/Table3", ""]),
row_factory(["s3://us-east-1-dev-account-staging-uc-ext-loc-bucket-23/anotherloc/Table4", ""]),
row_factory(["dbfs:/mnt/ucx/database1/table1", ""]),
row_factory(["dbfs:/mnt/ucx/database2/table2", ""]),
row_factory(["DatabricksRootmntDatabricksRoot", ""]),
row_factory(
[
"jdbc:databricks://",
"[personalAccessToken=*********(redacted), \
httpPath=/sql/1.0/warehouses/65b52fb5bd86a7be, host=dbc-test1-aa11.cloud.databricks.com, \
dbtable=samples.nyctaxi.trips]",
]
),
row_factory(
[
"jdbc:/MYSQL",
"[database=test_db, host=somemysql.us-east-1.rds.amazonaws.com, \
port=3306, dbtable=movies, user=*********(redacted), password=*********(redacted)]",
]
),
row_factory(
[
"jdbc:providerknown:/",
"[database=test_db, host=somedb.us-east-1.rds.amazonaws.com, \
port=1234, dbtable=sometable, user=*********(redacted), password=*********(redacted), \
provider=providerknown]",
]
),
row_factory(
[
"jdbc:providerunknown:/",
"[database=test_db, host=somedb.us-east-1.rds.amazonaws.com, \
port=1234, dbtable=sometable, user=*********(redacted), password=*********(redacted)]",
]
),
]
sample_mounts = [Mount("/mnt/ucx", "s3://us-east-1-ucx-container")]
result_set = crawler._external_locations(sample_locations, sample_mounts)
assert len(result_set) == 7
assert result_set[0].location == "s3://us-east-1-dev-account-staging-uc-ext-loc-bucket-1/Location/"
assert result_set[1].location == "s3://us-east-1-dev-account-staging-uc-ext-loc-bucket-23/"
assert (
result_set[3].location
== "jdbc:databricks://dbc-test1-aa11.cloud.databricks.com;httpPath=/sql/1.0/warehouses/65b52fb5bd86a7be"
)
assert result_set[4].location == "jdbc:mysql://somemysql.us-east-1.rds.amazonaws.com:3306/test_db"
assert result_set[5].location == "jdbc:providerknown://somedb.us-east-1.rds.amazonaws.com:1234/test_db"
assert result_set[6].location == "jdbc:providerunknown://somedb.us-east-1.rds.amazonaws.com:1234/test_db"


def test_job_assessment():
sample_jobs = [
BaseJob(
Expand Down
101 changes: 101 additions & 0 deletions tests/unit/hive_metastore/test_locations.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,101 @@
from unittest.mock import MagicMock, Mock

from databricks.sdk.dbutils import MountInfo

from databricks.labs.ucx.hive_metastore.locations import (
ExternalLocations,
Mount,
Mounts,
)
from databricks.labs.ucx.mixins.sql import Row
from tests.unit.framework.mocks import MockBackend


def test_list_mounts_should_return_a_list_of_mount_without_encryption_type():
    """Inventorizing mounts appends every mount, dropping the third field."""
    # Given a workspace client exposing three distinct mounts
    ws = MagicMock()
    ws.dbutils.fs.mounts.return_value = [
        MountInfo("mp_1", "path_1", "info_1"),
        MountInfo("mp_2", "path_2", "info_2"),
        MountInfo("mp_3", "path_3", "info_3"),
    ]
    sql_backend = MockBackend()

    # When the mounts are inventorized
    Mounts(sql_backend, ws, "test").inventorize_mounts()

    # Then all three rows are appended as (name, source) pairs
    written = sql_backend.rows_written_for("hive_metastore.test.mounts", "append")
    assert written == [Mount("mp_1", "path_1"), Mount("mp_2", "path_2"), Mount("mp_3", "path_3")]


def test_list_mounts_should_return_a_deduped_list_of_mount_without_encryption_type():
    """Duplicate (name, source) mounts are collapsed to a single row."""
    # Given a workspace client that reports the same mount twice
    ws = MagicMock()
    ws.dbutils.fs.mounts.return_value = [
        MountInfo("mp_1", "path_1", "info_1"),
        MountInfo("mp_2", "path_2", "info_2"),
        MountInfo("mp_2", "path_2", "info_2"),
    ]
    sql_backend = MockBackend()

    # When the mounts are inventorized
    Mounts(sql_backend, ws, "test").inventorize_mounts()

    # Then only the deduplicated pair of rows is appended
    written = sql_backend.rows_written_for("hive_metastore.test.mounts", "append")
    assert written == [Mount("mp_1", "path_1"), Mount("mp_2", "path_2")]


def test_external_locations():
    """Exercise ExternalLocations._external_locations with s3, dbfs-mount and
    jdbc table locations and check the derived external-location prefixes."""
    crawler = ExternalLocations(Mock(), MockBackend(), "test")
    # Minimal Row subclass exposing the two columns the crawler reads.
    row_factory = type("Row", (Row,), {"__columns__": ["location", "storage_properties"]})
    sample_locations = [
        # Two tables sharing one s3 prefix, and two under a second bucket.
        row_factory(["s3://us-east-1-dev-account-staging-uc-ext-loc-bucket-1/Location/Table", ""]),
        row_factory(["s3://us-east-1-dev-account-staging-uc-ext-loc-bucket-1/Location/Table2", ""]),
        row_factory(["s3://us-east-1-dev-account-staging-uc-ext-loc-bucket-23/testloc/Table3", ""]),
        row_factory(["s3://us-east-1-dev-account-staging-uc-ext-loc-bucket-23/anotherloc/Table4", ""]),
        # dbfs locations under the /mnt/ucx mount declared in sample_mounts below.
        row_factory(["dbfs:/mnt/ucx/database1/table1", ""]),
        row_factory(["dbfs:/mnt/ucx/database2/table2", ""]),
        row_factory(["DatabricksRootmntDatabricksRoot", ""]),
        # jdbc locations: host/port/database are carried in storage_properties.
        # NOTE(review): the backslash-continuation lines below are INSIDE the
        # string literals, so their leading whitespace is part of the data —
        # kept exactly as-is.
        row_factory(
            [
                "jdbc:databricks://",
                "[personalAccessToken=*********(redacted), \
httpPath=/sql/1.0/warehouses/65b52fb5bd86a7be, host=dbc-test1-aa11.cloud.databricks.com, \
dbtable=samples.nyctaxi.trips]",
            ]
        ),
        row_factory(
            [
                "jdbc:/MYSQL",
                "[database=test_db, host=somemysql.us-east-1.rds.amazonaws.com, \
port=3306, dbtable=movies, user=*********(redacted), password=*********(redacted)]",
            ]
        ),
        row_factory(
            [
                "jdbc:providerknown:/",
                "[database=test_db, host=somedb.us-east-1.rds.amazonaws.com, \
port=1234, dbtable=sometable, user=*********(redacted), password=*********(redacted), \
provider=providerknown]",
            ]
        ),
        row_factory(
            [
                "jdbc:providerunknown:/",
                "[database=test_db, host=somedb.us-east-1.rds.amazonaws.com, \
port=1234, dbtable=sometable, user=*********(redacted), password=*********(redacted)]",
            ]
        ),
    ]
    sample_mounts = [Mount("/mnt/ucx", "s3://us-east-1-ucx-container")]
    result_set = crawler._external_locations(sample_locations, sample_mounts)
    # 7 distinct external locations derived from the 11 sample rows.
    assert len(result_set) == 7
    # Shared s3 prefixes are collapsed to the common parent path.
    assert result_set[0].location == "s3://us-east-1-dev-account-staging-uc-ext-loc-bucket-1/Location/"
    assert result_set[1].location == "s3://us-east-1-dev-account-staging-uc-ext-loc-bucket-23/"
    # jdbc specs are normalized into jdbc:<provider>://<host>[:port]/<db> URLs.
    assert (
        result_set[3].location
        == "jdbc:databricks://dbc-test1-aa11.cloud.databricks.com;httpPath=/sql/1.0/warehouses/65b52fb5bd86a7be"
    )
    assert result_set[4].location == "jdbc:mysql://somemysql.us-east-1.rds.amazonaws.com:3306/test_db"
    assert result_set[5].location == "jdbc:providerknown://somedb.us-east-1.rds.amazonaws.com:1234/test_db"
    assert result_set[6].location == "jdbc:providerunknown://somedb.us-east-1.rds.amazonaws.com:1234/test_db"
Loading
Loading