diff --git a/src/databricks/labs/ucx/hive_metastore/__init__.py b/src/databricks/labs/ucx/hive_metastore/__init__.py
index b479a1edaf..baba5b9564 100644
--- a/src/databricks/labs/ucx/hive_metastore/__init__.py
+++ b/src/databricks/labs/ucx/hive_metastore/__init__.py
@@ -1,5 +1,5 @@
 from databricks.labs.ucx.hive_metastore.grants import GrantsCrawler
-from databricks.labs.ucx.hive_metastore.mounts import Mounts
+from databricks.labs.ucx.hive_metastore.locations import ExternalLocations, Mounts
 from databricks.labs.ucx.hive_metastore.tables import TablesCrawler

-__all__ = ["TablesCrawler", "GrantsCrawler", "Mounts"]
+__all__ = ["TablesCrawler", "GrantsCrawler", "Mounts", "ExternalLocations"]
diff --git a/src/databricks/labs/ucx/hive_metastore/hms_lineage.py b/src/databricks/labs/ucx/hive_metastore/hms_lineage.py
index 7c439de69c..fcdfcacc1c 100644
--- a/src/databricks/labs/ucx/hive_metastore/hms_lineage.py
+++ b/src/databricks/labs/ucx/hive_metastore/hms_lineage.py
@@ -4,9 +4,17 @@
 from databricks.sdk import WorkspaceClient
 from databricks.sdk.service.compute import GlobalInitScriptDetailsWithContent

-from databricks.labs.ucx.hive_metastore.hms_lineage_global_init_script import (
-    global_init_script,
-)
+global_init_script = """if [[ $DB_IS_DRIVER = "TRUE" ]]; then
+  driver_conf=${DB_HOME}/driver/conf/spark-branch.conf
+  if [ ! -e $driver_conf ] ; then
+    touch $driver_conf
+  fi
+cat << EOF >> $driver_conf
+  [driver] {
+    "spark.databricks.dataLineage.enabled" = true
+  }
+EOF
+fi"""


 class HiveMetastoreLineageEnabler:
diff --git a/src/databricks/labs/ucx/hive_metastore/hms_lineage_global_init_script.py b/src/databricks/labs/ucx/hive_metastore/hms_lineage_global_init_script.py
deleted file mode 100644
index cd08497b5d..0000000000
--- a/src/databricks/labs/ucx/hive_metastore/hms_lineage_global_init_script.py
+++ /dev/null
@@ -1,11 +0,0 @@
-global_init_script = """if [[ $DB_IS_DRIVER = "TRUE" ]]; then
-  driver_conf=${DB_HOME}/driver/conf/spark-branch.conf
-  if [ ! -e $driver_conf ] ; then
-    touch $driver_conf
-  fi
-cat << EOF >> $driver_conf
-  [driver] {
-    "spark.databricks.dataLineage.enabled" = true
-  }
-EOF
-fi"""
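Note: inlining the script removes a module that held a single constant. For context on how such a script reaches the workspace, the SDK's global init scripts API expects base64-encoded content. A minimal sketch, assuming an authenticated WorkspaceClient; the script name below is illustrative, and how HiveMetastoreLineageEnabler itself wraps this call is not shown in this diff:

```python
import base64

from databricks.sdk import WorkspaceClient

from databricks.labs.ucx.hive_metastore.hms_lineage import global_init_script

ws = WorkspaceClient()  # assumes workspace auth is already configured
ws.global_init_scripts.create(
    name="ucx-enable-hms-lineage",  # illustrative name, not taken from this diff
    script=base64.b64encode(global_init_script.encode("utf-8")).decode("utf-8"),
    enabled=True,
)
```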
diff --git a/src/databricks/labs/ucx/hive_metastore/data_objects.py b/src/databricks/labs/ucx/hive_metastore/locations.py
similarity index 79%
rename from src/databricks/labs/ucx/hive_metastore/data_objects.py
rename to src/databricks/labs/ucx/hive_metastore/locations.py
index 7027d5a332..3475760727 100644
--- a/src/databricks/labs/ucx/hive_metastore/data_objects.py
+++ b/src/databricks/labs/ucx/hive_metastore/locations.py
@@ -7,7 +7,6 @@
 from databricks.sdk import WorkspaceClient

 from databricks.labs.ucx.framework.crawlers import CrawlerBase, SqlBackend
-from databricks.labs.ucx.hive_metastore.mounts import Mounts
 from databricks.labs.ucx.mixins.sql import Row

 logger = logging.getLogger(__name__)
@@ -18,7 +17,13 @@ class ExternalLocation:
     location: str


-class ExternalLocationCrawler(CrawlerBase):
+@dataclass
+class Mount:
+    name: str
+    source: str
+
+
+class ExternalLocations(CrawlerBase):
     _prefix_size: typing.ClassVar[list[int]] = [1, 12]

     def __init__(self, ws: WorkspaceClient, sbe: SqlBackend, schema):
@@ -106,3 +111,36 @@ def snapshot(self) -> list[ExternalLocation]:
     def _try_fetch(self) -> list[ExternalLocation]:
         for row in self._fetch(f"SELECT * FROM {self._schema}.{self._table}"):
             yield ExternalLocation(*row)
+
+
+class Mounts(CrawlerBase):
+    def __init__(self, backend: SqlBackend, ws: WorkspaceClient, inventory_database: str):
+        super().__init__(backend, "hive_metastore", inventory_database, "mounts", Mount)
+        self._dbutils = ws.dbutils
+
+    def _deduplicate_mounts(self, mounts: list) -> list:
+        seen = set()
+        deduplicated_mounts = []
+
+        for obj in mounts:
+            obj_tuple = (obj.name, obj.source)
+            if obj_tuple not in seen:
+                seen.add(obj_tuple)
+                deduplicated_mounts.append(obj)
+        return deduplicated_mounts
+
+    def inventorize_mounts(self):
+        self._append_records(self._list_mounts())
+
+    def _list_mounts(self):
+        mounts = []
+        for mount_point, source, _ in self._dbutils.fs.mounts():
+            mounts.append(Mount(mount_point, source))
+        return self._deduplicate_mounts(mounts)
+
+    def snapshot(self) -> list[Mount]:
+        return self._snapshot(self._try_fetch, self._list_mounts)
+
+    def _try_fetch(self) -> list[Mount]:
+        for row in self._fetch(f"SELECT * FROM {self._schema}.{self._table}"):
+            yield Mount(*row)
diff --git a/src/databricks/labs/ucx/hive_metastore/mounts.py b/src/databricks/labs/ucx/hive_metastore/mounts.py
deleted file mode 100644
index d517906405..0000000000
--- a/src/databricks/labs/ucx/hive_metastore/mounts.py
+++ /dev/null
@@ -1,47 +0,0 @@
-import logging
-from dataclasses import dataclass
-
-from databricks.sdk import WorkspaceClient
-
-from databricks.labs.ucx.framework.crawlers import CrawlerBase, SqlBackend
-
-logger = logging.getLogger(__name__)
-
-
-@dataclass
-class Mount:
-    name: str
-    source: str
-
-
-class Mounts(CrawlerBase):
-    def __init__(self, backend: SqlBackend, ws: WorkspaceClient, inventory_database: str):
-        super().__init__(backend, "hive_metastore", inventory_database, "mounts", Mount)
-        self._dbutils = ws.dbutils
-
-    def _deduplicate_mounts(self, mounts: list) -> list:
-        seen = set()
-        deduplicated_mounts = []
-
-        for obj in mounts:
-            obj_tuple = (obj.name, obj.source)
-            if obj_tuple not in seen:
-                seen.add(obj_tuple)
-                deduplicated_mounts.append(obj)
-        return deduplicated_mounts
-
-    def inventorize_mounts(self):
-        self._append_records(self._list_mounts())
-
-    def _list_mounts(self):
-        mounts = []
-        for mount_point, source, _ in self._dbutils.fs.mounts():
-            mounts.append(Mount(mount_point, source))
-        return self._deduplicate_mounts(mounts)
-
-    def snapshot(self) -> list[Mount]:
-        return self._snapshot(self._try_fetch, self._list_mounts)
-
-    def _try_fetch(self) -> list[Mount]:
-        for row in self._fetch(f"SELECT * FROM {self._schema}.{self._table}"):
-            yield Mount(*row)
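Note: one detail worth calling out in the relocated Mounts crawler is _deduplicate_mounts, carried over verbatim. Mount is a regular (non-frozen) dataclass, so its instances are unhashable and cannot be collected into a set directly; the method therefore keys its seen-set on (name, source) tuples while preserving first-seen order. A standalone sketch of the same pattern:

```python
from databricks.labs.ucx.hive_metastore.locations import Mount

mounts = [Mount("/mnt/a", "s3://a"), Mount("/mnt/a", "s3://a"), Mount("/mnt/b", "s3://b")]

seen: set[tuple[str, str]] = set()
unique: list[Mount] = []
for mount in mounts:
    key = (mount.name, mount.source)  # hashable stand-in for the unhashable dataclass
    if key not in seen:
        seen.add(key)
        unique.append(mount)

assert unique == [Mount("/mnt/a", "s3://a"), Mount("/mnt/b", "s3://b")]
```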
diff --git a/src/databricks/labs/ucx/install.py b/src/databricks/labs/ucx/install.py
index 4cc5aba78a..abae55a93d 100644
--- a/src/databricks/labs/ucx/install.py
+++ b/src/databricks/labs/ucx/install.py
@@ -44,10 +44,9 @@
 from databricks.labs.ucx.framework.dashboards import DashboardFromFiles
 from databricks.labs.ucx.framework.install_state import InstallState
 from databricks.labs.ucx.framework.tasks import _TASKS, Task
-from databricks.labs.ucx.hive_metastore.data_objects import ExternalLocation
 from databricks.labs.ucx.hive_metastore.grants import Grant
 from databricks.labs.ucx.hive_metastore.hms_lineage import HiveMetastoreLineageEnabler
-from databricks.labs.ucx.hive_metastore.mounts import Mount
+from databricks.labs.ucx.hive_metastore.locations import ExternalLocation, Mount
 from databricks.labs.ucx.hive_metastore.tables import Table, TableError
 from databricks.labs.ucx.runtime import main
 from databricks.labs.ucx.workspace_access.base import Permissions
diff --git a/src/databricks/labs/ucx/runtime.py b/src/databricks/labs/ucx/runtime.py
index c9a7145d68..d1d9c9e0a2 100644
--- a/src/databricks/labs/ucx/runtime.py
+++ b/src/databricks/labs/ucx/runtime.py
@@ -14,9 +14,12 @@
 from databricks.labs.ucx.config import WorkspaceConfig
 from databricks.labs.ucx.framework.crawlers import RuntimeBackend
 from databricks.labs.ucx.framework.tasks import task, trigger
-from databricks.labs.ucx.hive_metastore import GrantsCrawler, TablesCrawler
-from databricks.labs.ucx.hive_metastore.data_objects import ExternalLocationCrawler
-from databricks.labs.ucx.hive_metastore.mounts import Mounts
+from databricks.labs.ucx.hive_metastore import (
+    ExternalLocations,
+    GrantsCrawler,
+    Mounts,
+    TablesCrawler,
+)
 from databricks.labs.ucx.workspace_access.generic import WorkspaceListing
 from databricks.labs.ucx.workspace_access.groups import GroupManager
 from databricks.labs.ucx.workspace_access.manager import PermissionManager
@@ -78,7 +81,7 @@ def guess_external_locations(cfg: WorkspaceConfig):
     - Scanning all these locations to identify folders that can act as shared path prefixes
     - These identified external locations will be created subsequently prior to the actual table migration"""
     ws = WorkspaceClient(config=cfg.to_databricks_config())
-    crawler = ExternalLocationCrawler(ws, RuntimeBackend(), cfg.inventory_database)
+    crawler = ExternalLocations(ws, RuntimeBackend(), cfg.inventory_database)
     crawler.snapshot()


diff --git a/tests/integration/hive_metastore/test_external_locations.py b/tests/integration/hive_metastore/test_external_locations.py
index 45bc3a5232..7581831a48 100644
--- a/tests/integration/hive_metastore/test_external_locations.py
+++ b/tests/integration/hive_metastore/test_external_locations.py
@@ -1,7 +1,6 @@
 import logging

-from databricks.labs.ucx.hive_metastore.data_objects import ExternalLocationCrawler
-from databricks.labs.ucx.hive_metastore.mounts import Mount
+from databricks.labs.ucx.hive_metastore.locations import ExternalLocations, Mount
 from databricks.labs.ucx.hive_metastore.tables import Table

 logger = logging.getLogger(__name__)
@@ -59,7 +58,7 @@ def test_external_locations(ws, sql_backend, inventory_schema, env_or_skip):
     sql_backend.save_table(f"{inventory_schema}.tables", tables, Table)
     sql_backend.save_table(f"{inventory_schema}.mounts", [Mount("/mnt/foo", "s3://bar")], Mount)

-    crawler = ExternalLocationCrawler(ws, sql_backend, inventory_schema)
+    crawler = ExternalLocations(ws, sql_backend, inventory_schema)
     results = crawler.snapshot()
     assert len(results) == 6
     assert results[1].location == "s3://bar/test3/"
diff --git a/tests/integration/hive_metastore/test_mounts.py b/tests/integration/hive_metastore/test_mounts.py
index e330592c4c..961405d03b 100644
--- a/tests/integration/hive_metastore/test_mounts.py
+++ b/tests/integration/hive_metastore/test_mounts.py
@@ -1,6 +1,6 @@
 import pytest

-from databricks.labs.ucx.hive_metastore.mounts import Mount
+from databricks.labs.ucx.hive_metastore.locations import Mount
 from databricks.labs.ucx.mixins.compute import CommandExecutor


diff --git a/tests/unit/assessment/test_assessment.py b/tests/unit/assessment/test_assessment.py
index 134723084a..7ad3aa4040 100644
--- a/tests/unit/assessment/test_assessment.py
+++ b/tests/unit/assessment/test_assessment.py
@@ -38,9 +38,6 @@
     PipelinesCrawler,
     spark_version_compatibility,
 )
-from databricks.labs.ucx.hive_metastore.data_objects import ExternalLocationCrawler
-from databricks.labs.ucx.hive_metastore.mounts import Mount
-from databricks.labs.ucx.mixins.sql import Row
 from tests.unit.framework.mocks import MockBackend

 _SECRET_PATTERN = r"{{(secrets.*?)}}"
@@ -63,62 +60,6 @@ def test_spark_version_compatibility():
     assert "unsupported" == spark_version_compatibility("x14.1.x-photon-scala2.12")


-def test_external_locations():
-    crawler = ExternalLocationCrawler(Mock(), MockBackend(), "test")
-    row_factory = type("Row", (Row,), {"__columns__": ["location", "storage_properties"]})
-    sample_locations = [
-        row_factory(["s3://us-east-1-dev-account-staging-uc-ext-loc-bucket-1/Location/Table", ""]),
-        row_factory(["s3://us-east-1-dev-account-staging-uc-ext-loc-bucket-1/Location/Table2", ""]),
-        row_factory(["s3://us-east-1-dev-account-staging-uc-ext-loc-bucket-23/testloc/Table3", ""]),
-        row_factory(["s3://us-east-1-dev-account-staging-uc-ext-loc-bucket-23/anotherloc/Table4", ""]),
-        row_factory(["dbfs:/mnt/ucx/database1/table1", ""]),
-        row_factory(["dbfs:/mnt/ucx/database2/table2", ""]),
-        row_factory(["DatabricksRootmntDatabricksRoot", ""]),
-        row_factory(
-            [
-                "jdbc:databricks://",
-                "[personalAccessToken=*********(redacted), \
-        httpPath=/sql/1.0/warehouses/65b52fb5bd86a7be, host=dbc-test1-aa11.cloud.databricks.com, \
-        dbtable=samples.nyctaxi.trips]",
-            ]
-        ),
-        row_factory(
-            [
-                "jdbc:/MYSQL",
-                "[database=test_db, host=somemysql.us-east-1.rds.amazonaws.com, \
-        port=3306, dbtable=movies, user=*********(redacted), password=*********(redacted)]",
-            ]
-        ),
-        row_factory(
-            [
-                "jdbc:providerknown:/",
-                "[database=test_db, host=somedb.us-east-1.rds.amazonaws.com, \
-        port=1234, dbtable=sometable, user=*********(redacted), password=*********(redacted), \
-        provider=providerknown]",
-            ]
-        ),
-        row_factory(
-            [
-                "jdbc:providerunknown:/",
-                "[database=test_db, host=somedb.us-east-1.rds.amazonaws.com, \
-        port=1234, dbtable=sometable, user=*********(redacted), password=*********(redacted)]",
-            ]
-        ),
-    ]
-    sample_mounts = [Mount("/mnt/ucx", "s3://us-east-1-ucx-container")]
-    result_set = crawler._external_locations(sample_locations, sample_mounts)
-    assert len(result_set) == 7
-    assert result_set[0].location == "s3://us-east-1-dev-account-staging-uc-ext-loc-bucket-1/Location/"
-    assert result_set[1].location == "s3://us-east-1-dev-account-staging-uc-ext-loc-bucket-23/"
-    assert (
-        result_set[3].location
-        == "jdbc:databricks://dbc-test1-aa11.cloud.databricks.com;httpPath=/sql/1.0/warehouses/65b52fb5bd86a7be"
-    )
-    assert result_set[4].location == "jdbc:mysql://somemysql.us-east-1.rds.amazonaws.com:3306/test_db"
-    assert result_set[5].location == "jdbc:providerknown://somedb.us-east-1.rds.amazonaws.com:1234/test_db"
-    assert result_set[6].location == "jdbc:providerunknown://somedb.us-east-1.rds.amazonaws.com:1234/test_db"
-
-
 def test_job_assessment():
     sample_jobs = [
         BaseJob(
diff --git a/tests/unit/hive_metastore/test_locations.py b/tests/unit/hive_metastore/test_locations.py
new file mode 100644
index 0000000000..0ac57f6ef4
--- /dev/null
+++ b/tests/unit/hive_metastore/test_locations.py
@@ -0,0 +1,101 @@
+from unittest.mock import MagicMock, Mock
+
+from databricks.sdk.dbutils import MountInfo
+
+from databricks.labs.ucx.hive_metastore.locations import (
+    ExternalLocations,
+    Mount,
+    Mounts,
+)
+from databricks.labs.ucx.mixins.sql import Row
+from tests.unit.framework.mocks import MockBackend
+
+
+def test_list_mounts_should_return_a_list_of_mount_without_encryption_type():
+    client = MagicMock()
+    client.dbutils.fs.mounts.return_value = [
+        MountInfo("mp_1", "path_1", "info_1"),
+        MountInfo("mp_2", "path_2", "info_2"),
+        MountInfo("mp_3", "path_3", "info_3"),
+    ]
+
+    backend = MockBackend()
+    instance = Mounts(backend, client, "test")
+
+    instance.inventorize_mounts()
+
+    expected = [Mount("mp_1", "path_1"), Mount("mp_2", "path_2"), Mount("mp_3", "path_3")]
+    assert expected == backend.rows_written_for("hive_metastore.test.mounts", "append")
+
+
+def test_list_mounts_should_return_a_deduped_list_of_mount_without_encryption_type():
+    client = MagicMock()
+    client.dbutils.fs.mounts.return_value = [
+        MountInfo("mp_1", "path_1", "info_1"),
+        MountInfo("mp_2", "path_2", "info_2"),
+        MountInfo("mp_2", "path_2", "info_2"),
+    ]
+
+    backend = MockBackend()
+    instance = Mounts(backend, client, "test")
+
+    instance.inventorize_mounts()
+
+    expected = [Mount("mp_1", "path_1"), Mount("mp_2", "path_2")]
+    assert expected == backend.rows_written_for("hive_metastore.test.mounts", "append")
+
+
+def test_external_locations():
+    crawler = ExternalLocations(Mock(), MockBackend(), "test")
+    row_factory = type("Row", (Row,), {"__columns__": ["location", "storage_properties"]})
+    sample_locations = [
+        row_factory(["s3://us-east-1-dev-account-staging-uc-ext-loc-bucket-1/Location/Table", ""]),
+        row_factory(["s3://us-east-1-dev-account-staging-uc-ext-loc-bucket-1/Location/Table2", ""]),
+        row_factory(["s3://us-east-1-dev-account-staging-uc-ext-loc-bucket-23/testloc/Table3", ""]),
+        row_factory(["s3://us-east-1-dev-account-staging-uc-ext-loc-bucket-23/anotherloc/Table4", ""]),
+        row_factory(["dbfs:/mnt/ucx/database1/table1", ""]),
+        row_factory(["dbfs:/mnt/ucx/database2/table2", ""]),
+        row_factory(["DatabricksRootmntDatabricksRoot", ""]),
+        row_factory(
+            [
+                "jdbc:databricks://",
+                "[personalAccessToken=*********(redacted), \
+        httpPath=/sql/1.0/warehouses/65b52fb5bd86a7be, host=dbc-test1-aa11.cloud.databricks.com, \
+        dbtable=samples.nyctaxi.trips]",
+            ]
+        ),
+        row_factory(
+            [
+                "jdbc:/MYSQL",
+                "[database=test_db, host=somemysql.us-east-1.rds.amazonaws.com, \
+        port=3306, dbtable=movies, user=*********(redacted), password=*********(redacted)]",
+            ]
+        ),
+        row_factory(
+            [
+                "jdbc:providerknown:/",
+                "[database=test_db, host=somedb.us-east-1.rds.amazonaws.com, \
+        port=1234, dbtable=sometable, user=*********(redacted), password=*********(redacted), \
+        provider=providerknown]",
+            ]
+        ),
+        row_factory(
+            [
+                "jdbc:providerunknown:/",
+                "[database=test_db, host=somedb.us-east-1.rds.amazonaws.com, \
+        port=1234, dbtable=sometable, user=*********(redacted), password=*********(redacted)]",
+            ]
+        ),
+    ]
+    sample_mounts = [Mount("/mnt/ucx", "s3://us-east-1-ucx-container")]
+    result_set = crawler._external_locations(sample_locations, sample_mounts)
+    assert len(result_set) == 7
+    assert result_set[0].location == "s3://us-east-1-dev-account-staging-uc-ext-loc-bucket-1/Location/"
+    assert result_set[1].location == "s3://us-east-1-dev-account-staging-uc-ext-loc-bucket-23/"
+    assert (
+        result_set[3].location
+        == "jdbc:databricks://dbc-test1-aa11.cloud.databricks.com;httpPath=/sql/1.0/warehouses/65b52fb5bd86a7be"
+    )
+    assert result_set[4].location == "jdbc:mysql://somemysql.us-east-1.rds.amazonaws.com:3306/test_db"
+    assert result_set[5].location == "jdbc:providerknown://somedb.us-east-1.rds.amazonaws.com:1234/test_db"
+    assert result_set[6].location == "jdbc:providerunknown://somedb.us-east-1.rds.amazonaws.com:1234/test_db"
diff --git a/tests/unit/hive_metastore/test_mounts.py b/tests/unit/hive_metastore/test_mounts.py
deleted file mode 100644
index 049743b7dd..0000000000
--- a/tests/unit/hive_metastore/test_mounts.py
+++ /dev/null
@@ -1,40 +0,0 @@
-from unittest.mock import MagicMock
-
-from databricks.sdk.dbutils import MountInfo
-
-from databricks.labs.ucx.hive_metastore.mounts import Mount, Mounts
-from tests.unit.framework.mocks import MockBackend
-
-
-def test_list_mounts_should_return_a_list_of_mount_without_encryption_type():
-    client = MagicMock()
-    client.dbutils.fs.mounts.return_value = [
-        MountInfo("mp_1", "path_1", "info_1"),
-        MountInfo("mp_2", "path_2", "info_2"),
-        MountInfo("mp_3", "path_3", "info_3"),
-    ]
-
-    backend = MockBackend()
-    instance = Mounts(backend, client, "test")
-
-    instance.inventorize_mounts()
-
-    expected = [Mount("mp_1", "path_1"), Mount("mp_2", "path_2"), Mount("mp_3", "path_3")]
-    assert expected == backend.rows_written_for("hive_metastore.test.mounts", "append")
-
-
-def test_list_mounts_should_return_a_deduped_list_of_mount_without_encryption_type():
-    client = MagicMock()
-    client.dbutils.fs.mounts.return_value = [
-        MountInfo("mp_1", "path_1", "info_1"),
-        MountInfo("mp_2", "path_2", "info_2"),
-        MountInfo("mp_2", "path_2", "info_2"),
-    ]
-
-    backend = MockBackend()
-    instance = Mounts(backend, client, "test")
-
-    instance.inventorize_mounts()
-
-    expected = [Mount("mp_1", "path_1"), Mount("mp_2", "path_2")]
-    assert expected == backend.rows_written_for("hive_metastore.test.mounts", "append")