
Ensure that USE statements are recognized and apply to table references without a qualifying schema in SQL and pyspark #1433

Merged: 12 commits, Apr 22, 2024
3 changes: 2 additions & 1 deletion src/databricks/labs/ucx/hive_metastore/view_migrate.py
@@ -8,6 +8,7 @@

from databricks.labs.ucx.hive_metastore.migration_status import MigrationIndex, TableView
from databricks.labs.ucx.hive_metastore.mapping import TableToMigrate
from databricks.labs.ucx.source_code.base import CurrentSessionState
from databricks.labs.ucx.source_code.queries import FromTable

logger = logging.getLogger(__name__)
@@ -41,7 +42,7 @@ def _view_dependencies(self):
yield TableView("hive_metastore", src_db, old_table.name)

def sql_migrate_view(self, index: MigrationIndex) -> str:
from_table = FromTable(index, use_schema=self.src.database)
from_table = FromTable(index, CurrentSessionState(self.src.database))
assert self.src.view_text is not None, 'Expected a view text'
migrated_select = from_table.apply(self.src.view_text)
statements = sqlglot.parse(migrated_select, read='databricks')
19 changes: 19 additions & 0 deletions src/databricks/labs/ucx/source_code/base.py
@@ -82,6 +82,25 @@ def name(self) -> str: ...
def apply(self, code: str) -> str: ...


# The default schema to use when the schema is not specified in a table reference
# See: https://spark.apache.org/docs/3.0.0-preview/sql-ref-syntax-qry-select-usedb.html
DEFAULT_SCHEMA = 'default'


@dataclass
class CurrentSessionState:
"""
A data class that represents the current state of a session.

This class can be used to track various aspects of a session, such as the current schema.

Attributes:
schema (str): The current schema of the session. If not provided, it defaults to DEFAULT_SCHEMA ('default').
"""

schema: str = DEFAULT_SCHEMA


class SequentialLinter(Linter):
def __init__(self, linters: list[Linter]):
self._linters = linters
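A minimal sketch of how the new dataclass is meant to be used (the call sites below are assumptions drawn from the rest of this diff, not code from the PR itself):

from databricks.labs.ucx.source_code.base import CurrentSessionState

session_state = CurrentSessionState()         # schema defaults to 'default'
session_state = CurrentSessionState('sales')  # or start the session in an explicit schema
session_state.schema = 'finance'              # linters/fixers overwrite this when they encounter `USE finance`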
2 changes: 1 addition & 1 deletion src/databricks/labs/ucx/source_code/dbfs.py
@@ -91,7 +91,7 @@ def name() -> str:
return 'dbfs-query'

def lint(self, code: str) -> Iterable[Advice]:
for statement in sqlglot.parse(code, dialect='databricks'):
for statement in sqlglot.parse(code, read='databricks'):
if not statement:
continue
for table in statement.find_all(Table):
5 changes: 3 additions & 2 deletions src/databricks/labs/ucx/source_code/languages.py
@@ -1,7 +1,7 @@
from databricks.sdk.service.workspace import Language

from databricks.labs.ucx.hive_metastore.migration_status import MigrationIndex
from databricks.labs.ucx.source_code.base import Fixer, Linter, SequentialLinter
from databricks.labs.ucx.source_code.base import Fixer, Linter, SequentialLinter, CurrentSessionState
from databricks.labs.ucx.source_code.pyspark import SparkSql
from databricks.labs.ucx.source_code.queries import FromTable
from databricks.labs.ucx.source_code.dbfs import DBFSUsageLinter, FromDbfsFolder
@@ -11,7 +11,8 @@
class Languages:
def __init__(self, index: MigrationIndex):
self._index = index
from_table = FromTable(index)
session_state = CurrentSessionState()
from_table = FromTable(index, session_state=session_state)
dbfs_from_folder = FromDbfsFolder()
self._linters = {
Language.PYTHON: SequentialLinter(
2 changes: 2 additions & 0 deletions src/databricks/labs/ucx/source_code/notebook.py
@@ -243,6 +243,8 @@ def requires_isolated_pi(self) -> str:

@classmethod
def of_language(cls, language: Language) -> CellLanguage:
# TODO: Should this not raise a ValueError if the language is not found?
# It also causes a GeneratorExit exception to be raised. Maybe an explicit loop is better.
return next((cl for cl in CellLanguage if cl.language == language))

@classmethod
4 changes: 3 additions & 1 deletion src/databricks/labs/ucx/source_code/notebook_linter.py
@@ -1,5 +1,6 @@
from collections.abc import Iterable

from databricks.labs.ucx.hive_metastore.migration_status import MigrationIndex
from databricks.labs.ucx.source_code.base import Advice
from databricks.labs.ucx.source_code.notebook import Notebook
from databricks.labs.ucx.source_code.languages import Languages, Language
@@ -16,7 +17,8 @@ def __init__(self, langs: Languages, notebook: Notebook):
self._notebook: Notebook = notebook

@classmethod
def from_source(cls, langs: Languages, source: str, default_language: Language) -> 'NotebookLinter':
def from_source(cls, index: MigrationIndex, source: str, default_language: Language) -> 'NotebookLinter':
langs = Languages(index)
notebook = Notebook.parse("", source, default_language)
assert notebook is not None
return cls(langs, notebook)
9 changes: 6 additions & 3 deletions src/databricks/labs/ucx/source_code/pyspark.py
@@ -79,7 +79,7 @@
def lint(self, from_table: FromTable, index: MigrationIndex, node: ast.Call) -> Iterator[Advice]:
table_arg = self._get_table_arg(node)
if isinstance(table_arg, ast.Constant):
dst = self._find_dest(index, table_arg.value)
dst = self._find_dest(index, table_arg.value, from_table.schema)
if dst is not None:
yield Deprecation(
code='table-migrate',
@@ -104,13 +104,16 @@
def apply(self, from_table: FromTable, index: MigrationIndex, node: ast.Call) -> None:
table_arg = self._get_table_arg(node)
assert isinstance(table_arg, ast.Constant)
dst = self._find_dest(index, table_arg.value)
dst = self._find_dest(index, table_arg.value, from_table.schema)

[Codecov / codecov/patch: added line src/databricks/labs/ucx/source_code/pyspark.py#L107 was not covered by tests]
if dst is not None:
table_arg.value = dst.destination()

@staticmethod
def _find_dest(index: MigrationIndex, value: str):
def _find_dest(index: MigrationIndex, value: str, schema: str):
parts = value.split(".")
# Ensure that unqualified table references use the current schema
if len(parts) == 1:
return index.get(schema, parts[0])
return None if len(parts) != 2 else index.get(parts[0], parts[1])


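To illustrate the new schema-aware lookup, here is a hedged sketch that mirrors the patched helper; the fixture entry is taken from the extended test index added later in this diff, and the standalone `_find_dest` below is a copy for illustration only, not the class method itself:

from databricks.labs.ucx.hive_metastore.migration_status import MigrationIndex, MigrationStatus

index = MigrationIndex([
    MigrationStatus('default', 'testtable', dst_catalog='cata', dst_schema='nondefault', dst_table='table'),
])

def _find_dest(index, value, schema):
    # mirror of the patched helper, for illustration only
    parts = value.split(".")
    if len(parts) == 1:
        return index.get(schema, parts[0])
    return None if len(parts) != 2 else index.get(parts[0], parts[1])

assert _find_dest(index, 'testtable', 'default') is not None          # unqualified -> session schema
assert _find_dest(index, 'default.testtable', 'ignored') is not None  # two-part -> explicit schema
assert _find_dest(index, 'cat.db.tbl', 'default') is None             # three-part names are left alone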
1 change: 0 additions & 1 deletion src/databricks/labs/ucx/source_code/python_linter.py
@@ -8,7 +8,6 @@

from databricks.labs.ucx.source_code.base import Linter, Advice, Advisory


logger = logging.getLogger(__name__)


67 changes: 52 additions & 15 deletions src/databricks/labs/ucx/source_code/queries.py
@@ -2,39 +2,71 @@

import logging
import sqlglot
from sqlglot.expressions import Table, Expression
from sqlglot.expressions import Table, Expression, Use
from databricks.labs.ucx.hive_metastore.migration_status import MigrationIndex
from databricks.labs.ucx.source_code.base import Advice, Deprecation, Fixer, Linter
from databricks.labs.ucx.source_code.base import Advice, Deprecation, Fixer, Linter, CurrentSessionState

logger = logging.getLogger(__name__)


class FromTable(Linter, Fixer):
def __init__(self, index: MigrationIndex, *, use_schema: str | None = None):
self._index = index
self._use_schema = use_schema
"""Linter and Fixer for table migrations in SQL queries.

This class is responsible for identifying and fixing table migrations in
SQL queries.
"""

def __init__(self, index: MigrationIndex, session_state: CurrentSessionState):
"""
Initializes the FromTable class.

Args:
index: The migration index, which is a mapping of source tables to destination tables.
session_state: The current session state, which will be used to track the current schema.

We need to be careful with the nomenclature here. For instance when parsing a table reference,
sqlglot uses `db` instead of `schema` to refer to the schema. The following table references
show how sqlglot represents them::

catalog.schema.table -> Table(catalog='catalog', db='schema', this='table')
schema.table -> Table(catalog='', db='schema', this='table')
table -> Table(catalog='', db='', this='table')
"""
self._index: MigrationIndex = index
self._session_state: CurrentSessionState = session_state if session_state else CurrentSessionState()

def name(self) -> str:
return 'table-migrate'

@property
def schema(self):
return self._session_state.schema

def lint(self, code: str) -> Iterable[Advice]:
for statement in sqlglot.parse(code, dialect='databricks'):
for statement in sqlglot.parse(code, read='databricks'):
if not statement:
continue
for table in statement.find_all(Table):
catalog = self._catalog(table)
if catalog != 'hive_metastore':
if isinstance(statement, Use):
# Sqlglot captures the database name in the Use statement as a Table, with
# the schema as the table name.
self._session_state.schema = table.name
continue
src_db = table.db if table.db else self._use_schema
if not src_db:

# we only migrate tables in the hive_metastore catalog
if self._catalog(table) != 'hive_metastore':
continue
# Sqlglot uses db instead of schema, watch out for that
src_schema = table.db if table.db else self._session_state.schema
if not src_schema:
logger.error(f"Could not determine schema for table {table.name}")
continue
dst = self._index.get(src_db, table.name)
dst = self._index.get(src_schema, table.name)
if not dst:
continue
yield Deprecation(
code='table-migrate',
message=f"Table {table.db}.{table.name} is migrated to {dst.destination()} in Unity Catalog",
message=f"Table {src_schema}.{table.name} is migrated to {dst.destination()} in Unity Catalog",
# SQLGlot does not propagate tokens yet. See https://github.com/tobymao/sqlglot/issues/3159
start_line=0,
start_col=0,
Expand All @@ -53,12 +85,17 @@ def apply(self, code: str) -> str:
for statement in sqlglot.parse(code, read='databricks'):
if not statement:
continue
if isinstance(statement, Use):
table = statement.this
self._session_state.schema = table.name
new_statements.append(statement.sql('databricks'))
continue
for old_table in self._dependent_tables(statement):
src_db = old_table.db if old_table.db else self._use_schema
if not src_db:
src_schema = old_table.db if old_table.db else self._session_state.schema
if not src_schema:
logger.error(f"Could not determine schema for table {old_table.name}")
continue
dst = self._index.get(src_db, old_table.name)
dst = self._index.get(src_schema, old_table.name)
if not dst:
continue
new_table = Table(catalog=dst.dst_catalog, db=dst.dst_schema, this=dst.dst_table)
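Putting the pieces together, a minimal sketch of the behaviour this file now supports; the index entry mirrors the extended fixture below, and the snippet is illustrative rather than a test copied from the PR:

from databricks.labs.ucx.hive_metastore.migration_status import MigrationIndex, MigrationStatus
from databricks.labs.ucx.source_code.base import CurrentSessionState
from databricks.labs.ucx.source_code.queries import FromTable

index = MigrationIndex([
    MigrationStatus('different_db', 'testtable', dst_catalog='cata2', dst_schema='newspace', dst_table='table'),
])
from_table = FromTable(index, CurrentSessionState())
# The USE statement switches the session schema, so the unqualified reference that follows
# is resolved as hive_metastore.different_db.testtable and rewritten to its UC destination.
migrated = from_table.apply("USE different_db; SELECT * FROM testtable")
print(migrated)  # the SELECT now references cata2.newspace.table instead of the bare testtable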
21 changes: 21 additions & 0 deletions tests/unit/source_code/conftest.py
@@ -19,3 +19,24 @@ def migration_index():
MigrationStatus('other', 'matters', dst_catalog='some', dst_schema='certain', dst_table='issues'),
]
)


@pytest.fixture
def extended_test_index():
return MigrationIndex(
[
MigrationStatus('old', 'things', dst_catalog='brand', dst_schema='new', dst_table='stuff'),
MigrationStatus('other', 'matters', dst_catalog='some', dst_schema='certain', dst_table='issues'),
MigrationStatus('old', 'stuff', dst_catalog='brand', dst_schema='new', dst_table='things'),
MigrationStatus('other', 'issues', dst_catalog='some', dst_schema='certain', dst_table='matters'),
MigrationStatus('default', 'testtable', dst_catalog='cata', dst_schema='nondefault', dst_table='table'),
MigrationStatus('different_db', 'testtable', dst_catalog='cata2', dst_schema='newspace', dst_table='table'),
MigrationStatus('old', 'testtable', dst_catalog='cata3', dst_schema='newspace', dst_table='table'),
MigrationStatus('default', 'people', dst_catalog='cata4', dst_schema='nondefault', dst_table='newpeople'),
MigrationStatus(
'something', 'persons', dst_catalog='cata4', dst_schema='newsomething', dst_table='persons'
),
MigrationStatus('whatever', 'kittens', dst_catalog='cata4', dst_schema='felines', dst_table='toms'),
MigrationStatus('whatever', 'numbers', dst_catalog='cata4', dst_schema='counting', dst_table='numbers'),
]
)
3 changes: 2 additions & 1 deletion tests/unit/source_code/test_notebook.py
@@ -40,7 +40,8 @@
SQL_NOTEBOOK_SAMPLE = (
"chf-pqi-scoring.sql.txt",
Language.SQL,
['md', 'sql', 'sql', 'md', 'sql', 'python', 'sql', 'sql', 'sql', 'md', 'sql', 'sql', 'md', 'sql', 'sql', 'md', 'sql'],
['md', 'sql', 'sql', 'md', 'sql', 'python', 'sql', 'sql', 'sql', 'md', 'sql',
'sql', 'md', 'sql', 'sql', 'md', 'sql'],
)
SHELL_NOTEBOOK_SAMPLE = (
"notebook-with-shell-cell.py.txt",