From 18950f19356a9bf4a5906c4b3f5452d2add4709d Mon Sep 17 00:00:00 2001
From: Eric Vergnaud
Date: Fri, 30 Aug 2024 12:16:11 +0200
Subject: [PATCH 01/80] add support for sql functional tests

---
 tests/unit/source_code/test_functional.py | 73 +++++++++++++++--------
 1 file changed, 48 insertions(+), 25 deletions(-)

diff --git a/tests/unit/source_code/test_functional.py b/tests/unit/source_code/test_functional.py
index 3b6bbf9ffb..338ab99e8f 100644
--- a/tests/unit/source_code/test_functional.py
+++ b/tests/unit/source_code/test_functional.py
@@ -16,6 +16,7 @@
 from databricks.labs.ucx.source_code.graph import Dependency, DependencyGraph, DependencyResolver
 from databricks.labs.ucx.source_code.linters.context import LinterContext
 from databricks.labs.ucx.source_code.linters.files import FileLoader
+from databricks.labs.ucx.source_code.notebooks.cells import CellLanguage
 from databricks.labs.ucx.source_code.notebooks.loaders import NotebookLoader
 from databricks.labs.ucx.source_code.notebooks.sources import FileLinter
 from databricks.labs.ucx.source_code.path_lookup import PathLookup
@@ -62,11 +63,19 @@ def from_advice(cls, advice: Advice) -> Expectation:
         )


+_UCX_REGEX_SUFFIX = r" ucx\[(?P<code>[\w-]+):(?P<start_line>[\d+]+):(?P<start_col>[\d]+):(?P<end_line>[\d+]+):(?P<end_col>[\d]+)] (?P<message>.*)"
+_STATE_REGEX_SUFFIX = r' ucx\[session-state] (?P<session_state_json>\{.*})'
+
 class Functional:
-    _re = re.compile(
-        r"# ucx\[(?P<code>[\w-]+):(?P<start_line>[\d+]+):(?P<start_col>[\d]+):(?P<end_line>[\d+]+):(?P<end_col>[\d]+)] (?P<message>.*)"
-    )
-    _re_session_state = re.compile(r'# ucx\[session-state] (?P<session_state_json>\{.*})')
+
+    _ucx_regex = {
+        CellLanguage.PYTHON: re.compile(CellLanguage.PYTHON.comment_prefix + _UCX_REGEX_SUFFIX),
+        CellLanguage.SQL: re.compile(CellLanguage.SQL.comment_prefix + _UCX_REGEX_SUFFIX),
+    }
+    _session_states = {
+        CellLanguage.PYTHON: re.compile(CellLanguage.PYTHON.comment_prefix + _STATE_REGEX_SUFFIX),
+        CellLanguage.SQL: re.compile(CellLanguage.SQL.comment_prefix + _STATE_REGEX_SUFFIX),
+    }

     _location = Path(__file__).parent / 'samples/functional'

@@ -95,10 +104,11 @@ def test_id(cls, sample: Functional) -> str:
     def __init__(self, path: Path, parent: Path | None = None) -> None:
         self.path = path
         self.parent = parent
+        self.language = CellLanguage.PYTHON if path.suffix.endswith("py") else CellLanguage.SQL

-    def verify(self, path_lookup: PathLookup, dependency_resolver: DependencyResolver) -> None:
+    def verify(self, path_lookup: PathLookup, dependency_resolver: DependencyResolver, migration_index: MigrationIndex) -> None:
         expected_problems = list(self._expected_problems())
-        actual_advices = list(self._lint(path_lookup, dependency_resolver))
+        actual_advices = list(self._lint(path_lookup, dependency_resolver, migration_index))
         # Convert the actual problems to the same type as our expected problems for easier comparison.
actual_problems = [Expectation.from_advice(advice) for advice in actual_advices] @@ -118,13 +128,7 @@ def verify(self, path_lookup: PathLookup, dependency_resolver: DependencyResolve assert no_errors, "\n".join(errors) # TODO: output annotated file with comments for quick fixing - def _lint(self, path_lookup: PathLookup, dependency_resolver: DependencyResolver) -> Iterable[Advice]: - migration_index = MigrationIndex( - [ - MigrationStatus('old', 'things', dst_catalog='brand', dst_schema='new', dst_table='stuff'), - MigrationStatus('other', 'matters', dst_catalog='some', dst_schema='certain', dst_table='issues'), - ] - ) + def _lint(self, path_lookup: PathLookup, dependency_resolver: DependencyResolver, migration_index: MigrationIndex) -> Iterable[Advice]: session_state = self._test_session_state() print(str(session_state)) session_state.named_parameters = {"my-widget": "my-path.py"} @@ -145,9 +149,10 @@ def _lint(self, path_lookup: PathLookup, dependency_resolver: DependencyResolver return linter.lint() def _regex_match(self, regex: re.Pattern[str]) -> Generator[tuple[Comment, dict[str, Any]], None, None]: + ucx_comment_prefix = self.language.comment_prefix + ' ucx[' with self.path.open('rb') as f: for comment in self._comments(f): - if not comment.text.startswith('# ucx['): + if not comment.text.startswith(ucx_comment_prefix): continue match = regex.match(comment.text) if not match: @@ -156,7 +161,8 @@ def _regex_match(self, regex: re.Pattern[str]) -> Generator[tuple[Comment, dict[ yield comment, groups def _expected_problems(self) -> Generator[Expectation, None, None]: - for comment, groups in self._regex_match(self._re): + regex = self._ucx_regex[self.language] + for comment, groups in self._regex_match(regex): reported_start_line = groups['start_line'] if '+' in reported_start_line: start_line = int(reported_start_line[1:]) + comment.start_line @@ -177,7 +183,8 @@ def _expected_problems(self) -> Generator[Expectation, None, None]: ) def _test_session_state(self) -> CurrentSessionState: - matches = list(self._regex_match(self._re_session_state)) + regex = self._session_states[self.language] + matches = list(self._regex_match(regex)) if len(matches) > 1: raise ValueError("A test should have no more than one session state definition") if len(matches) == 0: @@ -186,18 +193,34 @@ def _test_session_state(self) -> CurrentSessionState: json_str = groups['session_state_json'] return CurrentSessionState.from_json(json.loads(json_str)) + def _comments(self, f) -> Generator[Comment, None, None]: + if self.language is CellLanguage.PYTHON: + yield from self._python_comments(f) + return + if self.language is CellLanguage.SQL: + yield from self._sql_comments(f) + @staticmethod - def _comments(f) -> Generator[Comment, None, None]: + def _python_comments(f) -> Generator[Comment, None, None]: for token in tokenize.tokenize(f.readline): if token.type != tokenize.COMMENT: continue yield Comment.from_token(token) + @staticmethod + def _sql_comments(f) -> Generator[Comment, None, None]: + # SQLGlot does not propagate tokens. 
See https://github.com/tobymao/sqlglot/issues/3159 + # Hence SQL statement advice offsets can be wrong because of multi-line comments and statements + for idx, line in enumerate(f.readlines()): + if not line.startswith(b"--"): + continue + yield Comment(text=line.decode("utf-8"), start_line=idx, end_line=idx) + @pytest.mark.parametrize("sample", Functional.all(), ids=Functional.test_id) -def test_functional(sample: Functional, mock_path_lookup, simple_dependency_resolver) -> None: +def test_functional(sample: Functional, mock_path_lookup, simple_dependency_resolver, extended_test_index) -> None: path_lookup = mock_path_lookup.change_directory(sample.path.parent) - sample.verify(path_lookup, simple_dependency_resolver) + sample.verify(path_lookup, simple_dependency_resolver, extended_test_index) @pytest.mark.parametrize( @@ -211,15 +234,15 @@ def test_functional(sample: Functional, mock_path_lookup, simple_dependency_reso ("_child_that_uses_value_from_parent.py", "grand_parent_that_imports_parent_that_magic_runs_child.py"), ], ) -def test_functional_with_parent(child: str, parent: str, mock_path_lookup, simple_dependency_resolver) -> None: +def test_functional_with_parent(child: str, parent: str, mock_path_lookup, simple_dependency_resolver, extended_test_index) -> None: sample = Functional.for_child(child, parent) path_lookup = mock_path_lookup.change_directory(sample.path.parent) - sample.verify(path_lookup, simple_dependency_resolver) + sample.verify(path_lookup, simple_dependency_resolver, extended_test_index) -@pytest.mark.skip(reason="Used for troubleshooting failing tests") -def test_one_functional(mock_path_lookup, simple_dependency_resolver): - path = mock_path_lookup.resolve(Path("functional/widgets.py")) +# @pytest.mark.skip(reason="Used for troubleshooting failing tests") +def test_one_functional(mock_path_lookup, simple_dependency_resolver, extended_test_index): + path = mock_path_lookup.resolve(Path("functional/table-migration/table-migration-notebook.sql")) path_lookup = mock_path_lookup.change_directory(path.parent) sample = Functional(path) - sample.verify(path_lookup, simple_dependency_resolver) + sample.verify(path_lookup, simple_dependency_resolver, extended_test_index) From 224796dbda6824f32049bb4b2f98c439cba33c3b Mon Sep 17 00:00:00 2001 From: Eric Vergnaud Date: Fri, 30 Aug 2024 12:16:34 +0200 Subject: [PATCH 02/80] disable --- tests/unit/source_code/test_functional.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/unit/source_code/test_functional.py b/tests/unit/source_code/test_functional.py index 338ab99e8f..8b21891364 100644 --- a/tests/unit/source_code/test_functional.py +++ b/tests/unit/source_code/test_functional.py @@ -240,7 +240,7 @@ def test_functional_with_parent(child: str, parent: str, mock_path_lookup, simpl sample.verify(path_lookup, simple_dependency_resolver, extended_test_index) -# @pytest.mark.skip(reason="Used for troubleshooting failing tests") +@pytest.mark.skip(reason="Used for troubleshooting failing tests") def test_one_functional(mock_path_lookup, simple_dependency_resolver, extended_test_index): path = mock_path_lookup.resolve(Path("functional/table-migration/table-migration-notebook.sql")) path_lookup = mock_path_lookup.change_directory(path.parent) From b63cb19178b5142bdb6e3626029cba3a9a49c2e0 Mon Sep 17 00:00:00 2001 From: Eric Vergnaud Date: Fri, 30 Aug 2024 12:16:55 +0200 Subject: [PATCH 03/80] more functional tests --- .../file-access/complex-sql-notebook.sql | 60 +++++++++++++++++ 
.../python-notebook-with-embedded-sql.py | 31 +++++++++ .../sql-notebook-with-embedded-python.sql | 18 +++++ .../table-migration-notebook.py | 35 ++++++++++ .../table-migration-notebook.sql | 67 +++++++++++++++++++ 5 files changed, 211 insertions(+) create mode 100644 tests/unit/source_code/samples/functional/file-access/complex-sql-notebook.sql create mode 100644 tests/unit/source_code/samples/functional/file-access/python-notebook-with-embedded-sql.py create mode 100644 tests/unit/source_code/samples/functional/file-access/sql-notebook-with-embedded-python.sql create mode 100644 tests/unit/source_code/samples/functional/table-migration/table-migration-notebook.py create mode 100644 tests/unit/source_code/samples/functional/table-migration/table-migration-notebook.sql diff --git a/tests/unit/source_code/samples/functional/file-access/complex-sql-notebook.sql b/tests/unit/source_code/samples/functional/file-access/complex-sql-notebook.sql new file mode 100644 index 0000000000..d2ecb3cafd --- /dev/null +++ b/tests/unit/source_code/samples/functional/file-access/complex-sql-notebook.sql @@ -0,0 +1,60 @@ +-- Databricks notebook source +-- MAGIC %md +-- MAGIC #Test notebook for DBFS discovery in Notebooks + +-- COMMAND ---------- +-- DBTITLE 1,A Python cell that references DBFS +-- MAGIC %python +-- ucx[dbfs-usage:+1:7:+1:18] Deprecated file system path: dbfs:/... +-- MAGIC DBFS = "dbfs:/..." +-- ucx[dbfs-usage:+1:7:+1:18] Deprecated file system path: /dbfs/mnt +-- MAGIC DBFS = "/dbfs/mnt" +-- ucx[dbfs-usage:+1:7:+1:14] Deprecated file system path: /mnt/ +-- MAGIC DBFS = "/mnt/" +-- ucx[dbfs-usage:+1:7:+1:18] Deprecated file system path: dbfs:/... +-- MAGIC DBFS = "dbfs:/..." +-- ucx[dbfs-usage:+1:10:+1:26] Deprecated file system path: /dbfs/mnt/data +-- MAGIC load_data('/dbfs/mnt/data') +-- MAGIC load_data('/data') +-- ucx[dbfs-usage:+1:10:+1:26] Deprecated file system path: /dbfs/mnt/data +-- MAGIC load_data('/dbfs/mnt/data', '/data') +-- MAGIC # load_data('/dbfs/mnt/data', '/data') +-- ucx[implicit-dbfs-usage:+2:0:+2:34] The use of default dbfs: references is deprecated: /mnt/foo/bar +-- ucx[dbfs-usage:+1:19:+1:33] Deprecated file system path: /mnt/foo/bar +-- MAGIC spark.read.parquet("/mnt/foo/bar") +-- ucx[direct-filesystem-access:+2:0:+2:39] The use of direct filesystem references is deprecated: dbfs:/mnt/foo/bar +-- ucx[dbfs-usage:+1:19:+1:38] Deprecated file system path: dbfs:/mnt/foo/bar +-- MAGIC spark.read.parquet("dbfs:/mnt/foo/bar") +-- ucx[direct-filesystem-access:+2:0:+2:40] The use of direct filesystem references is deprecated: dbfs://mnt/foo/bar +-- ucx[dbfs-usage:+1:19:+1:39] Deprecated file system path: dbfs://mnt/foo/bar +-- MAGIC spark.read.parquet("dbfs://mnt/foo/bar") +-- MAGIC # Would need a stateful linter to detect this next one +-- MAGIC spark.read.parquet(DBFS) + +-- COMMAND ---------- +-- ucx[dbfs-read-from-sql-query:+0:0:+0:1024] The use of DBFS is deprecated: dbfs:/... +-- DBTITLE 1,A SQL cell that references DBFS + +SELECT * FROM parquet.`dbfs:/...` LIMIT 10 + +-- COMMAND ---------- +-- ucx[dbfs-read-from-sql-query:+0:0:+0:1024] The use of DBFS is deprecated: /mnt/... +-- DBTITLE 1,A SQL cell that references DBFS +SELECT * FROM delta.`/mnt/...` WHERE foo > 6 + +-- COMMAND ---------- +-- ucx[dbfs-read-from-sql-query:+0:0:+0:1024] The use of DBFS is deprecated: /a/b/c +-- DBTITLE 1,A SQL cell that references DBFS + SELECT * FROM json.`/a/b/c` WHERE foo > 6 + +-- COMMAND ---------- +-- ucx[dbfs-read-from-sql-query:+0:0:+0:1024] The use of DBFS is deprecated: /... 
+-- DBTITLE 1,A SQL cell that references DBFS + DELETE FROM json.`/...` WHERE foo = 'bar' + +-- COMMAND ---------- +-- ucx[dbfs-read-from-sql-query:+0:0:+0:1024] The use of DBFS is deprecated: /dbfs/... +-- DBTITLE 1,A SQL cell that references DBFS + +MERGE INTO delta.`/dbfs/...` t USING source ON t.key = source.key WHEN MATCHED THEN DELETE + diff --git a/tests/unit/source_code/samples/functional/file-access/python-notebook-with-embedded-sql.py b/tests/unit/source_code/samples/functional/file-access/python-notebook-with-embedded-sql.py new file mode 100644 index 0000000000..e0b7504e09 --- /dev/null +++ b/tests/unit/source_code/samples/functional/file-access/python-notebook-with-embedded-sql.py @@ -0,0 +1,31 @@ +# Databricks notebook source +# MAGIC %md # This is a Python notebook, that has SQL cell embedded + +# COMMAND ---------- + +# ucx[implicit-dbfs-usage:+2:8:+2:43] The use of default dbfs: references is deprecated: /mnt/things/e/f/g +# ucx[dbfs-usage:+1:23:+1:42] Deprecated file system path: /mnt/things/e/f/g +display(spark.read.csv('/mnt/things/e/f/g')) + +# COMMAND ---------- + +# ucx[dbfs-read-from-sql-query:+0:0:+0:1024] The use of DBFS is deprecated: dbfs:/mnt/foo +# MAGIC %sql SELECT * FROM csv.`dbfs:/mnt/foo` + +# COMMAND ---------- + +# MAGIC %md mess around with formatting + + + + +# COMMAND ---------- + + +# ucx[dbfs-read-from-sql-query:+0:0:+0:1024] The use of DBFS is deprecated: dbfs:/mnt/bar/e/f/g +# MAGIC %sql +# MAGIC SELECT * FROM +# MAGIC csv.`dbfs:/mnt/bar/e/f/g` +# MAGIC WHERE _c1 > 5 + + diff --git a/tests/unit/source_code/samples/functional/file-access/sql-notebook-with-embedded-python.sql b/tests/unit/source_code/samples/functional/file-access/sql-notebook-with-embedded-python.sql new file mode 100644 index 0000000000..d6bf93b291 --- /dev/null +++ b/tests/unit/source_code/samples/functional/file-access/sql-notebook-with-embedded-python.sql @@ -0,0 +1,18 @@ +-- Databricks notebook source +-- MAGIC %md # This is a SQL notebook, that has Python cell embedded + +-- COMMAND ---------- + +-- ucx[dbfs-read-from-sql-query:+0:0:+0:1024] The use of DBFS is deprecated: dbfs:/mnt/whatever +SELECT * FROM csv.`dbfs:/mnt/whatever` + + + + + +-- COMMAND ---------- + +-- MAGIC %python +-- ucx[implicit-dbfs-usage:+2:8:+2:43] The use of default dbfs: references is deprecated: /mnt/things/e/f/g +-- ucx[dbfs-usage:+1:23:+1:42] Deprecated file system path: /mnt/things/e/f/g +-- MAGIC display(spark.read.csv('/mnt/things/e/f/g')) diff --git a/tests/unit/source_code/samples/functional/table-migration/table-migration-notebook.py b/tests/unit/source_code/samples/functional/table-migration/table-migration-notebook.py new file mode 100644 index 0000000000..22c8325035 --- /dev/null +++ b/tests/unit/source_code/samples/functional/table-migration/table-migration-notebook.py @@ -0,0 +1,35 @@ +# Databricks notebook source +# MAGIC %md +# MAGIC #Test notebook for Use tracking in Notebooks + +# COMMAND ---------- + +# ucx[table-migrated-to-uc:+2:8:+2:29] Table people is migrated to cata4.nondefault.newpeople in Unity Catalog +# ucx[default-format-changed-in-dbr8:+1:8:+1:29] The default format changed in Databricks Runtime 8.0, from Parquet to Delta +display(spark.table('people')) # we are looking at default.people table + +# COMMAND ---------- + +# MAGIC %sql USE something + +# COMMAND ---------- + +# ucx[table-migrated-to-uc:+2:8:+2:30] Table persons is migrated to cata4.newsomething.persons in Unity Catalog +# ucx[default-format-changed-in-dbr8:+1:8:+1:30] The default format changed in 
Databricks Runtime 8.0, from Parquet to Delta +display(spark.table('persons')) # we are looking at something.persons table + +# COMMAND ---------- + +spark.sql('USE whatever') + +# COMMAND ---------- + +# ucx[table-migrated-to-uc:+2:8:+2:30] Table kittens is migrated to cata4.felines.toms in Unity Catalog +# ucx[default-format-changed-in-dbr8:+1:8:+1:30] The default format changed in Databricks Runtime 8.0, from Parquet to Delta +display(spark.table('kittens')) # we are looking at whatever.kittens table + +# COMMAND ---------- + +# ucx[table-migrated-to-uc:+2:0:+2:38] Table numbers is migrated to cata4.counting.numbers in Unity Catalog +# ucx[default-format-changed-in-dbr8:+1:0:+1:38] The default format changed in Databricks Runtime 8.0, from Parquet to Delta +spark.range(10).saveAsTable('numbers') # we are saving to whatever.numbers table. diff --git a/tests/unit/source_code/samples/functional/table-migration/table-migration-notebook.sql b/tests/unit/source_code/samples/functional/table-migration/table-migration-notebook.sql new file mode 100644 index 0000000000..d2c6acb4e8 --- /dev/null +++ b/tests/unit/source_code/samples/functional/table-migration/table-migration-notebook.sql @@ -0,0 +1,67 @@ +-- Databricks notebook source +-- MAGIC %md +-- MAGIC #Test notebook for Use tracking in Notebooks + +-- COMMAND ---------- +-- DBTITLE 1,A SQL cell that changes the DB + +USE different_db + +-- COMMAND ---------- +-- ucx[table-migrated-to-uc:+0:0:+0:1024] Table different_db.testtable is migrated to cata2.newspace.table in Unity Catalog +-- DBTITLE 1,A SQL cell that references tables + +SELECT * FROM testtable LIMIT 10 + +-- COMMAND ---------- +-- DBTITLE 1,A SQL cell that changes the DB to one we migrate from + +USE old + +-- COMMAND ---------- +-- ucx[table-migrated-to-uc:+0:0:+0:1024] Table old.testtable is migrated to cata3.newspace.table in Unity Catalog +-- DBTITLE 1,A SQL cell that references tables + +SELECT * FROM testtable LIMIT 10 + +-- COMMAND ---------- +-- ucx[table-migrated-to-uc:+0:0:+0:1024] Table old.stuff is migrated to brand.new.things in Unity Catalog +-- DBTITLE 1,A SQL cell that references tables + +SELECT * FROM stuff LIMIT 10 + +-- COMMAND ---------- +-- DBTITLE 1,A Python cell that uses calls to change the USE +-- MAGIC %python +-- MAGIC # This is a Python cell that uses calls to change the USE... 
+ +spark.sql("use different_db") + +-- COMMAND ---------- +-- ucx[table-migrated-to-uc:+0:0:+0:1024] Table different_db.testtable is migrated to cata2.newspace.table in Unity Catalog +-- DBTITLE 1,A SQL cell that references DBFS + +SELECT * FROM testtable LIMIT 10 + +-- COMMAND ---------- +-- ucx[table-migrated-to-uc:+0:0:+0:1024] Table old.testtable is migrated to cata3.newspace.table in Unity Catalog +-- DBTITLE 1,A SQL cell that references DBFS + +SELECT * FROM old.testtable LIMIT 10 + +-- COMMAND ---------- +-- DBTITLE 1,A SQL cell that changes the DB to the default + +USE default + +-- COMMAND ---------- +-- ucx[table-migrated-to-uc:+0:0:+0:1024] Table default.testtable is migrated to cata.nondefault.table in Unity Catalog +-- DBTITLE 1,A SQL cell that references DBFS + +SELECT * FROM testtable LIMIT 10 + +-- COMMAND ---------- +-- DBTITLE 1,A SQL cell that references tables + +MERGE INTO catalog.schema.testtable t USING source ON t.key = source.key WHEN MATCHED THEN DELETE + From 6d911ca4c1cca79fe1653e97e32e50fbdaa41aae Mon Sep 17 00:00:00 2001 From: Eric Vergnaud Date: Fri, 30 Aug 2024 12:17:29 +0200 Subject: [PATCH 04/80] move test to functional --- .../unit/source_code/test_notebook_linter.py | 534 ------------------ 1 file changed, 534 deletions(-) diff --git a/tests/unit/source_code/test_notebook_linter.py b/tests/unit/source_code/test_notebook_linter.py index a371a56ea5..17c00feee6 100644 --- a/tests/unit/source_code/test_notebook_linter.py +++ b/tests/unit/source_code/test_notebook_linter.py @@ -8,542 +8,8 @@ index = MigrationIndex([]) -@pytest.mark.parametrize( - "lang, source, expected", - [ - # 2 alerts - ( - Language.SQL, - """-- Databricks notebook source --- MAGIC %md # This is a SQL notebook, that has Python cell embedded - --- COMMAND ---------- - -SELECT * FROM csv.`dbfs:/mnt/whatever` - - - - - --- COMMAND ---------- - --- MAGIC %python --- MAGIC display(spark.read.csv('/mnt/things/e/f/g')) -""", - [ - Deprecation( - code='dbfs-read-from-sql-query', - message='The use of DBFS is deprecated: dbfs:/mnt/whatever', - start_line=5, - start_col=0, - end_line=5, - end_col=1024, - ), - Deprecation( - code='implicit-dbfs-usage', - message='The use of default dbfs: references is deprecated: /mnt/things/e/f/g', - start_line=14, - start_col=8, - end_line=14, - end_col=43, - ), - Deprecation( - code='dbfs-usage', - message='Deprecated file system path: /mnt/things/e/f/g', - start_line=14, - start_col=23, - end_line=14, - end_col=42, - ), - ], - ), - ( - Language.PYTHON, - # 3 alerts - """# Databricks notebook source -# MAGIC %md # This is a Python notebook, that has SQL cell embedded - -# COMMAND ---------- - -display(spark.read.csv('/mnt/things/e/f/g')) - -# COMMAND ---------- - -# MAGIC %sql SELECT * FROM csv.`dbfs:/mnt/foo` - -# COMMAND ---------- - -# MAGIC %md mess around with formatting - - - - -# COMMAND ---------- - - -# MAGIC %sql -# MAGIC SELECT * FROM -# MAGIC csv.`dbfs:/mnt/bar/e/f/g` -# MAGIC WHERE _c1 > 5 - - - -""", - [ - Deprecation( - code='implicit-dbfs-usage', - message='The use of default dbfs: references is deprecated: ' '/mnt/things/e/f/g', - start_line=5, - start_col=8, - end_line=5, - end_col=43, - ), - Deprecation( - code='dbfs-usage', - message='Deprecated file system path: /mnt/things/e/f/g', - start_line=5, - start_col=23, - end_line=5, - end_col=42, - ), - Deprecation( - code='dbfs-read-from-sql-query', - message='The use of DBFS is deprecated: dbfs:/mnt/foo', - start_line=9, - start_col=0, - end_line=9, - end_col=1024, - ), - Deprecation( - 
code='dbfs-read-from-sql-query', - message='The use of DBFS is deprecated: dbfs:/mnt/bar/e/f/g', - start_line=21, - start_col=0, - end_line=21, - end_col=1024, - ), - ], - ), - ( - Language.SQL, - """-- Databricks notebook source --- MAGIC %md --- MAGIC #Test notebook for DBFS discovery in Notebooks - --- COMMAND ---------- --- DBTITLE 1,A Python cell that references DBFS --- MAGIC %python --- MAGIC DBFS = "dbfs:/..." --- MAGIC DBFS = "/dbfs/mnt" --- MAGIC DBFS = "/mnt/" --- MAGIC DBFS = "dbfs:/..." --- MAGIC load_data('/dbfs/mnt/data') --- MAGIC load_data('/data') --- MAGIC load_data('/dbfs/mnt/data', '/data') --- MAGIC # load_data('/dbfs/mnt/data', '/data') --- MAGIC spark.read.parquet("/mnt/foo/bar") --- MAGIC spark.read.parquet("dbfs:/mnt/foo/bar") --- MAGIC spark.read.parquet("dbfs://mnt/foo/bar") --- MAGIC # Would need a stateful linter to detect this next one --- MAGIC spark.read.parquet(DBFS) - --- COMMAND ---------- --- DBTITLE 1,A SQL cell that references DBFS - -SELECT * FROM parquet.`dbfs:/...` LIMIT 10 - --- COMMAND ---------- --- DBTITLE 1,A SQL cell that references DBFS -SELECT * FROM delta.`/mnt/...` WHERE foo > 6 - --- COMMAND ---------- --- DBTITLE 1,A SQL cell that references DBFS - SELECT * FROM json.`/a/b/c` WHERE foo > 6 - --- COMMAND ---------- --- DBTITLE 1,A SQL cell that references DBFS - DELETE FROM json.`/...` WHERE foo = 'bar' - --- COMMAND ---------- --- DBTITLE 1,A SQL cell that references DBFS - -MERGE INTO delta.`/dbfs/...` t USING source ON t.key = source.key WHEN MATCHED THEN DELETE - """, - [ - Deprecation( - code='implicit-dbfs-usage', - message='The use of default dbfs: references is deprecated: /mnt/foo/bar', - start_line=15, - start_col=0, - end_line=15, - end_col=34, - ), - Deprecation( - code='direct-filesystem-access', - message='The use of direct filesystem references is deprecated: dbfs:/mnt/foo/bar', - start_line=16, - start_col=0, - end_line=16, - end_col=39, - ), - Deprecation( - code='direct-filesystem-access', - message='The use of direct filesystem references is deprecated: dbfs://mnt/foo/bar', - start_line=17, - start_col=0, - end_line=17, - end_col=40, - ), - Deprecation( - code='dbfs-usage', - message='Deprecated file system path: dbfs:/...', - start_line=7, - start_col=7, - end_line=7, - end_col=18, - ), - Deprecation( - code='dbfs-usage', - message='Deprecated file system path: /dbfs/mnt', - start_line=8, - start_col=7, - end_line=8, - end_col=18, - ), - Deprecation( - code='dbfs-usage', - message='Deprecated file system path: /mnt/', - start_line=9, - start_col=7, - end_line=9, - end_col=14, - ), - Deprecation( - code='dbfs-usage', - message='Deprecated file system path: dbfs:/...', - start_line=10, - start_col=7, - end_line=10, - end_col=18, - ), - Deprecation( - code='dbfs-usage', - message='Deprecated file system path: /dbfs/mnt/data', - start_line=11, - start_col=10, - end_line=11, - end_col=26, - ), - Deprecation( - code='dbfs-usage', - message='Deprecated file system path: /dbfs/mnt/data', - start_line=13, - start_col=10, - end_line=13, - end_col=26, - ), - Deprecation( - code='dbfs-usage', - message='Deprecated file system path: /mnt/foo/bar', - start_line=15, - start_col=19, - end_line=15, - end_col=33, - ), - Deprecation( - code='dbfs-usage', - message='Deprecated file system path: dbfs:/mnt/foo/bar', - start_line=16, - start_col=19, - end_line=16, - end_col=38, - ), - Deprecation( - code='dbfs-usage', - message='Deprecated file system path: dbfs://mnt/foo/bar', - start_line=17, - start_col=19, - end_line=17, - end_col=39, - 
), - Deprecation( - code='dbfs-read-from-sql-query', - message='The use of DBFS is deprecated: dbfs:/...', - start_line=22, - start_col=0, - end_line=22, - end_col=1024, - ), - Deprecation( - code='dbfs-read-from-sql-query', - message='The use of DBFS is deprecated: /mnt/...', - start_line=27, - start_col=0, - end_line=27, - end_col=1024, - ), - Deprecation( - code='dbfs-read-from-sql-query', - message='The use of DBFS is deprecated: /a/b/c', - start_line=31, - start_col=0, - end_line=31, - end_col=1024, - ), - Deprecation( - code='dbfs-read-from-sql-query', - message='The use of DBFS is deprecated: /...', - start_line=35, - start_col=0, - end_line=35, - end_col=1024, - ), - Deprecation( - code='dbfs-read-from-sql-query', - message='The use of DBFS is deprecated: /dbfs/...', - start_line=39, - start_col=0, - end_line=39, - end_col=1024, - ), - ], - ), - # Add more test cases here - ], -) -def test_notebook_linter(lang, source, expected, mock_path_lookup): - # SQLGlot does not propagate tokens yet. See https://github.com/tobymao/sqlglot/issues/3159 - # Hence SQL statement advice offsets can be wrong because of comments and statements - # over multiple lines. - linter = NotebookLinter.from_source(index, mock_path_lookup, CurrentSessionState(), source, lang) - assert linter is not None - gathered = list(linter.lint()) - assert gathered == expected - - def test_notebook_linter_name(mock_path_lookup): source = """-- Databricks notebook source""" linter = NotebookLinter.from_source(index, mock_path_lookup, CurrentSessionState(), source, Language.SQL) assert linter.name() == "notebook-linter" - -@pytest.mark.parametrize( - "lang, source, expected", - [ - ( - Language.SQL, - """-- Databricks notebook source --- MAGIC %md --- MAGIC #Test notebook for Use tracking in Notebooks - --- COMMAND ---------- --- DBTITLE 1,A SQL cell that changes the DB - -USE different_db - --- COMMAND ---------- --- DBTITLE 1,A SQL cell that references tables - -SELECT * FROM testtable LIMIT 10 - --- COMMAND ---------- --- DBTITLE 1,A SQL cell that changes the DB to one we migrate from - -USE old - --- COMMAND ---------- --- DBTITLE 1,A SQL cell that references tables - -SELECT * FROM testtable LIMIT 10 - --- COMMAND ---------- --- DBTITLE 1,A SQL cell that references tables - -SELECT * FROM stuff LIMIT 10 - --- COMMAND ---------- --- DBTITLE 1,A Python cell that uses calls to change the USE --- MAGIC %python --- MAGIC # This is a Python cell that uses calls to change the USE... 
- -spark.sql("use different_db") - --- COMMAND ---------- --- DBTITLE 1,A SQL cell that references DBFS - -SELECT * FROM testtable LIMIT 10 - --- COMMAND ---------- --- DBTITLE 1,A SQL cell that references DBFS - -SELECT * FROM old.testtable LIMIT 10 - --- COMMAND ---------- --- DBTITLE 1,A SQL cell that changes the DB to the default - -USE default - --- COMMAND ---------- --- DBTITLE 1,A SQL cell that references DBFS - -SELECT * FROM testtable LIMIT 10 - --- COMMAND ---------- --- DBTITLE 1,A SQL cell that references tables - -MERGE INTO catalog.schema.testtable t USING source ON t.key = source.key WHEN MATCHED THEN DELETE - """, - [ - Deprecation( - code='table-migrated-to-uc', - message='Table different_db.testtable is migrated to cata2.newspace.table in Unity Catalog', - start_line=10, - start_col=0, - end_line=10, - end_col=1024, - ), - Deprecation( - code='table-migrated-to-uc', - message='Table old.testtable is migrated to cata3.newspace.table in Unity Catalog', - start_line=20, - start_col=0, - end_line=20, - end_col=1024, - ), - Deprecation( - code='table-migrated-to-uc', - message='Table old.stuff is migrated to brand.new.things in Unity Catalog', - start_line=25, - start_col=0, - end_line=25, - end_col=1024, - ), - Deprecation( - code='table-migrated-to-uc', - message='Table different_db.testtable is migrated to ' 'cata2.newspace.table in Unity Catalog', - start_line=37, - start_col=0, - end_line=37, - end_col=1024, - ), - Deprecation( - code='table-migrated-to-uc', - message='Table old.testtable is migrated to cata3.newspace.table in Unity Catalog', - start_line=42, - start_col=0, - end_line=42, - end_col=1024, - ), - Deprecation( - code='table-migrated-to-uc', - message='Table default.testtable is migrated to cata.nondefault.table in Unity Catalog', - start_line=52, - start_col=0, - end_line=52, - end_col=1024, - ), - ], - ), - ( - Language.PYTHON, - """# Databricks notebook source ---- MAGIC %md --- MAGIC #Test notebook for Use tracking in Notebooks - -# COMMAND ---------- - -display(spark.table('people')) # we are looking at default.people table - -# COMMAND ---------- - -# MAGIC %sql USE something - -# COMMAND ---------- - -display(spark.table('persons')) # we are looking at something.persons table - -# COMMAND ---------- - -spark.sql('USE whatever') - -# COMMAND ---------- - -display(spark.table('kittens')) # we are looking at whatever.kittens table - -# COMMAND ---------- - -spark.range(10).saveAsTable('numbers') # we are saving to whatever.numbers table.""", - [ - Deprecation( - code='table-migrated-to-uc', - message='Table people is migrated to cata4.nondefault.newpeople in Unity Catalog', - start_line=6, - start_col=8, - end_line=6, - end_col=29, - ), - Advice( - code='default-format-changed-in-dbr8', - message='The default format changed in Databricks Runtime 8.0, from Parquet to Delta', - start_line=6, - start_col=8, - end_line=6, - end_col=29, - ), - Deprecation( - code='table-migrated-to-uc', - message='Table persons is migrated to cata4.newsomething.persons in Unity Catalog', - start_line=14, - start_col=8, - end_line=14, - end_col=30, - ), - Advice( - code='default-format-changed-in-dbr8', - message='The default format changed in Databricks Runtime 8.0, from Parquet to Delta', - start_line=14, - start_col=8, - end_line=14, - end_col=30, - ), - Deprecation( - code='table-migrated-to-uc', - message='Table kittens is migrated to cata4.felines.toms in Unity Catalog', - start_line=22, - start_col=8, - end_line=22, - end_col=30, - ), - Advice( - 
code='default-format-changed-in-dbr8', - message='The default format changed in Databricks Runtime 8.0, from Parquet to Delta', - start_line=22, - start_col=8, - end_line=22, - end_col=30, - ), - Deprecation( - code='table-migrated-to-uc', - message='Table numbers is migrated to cata4.counting.numbers in Unity Catalog', - start_line=26, - start_col=0, - end_line=26, - end_col=38, - ), - Advice( - code='default-format-changed-in-dbr8', - message='The default format changed in Databricks Runtime 8.0, from Parquet to Delta', - start_line=26, - start_col=0, - end_line=26, - end_col=38, - ), - ], - ), - ], -) -def test_notebook_linter_tracks_use(extended_test_index, lang, source, expected, mock_path_lookup): - linter = NotebookLinter.from_source(extended_test_index, mock_path_lookup, CurrentSessionState(), source, lang) - assert linter is not None - advices = list(linter.lint()) - assert advices == expected From 5f07dc83665fffa64fbae40f773958071cacf97f Mon Sep 17 00:00:00 2001 From: Eric Vergnaud Date: Fri, 30 Aug 2024 12:23:21 +0200 Subject: [PATCH 05/80] formatting --- tests/unit/source_code/test_functional.py | 15 +++++++++++---- tests/unit/source_code/test_notebook_linter.py | 4 +--- 2 files changed, 12 insertions(+), 7 deletions(-) diff --git a/tests/unit/source_code/test_functional.py b/tests/unit/source_code/test_functional.py index 8b21891364..ed2f6fe6dc 100644 --- a/tests/unit/source_code/test_functional.py +++ b/tests/unit/source_code/test_functional.py @@ -11,7 +11,7 @@ import pytest -from databricks.labs.ucx.hive_metastore.migration_status import MigrationIndex, MigrationStatus +from databricks.labs.ucx.hive_metastore.migration_status import MigrationIndex from databricks.labs.ucx.source_code.base import Advice, CurrentSessionState, is_a_notebook from databricks.labs.ucx.source_code.graph import Dependency, DependencyGraph, DependencyResolver from databricks.labs.ucx.source_code.linters.context import LinterContext @@ -66,6 +66,7 @@ def from_advice(cls, advice: Advice) -> Expectation: _UCX_REGEX_SUFFIX = r" ucx\[(?P[\w-]+):(?P[\d+]+):(?P[\d]+):(?P[\d+]+):(?P[\d]+)] (?P.*)" _STATE_REGEX_SUFFIX = r' ucx\[session-state] (?P\{.*})' + class Functional: _ucx_regex = { @@ -106,7 +107,9 @@ def __init__(self, path: Path, parent: Path | None = None) -> None: self.parent = parent self.language = CellLanguage.PYTHON if path.suffix.endswith("py") else CellLanguage.SQL - def verify(self, path_lookup: PathLookup, dependency_resolver: DependencyResolver, migration_index: MigrationIndex) -> None: + def verify( + self, path_lookup: PathLookup, dependency_resolver: DependencyResolver, migration_index: MigrationIndex + ) -> None: expected_problems = list(self._expected_problems()) actual_advices = list(self._lint(path_lookup, dependency_resolver, migration_index)) # Convert the actual problems to the same type as our expected problems for easier comparison. 
@@ -128,7 +131,9 @@ def verify(self, path_lookup: PathLookup, dependency_resolver: DependencyResolve assert no_errors, "\n".join(errors) # TODO: output annotated file with comments for quick fixing - def _lint(self, path_lookup: PathLookup, dependency_resolver: DependencyResolver, migration_index: MigrationIndex) -> Iterable[Advice]: + def _lint( + self, path_lookup: PathLookup, dependency_resolver: DependencyResolver, migration_index: MigrationIndex + ) -> Iterable[Advice]: session_state = self._test_session_state() print(str(session_state)) session_state.named_parameters = {"my-widget": "my-path.py"} @@ -234,7 +239,9 @@ def test_functional(sample: Functional, mock_path_lookup, simple_dependency_reso ("_child_that_uses_value_from_parent.py", "grand_parent_that_imports_parent_that_magic_runs_child.py"), ], ) -def test_functional_with_parent(child: str, parent: str, mock_path_lookup, simple_dependency_resolver, extended_test_index) -> None: +def test_functional_with_parent( + child: str, parent: str, mock_path_lookup, simple_dependency_resolver, extended_test_index +) -> None: sample = Functional.for_child(child, parent) path_lookup = mock_path_lookup.change_directory(sample.path.parent) sample.verify(path_lookup, simple_dependency_resolver, extended_test_index) diff --git a/tests/unit/source_code/test_notebook_linter.py b/tests/unit/source_code/test_notebook_linter.py index 17c00feee6..f4a7785317 100644 --- a/tests/unit/source_code/test_notebook_linter.py +++ b/tests/unit/source_code/test_notebook_linter.py @@ -1,8 +1,7 @@ -import pytest from databricks.sdk.service.workspace import Language from databricks.labs.ucx.hive_metastore.migration_status import MigrationIndex -from databricks.labs.ucx.source_code.base import Deprecation, Advice, CurrentSessionState +from databricks.labs.ucx.source_code.base import CurrentSessionState from databricks.labs.ucx.source_code.notebooks.sources import NotebookLinter index = MigrationIndex([]) @@ -12,4 +11,3 @@ def test_notebook_linter_name(mock_path_lookup): source = """-- Databricks notebook source""" linter = NotebookLinter.from_source(index, mock_path_lookup, CurrentSessionState(), source, Language.SQL) assert linter.name() == "notebook-linter" - From a8f3ecdb13d0e18d2d8333e31ef2533e5497f7c7 Mon Sep 17 00:00:00 2001 From: Eric Vergnaud Date: Fri, 30 Aug 2024 12:52:33 +0200 Subject: [PATCH 06/80] formatting --- .../functional/file-access/sql-notebook-with-embedded-python.sql | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/unit/source_code/samples/functional/file-access/sql-notebook-with-embedded-python.sql b/tests/unit/source_code/samples/functional/file-access/sql-notebook-with-embedded-python.sql index d6bf93b291..2a9361fad5 100644 --- a/tests/unit/source_code/samples/functional/file-access/sql-notebook-with-embedded-python.sql +++ b/tests/unit/source_code/samples/functional/file-access/sql-notebook-with-embedded-python.sql @@ -16,3 +16,4 @@ SELECT * FROM csv.`dbfs:/mnt/whatever` -- ucx[implicit-dbfs-usage:+2:8:+2:43] The use of default dbfs: references is deprecated: /mnt/things/e/f/g -- ucx[dbfs-usage:+1:23:+1:42] Deprecated file system path: /mnt/things/e/f/g -- MAGIC display(spark.read.csv('/mnt/things/e/f/g')) + From 505402fc9326ac594723ca34cac18b21269d90f6 Mon Sep 17 00:00:00 2001 From: Eric Vergnaud Date: Mon, 2 Sep 2024 16:49:14 +0200 Subject: [PATCH 07/80] enhance dbfs linting to all direct file system access --- .../labs/ucx/source_code/linters/context.py | 6 +- .../labs/ucx/source_code/linters/dbfs.py | 124 ------------ 
.../labs/ucx/source_code/linters/dfsa.py | 176 ++++++++++++++++++ tests/unit/source_code/linters/test_dbfs.py | 130 ------------- tests/unit/source_code/linters/test_dfsa.py | 147 +++++++++++++++ 5 files changed, 326 insertions(+), 257 deletions(-) delete mode 100644 src/databricks/labs/ucx/source_code/linters/dbfs.py create mode 100644 src/databricks/labs/ucx/source_code/linters/dfsa.py delete mode 100644 tests/unit/source_code/linters/test_dbfs.py create mode 100644 tests/unit/source_code/linters/test_dfsa.py diff --git a/src/databricks/labs/ucx/source_code/linters/context.py b/src/databricks/labs/ucx/source_code/linters/context.py index 7b87b3f2c2..1106b85612 100644 --- a/src/databricks/labs/ucx/source_code/linters/context.py +++ b/src/databricks/labs/ucx/source_code/linters/context.py @@ -12,7 +12,7 @@ PythonLinter, SqlLinter, ) -from databricks.labs.ucx.source_code.linters.dbfs import DbfsUsageSqlLinter, DBFSUsagePyLinter +from databricks.labs.ucx.source_code.linters.dfsa import DfsaPyLinter, DfsaSqlLinter from databricks.labs.ucx.source_code.linters.imports import DbutilsPyLinter from databricks.labs.ucx.source_code.linters.pyspark import SparkSqlPyLinter @@ -40,12 +40,12 @@ def __init__(self, index: MigrationIndex | None = None, session_state: CurrentSe python_fixers.append(SparkSqlPyLinter(from_table, index, session_state)) python_linters += [ - DBFSUsagePyLinter(session_state), + DfsaPyLinter(session_state), DBRv8d0PyLinter(dbr_version=session_state.dbr_version), SparkConnectPyLinter(session_state), DbutilsPyLinter(session_state), ] - sql_linters.append(DbfsUsageSqlLinter()) + sql_linters.append(DfsaSqlLinter()) self._linters: dict[Language, list[SqlLinter] | list[PythonLinter]] = { Language.PYTHON: python_linters, diff --git a/src/databricks/labs/ucx/source_code/linters/dbfs.py b/src/databricks/labs/ucx/source_code/linters/dbfs.py deleted file mode 100644 index 46a617fafc..0000000000 --- a/src/databricks/labs/ucx/source_code/linters/dbfs.py +++ /dev/null @@ -1,124 +0,0 @@ -import logging -from collections.abc import Iterable - -from astroid import Call, Const, InferenceError, NodeNG # type: ignore -from sqlglot import Expression -from sqlglot.expressions import Table - -from databricks.labs.ucx.source_code.base import ( - Advice, - Deprecation, - CurrentSessionState, - PythonLinter, - SqlLinter, -) -from databricks.labs.ucx.source_code.linters.python_ast import Tree, TreeVisitor -from databricks.labs.ucx.source_code.linters.python_infer import InferredValue - -logger = logging.getLogger(__name__) - - -class DetectDbfsVisitor(TreeVisitor): - """ - Visitor that detects file system paths in Python code and checks them - against a list of known deprecated paths. - """ - - def __init__(self, session_state: CurrentSessionState) -> None: - self._session_state = session_state - self._advices: list[Advice] = [] - self._fs_prefixes = ["/dbfs/mnt", "dbfs:/", "/mnt/"] - self._reported_locations: set[tuple[int, int]] = set() # Set to store reported locations; astroid coordinates! 
- - def visit_call(self, node: Call): - for arg in node.args: - self._visit_arg(arg) - - def _visit_arg(self, arg: NodeNG): - try: - for inferred in InferredValue.infer_from_node(arg, self._session_state): - if not inferred.is_inferred(): - logger.debug(f"Could not infer value of {arg.as_string()}") - continue - self._check_str_constant(arg, inferred) - except InferenceError as e: - logger.debug(f"Could not infer value of {arg.as_string()}", exc_info=e) - - def visit_const(self, node: Const): - # Constant strings yield Advisories - if isinstance(node.value, str): - self._check_str_constant(node, InferredValue([node])) - - def _check_str_constant(self, source_node, inferred: InferredValue): - if self._already_reported(source_node, inferred): - return - value = inferred.as_string() - if any(value.startswith(prefix) for prefix in self._fs_prefixes): - advisory = Deprecation.from_node( - code='dbfs-usage', - message=f"Deprecated file system path: {value}", - node=source_node, - ) - self._advices.append(advisory) - - def _already_reported(self, source_node: NodeNG, inferred: InferredValue): - all_nodes = [source_node] - all_nodes.extend(inferred.nodes) - reported = any((node.lineno, node.col_offset) in self._reported_locations for node in all_nodes) - for node in all_nodes: - self._reported_locations.add((node.lineno, node.col_offset)) - return reported - - def get_advices(self) -> Iterable[Advice]: - yield from self._advices - - -class DBFSUsagePyLinter(PythonLinter): - - def __init__(self, session_state: CurrentSessionState): - self._session_state = session_state - - @staticmethod - def name() -> str: - """ - Returns the name of the linter, for reporting etc - """ - return 'dbfs-usage' - - def lint_tree(self, tree: Tree) -> Iterable[Advice]: - """ - Lints the code looking for file system paths that are deprecated - """ - visitor = DetectDbfsVisitor(self._session_state) - visitor.visit(tree.node) - yield from visitor.get_advices() - - -class DbfsUsageSqlLinter(SqlLinter): - def __init__(self): - self._dbfs_prefixes = ["/dbfs/mnt", "dbfs:/", "/mnt/", "/dbfs/", "/"] - - @staticmethod - def name() -> str: - return 'dbfs-query' - - def lint_expression(self, expression: Expression): - for table in expression.find_all(Table): - # Check table names for deprecated DBFS table names - yield from self._check_dbfs_folder(table) - - def _check_dbfs_folder(self, table: Table) -> Iterable[Advice]: - """ - Check if the table is a DBFS table or reference in some way - and yield a deprecation message if it is - """ - if any(table.name.startswith(prefix) for prefix in self._dbfs_prefixes): - yield Deprecation( - code='dbfs-read-from-sql-query', - message=f"The use of DBFS is deprecated: {table.name}", - # SQLGlot does not propagate tokens yet. 
See https://github.com/tobymao/sqlglot/issues/3159 - start_line=0, - start_col=0, - end_line=0, - end_col=1024, - ) diff --git a/src/databricks/labs/ucx/source_code/linters/dfsa.py b/src/databricks/labs/ucx/source_code/linters/dfsa.py new file mode 100644 index 0000000000..a1c131c7f8 --- /dev/null +++ b/src/databricks/labs/ucx/source_code/linters/dfsa.py @@ -0,0 +1,176 @@ +from dataclasses import dataclass +import logging +from abc import ABC +from collections.abc import Iterable + +from astroid import Call, Const, InferenceError, NodeNG # type: ignore +from sqlglot import Expression +from sqlglot.expressions import Table + +from databricks.labs.ucx.source_code.base import ( + Advice, + Deprecation, + CurrentSessionState, + PythonLinter, + SqlLinter, +) +from databricks.labs.ucx.source_code.linters.python_ast import Tree, TreeVisitor +from databricks.labs.ucx.source_code.linters.python_infer import InferredValue + +logger = logging.getLogger(__name__) + + +class DFSAPattern(ABC): + + def __init__(self, prefix: str, allowed_roots: list[str]): + self._prefix = prefix + self._allowed_roots = allowed_roots + + def matches(self, value: str) -> bool: + return value.startswith(self._prefix) and not self._matches_allowed_root(value) + + def _matches_allowed_root(self, value: str): + return any(value.startswith(f"{self._prefix}/{root}") for root in self._allowed_roots) + + +class RootPattern(DFSAPattern): + + def _matches_allowed_root(self, value: str): + return any(value.startswith(f"/{root}") for root in self._allowed_roots) + + +# the below aims to implement https://docs.databricks.com/en/files/index.html +DFSA_PATTERNS = [ + DFSAPattern("dbfs:/", []), + DFSAPattern("file:/", ["Workspace/", "tmp/"]), + DFSAPattern("s3:/", []), + DFSAPattern("s3n:/", []), + DFSAPattern("s3a:/", []), + DFSAPattern("wasb:/", []), + DFSAPattern("wasbs:/", []), + DFSAPattern("abfs:/", []), + DFSAPattern("abfss:/", []), + DFSAPattern("hdfs:/", []), + DFSAPattern("/mnt/", []), + RootPattern("/", ["Volumes/", "Workspace/", "tmp/"]), +] + + +@dataclass +class DFSA: + """A DFSA is a record describing a Direct File System Access""" + + path: str + + +@dataclass +class DFSANode: + dfsa: DFSA + node: NodeNG + + +class _DetectDfsaVisitor(TreeVisitor): + """ + Visitor that detects file system paths in Python code and checks them + against a list of known deprecated paths. 
+ """ + + def __init__(self, session_state: CurrentSessionState, allow_spark_duplicates: bool) -> None: + self._session_state = session_state + self._dfsa_nodes: list[DFSANode] = [] + self._reported_locations: set[tuple[int, int]] = set() + self._allow_spark_duplicates = allow_spark_duplicates + + def visit_call(self, node: Call): + for arg in node.args: + self._visit_arg(arg) + + def _visit_arg(self, arg: NodeNG): + try: + for inferred in InferredValue.infer_from_node(arg, self._session_state): + if not inferred.is_inferred(): + logger.debug(f"Could not infer value of {arg.as_string()}") + continue + self._check_str_constant(arg, inferred) + except InferenceError as e: + logger.debug(f"Could not infer value of {arg.as_string()}", exc_info=e) + + def visit_const(self, node: Const): + # Constant strings yield Advisories + if isinstance(node.value, str): + self._check_str_constant(node, InferredValue([node])) + + def _check_str_constant(self, source_node, inferred: InferredValue): + if self._already_reported(source_node, inferred): + return + # avoid duplicate advices that are reported by SparkSqlPyLinter + if Tree(source_node).is_from_module("spark") and not self._allow_spark_duplicates: + return + value = inferred.as_string() + if any(pattern.matches(value) for pattern in DFSA_PATTERNS): + self._dfsa_nodes.append(DFSANode(DFSA(value), source_node)) + self._reported_locations.add((source_node.lineno, source_node.col_offset)) + + def _already_reported(self, source_node: NodeNG, inferred: InferredValue): + all_nodes = [source_node] + inferred.nodes + return any((node.lineno, node.col_offset) in self._reported_locations for node in all_nodes) + + @property + def dfsa_nodes(self): + return self._dfsa_nodes + + +class DfsaPyLinter(PythonLinter): + + def __init__(self, session_state: CurrentSessionState, allow_spark_duplicates=False): + self._session_state = session_state + self._allow_spark_duplicates = allow_spark_duplicates + + @staticmethod + def name() -> str: + """ + Returns the name of the linter, for reporting etc + """ + return 'dfsa-usage' + + def lint_tree(self, tree: Tree) -> Iterable[Advice]: + """ + Lints the code looking for file system paths that are deprecated + """ + visitor = _DetectDfsaVisitor(self._session_state, self._allow_spark_duplicates) + visitor.visit(tree.node) + for dfsa_node in visitor.dfsa_nodes: + advisory = Deprecation.from_node( + code='direct-filesystem-access', + message=f"The use of direct filesystem references is deprecated: {dfsa_node.dfsa.path}", + node=dfsa_node.node, + ) + yield advisory + + +class DfsaSqlLinter(SqlLinter): + + @staticmethod + def name() -> str: + return 'dfsa-query' + + def lint_expression(self, expression: Expression): + for table in expression.find_all(Table): + # Check table names for direct file system access + yield from self._check_dfsa(table) + + def _check_dfsa(self, table: Table) -> Iterable[Advice]: + """ + Check if the table is a DBFS table or reference in some way + and yield a deprecation message if it is + """ + if any(pattern.matches(table.name) for pattern in DFSA_PATTERNS): + yield Deprecation( + code='direct-filesystem-access-in-sql-query', + message=f"The use of direct filesystem references is deprecated: {table.name}", + # SQLGlot does not propagate tokens yet. 
See https://github.com/tobymao/sqlglot/issues/3159 + start_line=0, + start_col=0, + end_line=0, + end_col=1024, + ) diff --git a/tests/unit/source_code/linters/test_dbfs.py b/tests/unit/source_code/linters/test_dbfs.py deleted file mode 100644 index b71113b0a8..0000000000 --- a/tests/unit/source_code/linters/test_dbfs.py +++ /dev/null @@ -1,130 +0,0 @@ -import pytest - -from databricks.labs.ucx.source_code.base import Deprecation, Advice, CurrentSessionState, Failure -from databricks.labs.ucx.source_code.linters.dbfs import DBFSUsagePyLinter, DbfsUsageSqlLinter - - -class TestDetectDBFS: - @pytest.mark.parametrize( - "code, expected", - [ - ('SOME_CONSTANT = "not a file system path"', 0), - ('SOME_CONSTANT = ("/dbfs/mnt", "dbfs:/", "/mnt/")', 3), - ('# "/dbfs/mnt"', 0), - ('SOME_CONSTANT = "/dbfs/mnt"', 1), - ('SOME_CONSTANT = "/dbfs/mnt"; load_data(SOME_CONSTANT)', 1), - ('SOME_CONSTANT = 42; load_data(SOME_CONSTANT)', 0), - ], - ) - def test_detects_dbfs_paths(self, code, expected): - linter = DBFSUsagePyLinter(CurrentSessionState()) - advices = list(linter.lint(code)) - for advice in advices: - assert isinstance(advice, Advice) - assert len(advices) == expected - - @pytest.mark.parametrize( - "code, expected", - [ - ("load_data('/dbfs/mnt/data')", 1), - ("load_data('/data')", 0), - ("load_data('/dbfs/mnt/data', '/data')", 1), - ("# load_data('/dbfs/mnt/data', '/data')", 0), - ('spark.read.parquet("/mnt/foo/bar")', 1), - ('spark.read.parquet("dbfs:/mnt/foo/bar")', 1), - ('spark.read.parquet("dbfs://mnt/foo/bar")', 1), - ('DBFS="dbfs:/mnt/foo/bar"; spark.read.parquet(DBFS)', 1), - ( - """ -DBFS1="dbfs:/mnt/foo/bar1" -systems=[DBFS1, "dbfs:/mnt/foo/bar2"] -for system in systems: - spark.read.parquet(system) -""", - 2, - ), - ], - ) - def test_dbfs_usage_linter(self, code, expected): - linter = DBFSUsagePyLinter(CurrentSessionState()) - advices = linter.lint(code) - count = 0 - for advice in advices: - if isinstance(advice, Deprecation): - count += 1 - assert count == expected - - def test_dbfs_name(self): - linter = DBFSUsagePyLinter(CurrentSessionState()) - assert linter.name() == "dbfs-usage" - - -@pytest.mark.parametrize( - "query", - [ - "SELECT * FROM old.things LEFT JOIN hive_metastore.other.matters USING (x) WHERE state > 1 LIMIT 10", - "SELECT * FROM json.`s3a://abc/d/e/f`", - "SELECT * FROM delta.`s3a://abc/d/e/f` WHERE foo > 6", - "SELECT * FROM delta.`s3a://foo/bar`", - # Make sure non-sql doesn't just fail - "print('hello')", - "", - ], -) -def test_non_dbfs_trigger_nothing(query): - ftf = DbfsUsageSqlLinter() - assert not list(ftf.lint(query)) - - -@pytest.mark.parametrize( - "query, table", - [ - ('SELECT * FROM parquet.`dbfs:/...` LIMIT 10', "dbfs:/..."), - ("SELECT * FROM delta.`/mnt/...` WHERE foo > 6", "/mnt/..."), - ("SELECT * FROM json.`/a/b/c` WHERE foo > 6", "/a/b/c"), - ("DELETE FROM json.`/...` WHERE foo = 'bar'", "/..."), - ( - "MERGE INTO delta.`/dbfs/...` t USING source ON t.key = source.key WHEN MATCHED THEN DELETE", - "/dbfs/...", - ), - ], -) -def test_dbfs_tables_trigger_messages_param(query: str, table: str): - ftf = DbfsUsageSqlLinter() - actual = list(ftf.lint(query)) - assert actual == [ - Deprecation( - code='dbfs-read-from-sql-query', - message=f'The use of DBFS is deprecated: {table}', - start_line=0, - start_col=0, - end_line=0, - end_col=1024, - ), - ] - - -@pytest.mark.parametrize( - "query", - [ - 'SELECT * FROM {{some_db.some_table}}', - ], -) -def test_dbfs_queries_failure(query: str): - ftf = DbfsUsageSqlLinter() - actual = list(ftf.lint(query)) 
- assert actual == [ - Failure( - code='sql-parse-error', - message=f'SQL expression is not supported yet: {query}', - start_line=0, - start_col=0, - end_line=0, - end_col=1024, - ), - ] - - -def test_dbfs_queries_name(): - ftf = DbfsUsageSqlLinter() - assert ftf.name() == 'dbfs-query' diff --git a/tests/unit/source_code/linters/test_dfsa.py b/tests/unit/source_code/linters/test_dfsa.py new file mode 100644 index 0000000000..e94f1b1192 --- /dev/null +++ b/tests/unit/source_code/linters/test_dfsa.py @@ -0,0 +1,147 @@ +import pytest + +from databricks.labs.ucx.source_code.base import Deprecation, Advice, CurrentSessionState, Failure +from databricks.labs.ucx.source_code.linters.dfsa import DfsaPyLinter, DfsaSqlLinter, DFSA_PATTERNS + + +@pytest.mark.parametrize( + "path, matches", + [ + ("/mnt/foo/bar", True), + ("dbfs:/mnt/foo/bar", True), + ("s3a://bucket1/folder1", True), + ("/dbfs/mnt/foo/bar", True), + ("/tmp/foo", False), + ("table.we.know.nothing.about", False), + ], +) +def test_matches_dfsa_pattern(path, matches): + """see https://github.com/databrickslabs/ucx/issues/2350""" + matched = any(pattern.matches(path) for pattern in DFSA_PATTERNS) + assert matches == matched + + +@pytest.mark.parametrize( + "code, expected", + [ + ('SOME_CONSTANT = "not a file system path"', 0), + ('SOME_CONSTANT = ("/dbfs/mnt", "dbfs:/", "/mnt/")', 3), + ('# "/dbfs/mnt"', 0), + ('SOME_CONSTANT = "/dbfs/mnt"', 1), + ('SOME_CONSTANT = "/dbfs/mnt"; load_data(SOME_CONSTANT)', 1), + ('SOME_CONSTANT = 42; load_data(SOME_CONSTANT)', 0), + ], +) +def test_detects_dfsa_paths(code, expected): + linter = DfsaPyLinter(CurrentSessionState(), allow_spark_duplicates=True) + advices = list(linter.lint(code)) + for advice in advices: + assert isinstance(advice, Advice) + assert len(advices) == expected + +@pytest.mark.parametrize( + "code, expected", + [ + ("load_data('/dbfs/mnt/data')", 1), + ("load_data('/data')", 1), + ("load_data('/dbfs/mnt/data', '/data')", 2), + ("# load_data('/dbfs/mnt/data', '/data')", 0), + ('spark.read.parquet("/mnt/foo/bar")', 1), + ('spark.read.parquet("dbfs:/mnt/foo/bar")', 1), + ('spark.read.parquet("dbfs://mnt/foo/bar")', 1), + ('DBFS="dbfs:/mnt/foo/bar"; spark.read.parquet(DBFS)', 1), + (""" +DBFS1="dbfs:/mnt/foo/bar1" +systems=[DBFS1, "dbfs:/mnt/foo/bar2"] +for system in systems: + spark.read.parquet(system) +""", + 2, + ), + ], + ) +def test_dfsa_usage_linter(code, expected): + linter = DfsaPyLinter(CurrentSessionState(), allow_spark_duplicates=True) + advices = linter.lint(code) + count = 0 + for advice in advices: + if isinstance(advice, Deprecation): + count += 1 + assert count == expected + + +def test_dfsa_name(): + linter = DfsaPyLinter(CurrentSessionState()) + assert linter.name() == "dfsa-usage" + + +@pytest.mark.parametrize( + "query", + [ + "SELECT * FROM old.things LEFT JOIN hive_metastore.other.matters USING (x) WHERE state > 1 LIMIT 10", + # Make sure non-sql doesn't just fail + "print('hello')", + "", + ], +) +def test_non_dfsa_triggers_nothing(query): + ftf = DfsaSqlLinter() + assert not list(ftf.lint(query)) + + +@pytest.mark.parametrize( + "query, table", + [ + ('SELECT * FROM parquet.`dbfs:/...` LIMIT 10', "dbfs:/..."), + ("SELECT * FROM delta.`/mnt/...` WHERE foo > 6", "/mnt/..."), + ("SELECT * FROM json.`/a/b/c` WHERE foo > 6", "/a/b/c"), + ("DELETE FROM json.`/...` WHERE foo = 'bar'", "/..."), + ( + "MERGE INTO delta.`/dbfs/...` t USING source ON t.key = source.key WHEN MATCHED THEN DELETE", + "/dbfs/...", + ), + ("SELECT * FROM json.`s3a://abc/d/e/f`", 
"s3a://abc/d/e/f"), + ("SELECT * FROM delta.`s3a://abc/d/e/f` WHERE foo > 6", "s3a://abc/d/e/f"), + ("SELECT * FROM delta.`s3a://foo/bar`", "s3a://foo/bar"), + ("SELECT * FROM csv.`dbfs:/mnt/foo`", "dbfs:/mnt/foo"), + ], +) +def test_dfsa_tables_trigger_messages_param(query: str, table: str): + ftf = DfsaSqlLinter() + actual = list(ftf.lint(query)) + assert actual == [ + Deprecation( + code='direct-filesystem-access-in-sql-query', + message=f'The use of direct filesystem references is deprecated: {table}', + start_line=0, + start_col=0, + end_line=0, + end_col=1024, + ), + ] + + +@pytest.mark.parametrize( + "query", + [ + 'SELECT * FROM {{some_db.some_table}}', + ], +) +def test_dfsa_queries_failure(query: str): + ftf = DfsaSqlLinter() + actual = list(ftf.lint(query)) + assert actual == [ + Failure( + code='sql-parse-error', + message=f'SQL expression is not supported yet: {query}', + start_line=0, + start_col=0, + end_line=0, + end_col=1024, + ), + ] + + +def test_dfsa_queries_name(): + ftf = DfsaSqlLinter() + assert ftf.name() == 'dfsa-query' From 992ffe7e81eedad8c03f82bb3781e23a6bc8fb20 Mon Sep 17 00:00:00 2001 From: Eric Vergnaud Date: Mon, 2 Sep 2024 16:49:57 +0200 Subject: [PATCH 08/80] use dfsa for pyspark --- .../labs/ucx/source_code/linters/pyspark.py | 47 +++++++------------ .../unit/source_code/linters/test_pyspark.py | 8 ++-- 2 files changed, 20 insertions(+), 35 deletions(-) diff --git a/src/databricks/labs/ucx/source_code/linters/pyspark.py b/src/databricks/labs/ucx/source_code/linters/pyspark.py index 1b3e51d67b..1f1eae7fce 100644 --- a/src/databricks/labs/ucx/source_code/linters/pyspark.py +++ b/src/databricks/labs/ucx/source_code/linters/pyspark.py @@ -1,3 +1,4 @@ +import logging from abc import ABC, abstractmethod from collections.abc import Iterable, Iterator from dataclasses import dataclass @@ -12,11 +13,15 @@ CurrentSessionState, PythonLinter, ) +from databricks.labs.ucx.source_code.linters.dfsa import DFSA_PATTERNS from databricks.labs.ucx.source_code.linters.python_infer import InferredValue from databricks.labs.ucx.source_code.queries import FromTableSqlLinter from databricks.labs.ucx.source_code.linters.python_ast import Tree, TreeHelper +logger = logging.getLogger(__name__) + + @dataclass class Matcher(ABC): method_name: str @@ -178,18 +183,6 @@ def apply(self, from_table: FromTableSqlLinter, index: MigrationIndex, node: Cal @dataclass class DirectFilesystemAccessMatcher(Matcher): - _DIRECT_FS_REFS = { - "s3a://", - "s3n://", - "s3://", - "wasb://", - "wasbs://", - "abfs://", - "abfss://", - "dbfs:/", - "hdfs://", - "file:/", - } def matches(self, node: NodeNG): return ( @@ -203,25 +196,17 @@ def lint( self, from_table: FromTableSqlLinter, index: MigrationIndex, session_state: CurrentSessionState, node: NodeNG ) -> Iterator[Advice]: table_arg = self._get_table_arg(node) - if not isinstance(table_arg, Const): - return - if not table_arg.value: - return - if not isinstance(table_arg.value, str): - return - if any(table_arg.value.startswith(prefix) for prefix in self._DIRECT_FS_REFS): - yield Deprecation.from_node( - code='direct-filesystem-access', - message=f"The use of direct filesystem references is deprecated: {table_arg.value}", - node=node, - ) - return - if table_arg.value.startswith("/") and self._check_call_context(node): - yield Deprecation.from_node( - code='implicit-dbfs-usage', - message=f"The use of default dbfs: references is deprecated: {table_arg.value}", - node=node, - ) + for inferred in InferredValue.infer_from_node(table_arg): + if not 
inferred.is_inferred(): + logger.debug(f"Could not infer value of {table_arg.as_string()}") + continue + value = inferred.as_string() + if any(pattern.matches(value) for pattern in DFSA_PATTERNS): + yield Deprecation.from_node( + code='direct-filesystem-access', + message=f"The use of direct filesystem references is deprecated: {value}", + node=node, + ) def apply(self, from_table: FromTableSqlLinter, index: MigrationIndex, node: Call) -> None: # No transformations to apply diff --git a/tests/unit/source_code/linters/test_pyspark.py b/tests/unit/source_code/linters/test_pyspark.py index cda1997e06..cbb2dd5b15 100644 --- a/tests/unit/source_code/linters/test_pyspark.py +++ b/tests/unit/source_code/linters/test_pyspark.py @@ -346,8 +346,8 @@ def test_spark_sql_fix(migration_index): """spark.read.load("/bucket/path")""", [ Deprecation( - code='implicit-dbfs-usage', - message="The use of default dbfs: references is deprecated: /bucket/path", + code='direct-filesystem-access', + message="The use of direct filesystem references is deprecated: /bucket/path", start_line=0, start_col=0, end_line=0, @@ -559,12 +559,12 @@ def test_spark_cloud_direct_access(empty_index, code, expected): @pytest.mark.parametrize("fs_function", FS_FUNCTIONS) -def test_direct_cloud_access_reports_nothing(empty_index, fs_function): +def test_direct_cloud_access_to_tmp_reports_nothing(empty_index, fs_function): session_state = CurrentSessionState() ftf = FromTableSqlLinter(empty_index, session_state) sqf = SparkSqlPyLinter(ftf, empty_index, session_state) # ls function calls have to be from dbutils.fs, or we ignore them - code = f"""spark.{fs_function}("/bucket/path")""" + code = f"""spark.{fs_function}("/tmp/bucket/path")""" advisories = list(sqf.lint(code)) assert not advisories From 830b0269da399921ddaf6078b202c0b1bf56bf9a Mon Sep 17 00:00:00 2001 From: Eric Vergnaud Date: Mon, 2 Sep 2024 16:50:28 +0200 Subject: [PATCH 09/80] fix duplicate advice --- src/databricks/labs/ucx/source_code/linters/python_ast.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/databricks/labs/ucx/source_code/linters/python_ast.py b/src/databricks/labs/ucx/source_code/linters/python_ast.py index 00baad629f..7ce7228dd9 100644 --- a/src/databricks/labs/ucx/source_code/linters/python_ast.py +++ b/src/databricks/labs/ucx/source_code/linters/python_ast.py @@ -179,6 +179,8 @@ def is_from_module(self, module_name: str) -> bool: return isinstance(self._node.func, Attribute) and Tree(self._node.func.expr).is_from_module(module_name) if isinstance(self._node, Attribute): return Tree(self._node.expr).is_from_module(module_name) + if isinstance(self._node, Const): + return Tree(self._node.parent).is_from_module(module_name) return False def has_global(self, name: str) -> bool: From a1e15e71a651cfcda593bf29d034ed38ee79303c Mon Sep 17 00:00:00 2001 From: Eric Vergnaud Date: Mon, 2 Sep 2024 16:50:44 +0200 Subject: [PATCH 10/80] fix functional tests --- .../samples/functional/file-access/direct-fs.py | 7 +++---- .../file-access/python-notebook-with-embedded-sql.py | 7 +++---- .../file-access/sql-notebook-with-embedded-python.sql | 5 ++--- 3 files changed, 8 insertions(+), 11 deletions(-) diff --git a/tests/unit/source_code/samples/functional/file-access/direct-fs.py b/tests/unit/source_code/samples/functional/file-access/direct-fs.py index 6815bb9627..251c6f6072 100644 --- a/tests/unit/source_code/samples/functional/file-access/direct-fs.py +++ b/tests/unit/source_code/samples/functional/file-access/direct-fs.py @@ -3,13 +3,12 @@ # COMMAND ---------- 
-# ucx[dbfs-usage:+2:23:+2:42] Deprecated file system path: /mnt/things/e/f/g -# ucx[implicit-dbfs-usage:+1:8:+1:43] The use of default dbfs: references is deprecated: /mnt/things/e/f/g +# ucx[direct-filesystem-access:+1:8:+1:43] The use of direct filesystem references is deprecated: /mnt/things/e/f/g display(spark.read.csv('/mnt/things/e/f/g')) # COMMAND ---------- -# ucx[dbfs-read-from-sql-query:+0:0:+0:1024] The use of DBFS is deprecated: dbfs:/mnt/foo +# ucx[direct-filesystem-access-in-sql-query:+0:0:+0:1024] The use of direct filesystem references is deprecated: dbfs:/mnt/foo # MAGIC %sql SELECT * FROM csv.`dbfs:/mnt/foo` # COMMAND ---------- @@ -18,7 +17,7 @@ # COMMAND ---------- -# ucx[dbfs-read-from-sql-query:+0:0:+0:1024] The use of DBFS is deprecated: dbfs:/mnt/bar/e/f/g +# ucx[direct-filesystem-access-in-sql-query:+0:0:+0:1024] The use of direct filesystem references is deprecated: dbfs:/mnt/bar/e/f/g # MAGIC %sql # MAGIC SELECT * FROM # MAGIC csv.`dbfs:/mnt/bar/e/f/g` diff --git a/tests/unit/source_code/samples/functional/file-access/python-notebook-with-embedded-sql.py b/tests/unit/source_code/samples/functional/file-access/python-notebook-with-embedded-sql.py index e0b7504e09..be071b69d7 100644 --- a/tests/unit/source_code/samples/functional/file-access/python-notebook-with-embedded-sql.py +++ b/tests/unit/source_code/samples/functional/file-access/python-notebook-with-embedded-sql.py @@ -3,13 +3,12 @@ # COMMAND ---------- -# ucx[implicit-dbfs-usage:+2:8:+2:43] The use of default dbfs: references is deprecated: /mnt/things/e/f/g -# ucx[dbfs-usage:+1:23:+1:42] Deprecated file system path: /mnt/things/e/f/g +# ucx[direct-filesystem-access:+1:8:+1:43] The use of direct filesystem references is deprecated: /mnt/things/e/f/g display(spark.read.csv('/mnt/things/e/f/g')) # COMMAND ---------- -# ucx[dbfs-read-from-sql-query:+0:0:+0:1024] The use of DBFS is deprecated: dbfs:/mnt/foo +# ucx[direct-filesystem-access-in-sql-query:+0:0:+0:1024] The use of direct filesystem references is deprecated: dbfs:/mnt/foo # MAGIC %sql SELECT * FROM csv.`dbfs:/mnt/foo` # COMMAND ---------- @@ -22,7 +21,7 @@ # COMMAND ---------- -# ucx[dbfs-read-from-sql-query:+0:0:+0:1024] The use of DBFS is deprecated: dbfs:/mnt/bar/e/f/g +# ucx[direct-filesystem-access-in-sql-query:+0:0:+0:1024] The use of direct filesystem references is deprecated: dbfs:/mnt/bar/e/f/g # MAGIC %sql # MAGIC SELECT * FROM # MAGIC csv.`dbfs:/mnt/bar/e/f/g` diff --git a/tests/unit/source_code/samples/functional/file-access/sql-notebook-with-embedded-python.sql b/tests/unit/source_code/samples/functional/file-access/sql-notebook-with-embedded-python.sql index 2a9361fad5..4de8f4adb5 100644 --- a/tests/unit/source_code/samples/functional/file-access/sql-notebook-with-embedded-python.sql +++ b/tests/unit/source_code/samples/functional/file-access/sql-notebook-with-embedded-python.sql @@ -3,7 +3,7 @@ -- COMMAND ---------- --- ucx[dbfs-read-from-sql-query:+0:0:+0:1024] The use of DBFS is deprecated: dbfs:/mnt/whatever +-- ucx[direct-file-system-access-in-sql-query:+0:0:+0:1024] The use of direct file system access is deprecated: dbfs:/mnt/whatever SELECT * FROM csv.`dbfs:/mnt/whatever` @@ -13,7 +13,6 @@ SELECT * FROM csv.`dbfs:/mnt/whatever` -- COMMAND ---------- -- MAGIC %python --- ucx[implicit-dbfs-usage:+2:8:+2:43] The use of default dbfs: references is deprecated: /mnt/things/e/f/g --- ucx[dbfs-usage:+1:23:+1:42] Deprecated file system path: /mnt/things/e/f/g +-- ucx[direct-file-system-access:+0:0:+0:1024] The use of direct file system 
access is deprecated: /mnt/things/e/f/g -- MAGIC display(spark.read.csv('/mnt/things/e/f/g')) From eb566364874dd103ee529c680a8e0faf831f4342 Mon Sep 17 00:00:00 2001 From: Eric Vergnaud Date: Mon, 2 Sep 2024 16:56:23 +0200 Subject: [PATCH 11/80] formatting --- tests/unit/source_code/linters/test_dfsa.py | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/tests/unit/source_code/linters/test_dfsa.py b/tests/unit/source_code/linters/test_dfsa.py index e94f1b1192..11cb5a1d08 100644 --- a/tests/unit/source_code/linters/test_dfsa.py +++ b/tests/unit/source_code/linters/test_dfsa.py @@ -39,27 +39,29 @@ def test_detects_dfsa_paths(code, expected): assert isinstance(advice, Advice) assert len(advices) == expected + @pytest.mark.parametrize( "code, expected", [ ("load_data('/dbfs/mnt/data')", 1), - ("load_data('/data')", 1), - ("load_data('/dbfs/mnt/data', '/data')", 2), + ("load_data('/data')", 1), + ("load_data('/dbfs/mnt/data', '/data')", 2), ("# load_data('/dbfs/mnt/data', '/data')", 0), ('spark.read.parquet("/mnt/foo/bar")', 1), ('spark.read.parquet("dbfs:/mnt/foo/bar")', 1), ('spark.read.parquet("dbfs://mnt/foo/bar")', 1), ('DBFS="dbfs:/mnt/foo/bar"; spark.read.parquet(DBFS)', 1), - (""" + ( + """ DBFS1="dbfs:/mnt/foo/bar1" systems=[DBFS1, "dbfs:/mnt/foo/bar2"] for system in systems: spark.read.parquet(system) """, - 2, - ), - ], - ) + 2, + ), + ], +) def test_dfsa_usage_linter(code, expected): linter = DfsaPyLinter(CurrentSessionState(), allow_spark_duplicates=True) advices = linter.lint(code) From b8e6b826f7e43ddd2ce4dfce0aafa0daa05fd7a9 Mon Sep 17 00:00:00 2001 From: Eric Vergnaud Date: Mon, 2 Sep 2024 18:46:19 +0200 Subject: [PATCH 12/80] move python parsing code under dedicated package --- src/databricks/labs/ucx/source_code/base.py | 2 +- src/databricks/labs/ucx/source_code/graph.py | 2 +- src/databricks/labs/ucx/source_code/jobs.py | 2 +- src/databricks/labs/ucx/source_code/linters/dbfs.py | 4 ++-- src/databricks/labs/ucx/source_code/linters/files.py | 2 +- src/databricks/labs/ucx/source_code/linters/imports.py | 4 ++-- src/databricks/labs/ucx/source_code/linters/pyspark.py | 4 ++-- src/databricks/labs/ucx/source_code/linters/spark_connect.py | 2 +- src/databricks/labs/ucx/source_code/linters/table_creation.py | 2 +- src/databricks/labs/ucx/source_code/notebooks/cells.py | 2 +- src/databricks/labs/ucx/source_code/notebooks/sources.py | 2 +- src/databricks/labs/ucx/source_code/python/__init__.py | 0 .../labs/ucx/source_code/{linters => python}/python_ast.py | 0 .../labs/ucx/source_code/{linters => python}/python_infer.py | 2 +- tests/integration/source_code/message_codes.py | 2 +- tests/unit/source_code/linters/test_pyspark.py | 2 +- tests/unit/source_code/linters/test_python_imports.py | 2 +- tests/unit/source_code/linters/test_spark_connect.py | 2 +- tests/unit/source_code/notebooks/test_cells.py | 2 +- tests/unit/source_code/python/__init__.py | 0 tests/unit/source_code/{linters => python}/test_python_ast.py | 4 ++-- .../unit/source_code/{linters => python}/test_python_infer.py | 4 ++-- tests/unit/source_code/test_notebook.py | 2 +- 23 files changed, 25 insertions(+), 25 deletions(-) create mode 100644 src/databricks/labs/ucx/source_code/python/__init__.py rename src/databricks/labs/ucx/source_code/{linters => python}/python_ast.py (100%) rename src/databricks/labs/ucx/source_code/{linters => python}/python_infer.py (99%) create mode 100644 tests/unit/source_code/python/__init__.py rename tests/unit/source_code/{linters => python}/test_python_ast.py (97%) 
rename tests/unit/source_code/{linters => python}/test_python_infer.py (97%) diff --git a/src/databricks/labs/ucx/source_code/base.py b/src/databricks/labs/ucx/source_code/base.py index 57acaf56c4..e00bc86847 100644 --- a/src/databricks/labs/ucx/source_code/base.py +++ b/src/databricks/labs/ucx/source_code/base.py @@ -15,7 +15,7 @@ from databricks.sdk.service.workspace import Language from databricks.labs.blueprint.paths import WorkspacePath -from databricks.labs.ucx.source_code.linters.python_ast import Tree +from databricks.labs.ucx.source_code.python.python_ast import Tree # Code mapping between LSP, PyLint, and our own diagnostics: # | LSP | PyLint | Our | diff --git a/src/databricks/labs/ucx/source_code/graph.py b/src/databricks/labs/ucx/source_code/graph.py index 2d47649839..4841fe904c 100644 --- a/src/databricks/labs/ucx/source_code/graph.py +++ b/src/databricks/labs/ucx/source_code/graph.py @@ -11,7 +11,7 @@ NodeNG, ) from databricks.labs.ucx.source_code.base import Advisory, CurrentSessionState, is_a_notebook -from databricks.labs.ucx.source_code.linters.python_ast import Tree +from databricks.labs.ucx.source_code.python.python_ast import Tree from databricks.labs.ucx.source_code.path_lookup import PathLookup logger = logging.Logger(__name__) diff --git a/src/databricks/labs/ucx/source_code/jobs.py b/src/databricks/labs/ucx/source_code/jobs.py index 4a6274f31c..feab7bc1d0 100644 --- a/src/databricks/labs/ucx/source_code/jobs.py +++ b/src/databricks/labs/ucx/source_code/jobs.py @@ -31,7 +31,7 @@ DependencyGraphWalker, ) from databricks.labs.ucx.source_code.linters.context import LinterContext -from databricks.labs.ucx.source_code.linters.python_ast import Tree +from databricks.labs.ucx.source_code.python.python_ast import Tree from databricks.labs.ucx.source_code.notebooks.sources import FileLinter from databricks.labs.ucx.source_code.path_lookup import PathLookup diff --git a/src/databricks/labs/ucx/source_code/linters/dbfs.py b/src/databricks/labs/ucx/source_code/linters/dbfs.py index 46a617fafc..06941ef97b 100644 --- a/src/databricks/labs/ucx/source_code/linters/dbfs.py +++ b/src/databricks/labs/ucx/source_code/linters/dbfs.py @@ -12,8 +12,8 @@ PythonLinter, SqlLinter, ) -from databricks.labs.ucx.source_code.linters.python_ast import Tree, TreeVisitor -from databricks.labs.ucx.source_code.linters.python_infer import InferredValue +from databricks.labs.ucx.source_code.python.python_ast import Tree, TreeVisitor +from databricks.labs.ucx.source_code.python.python_infer import InferredValue logger = logging.getLogger(__name__) diff --git a/src/databricks/labs/ucx/source_code/linters/files.py b/src/databricks/labs/ucx/source_code/linters/files.py index 31d4a7faa6..d1eb2c8a27 100644 --- a/src/databricks/labs/ucx/source_code/linters/files.py +++ b/src/databricks/labs/ucx/source_code/linters/files.py @@ -7,7 +7,7 @@ from typing import TextIO from databricks.labs.ucx.source_code.base import LocatedAdvice, CurrentSessionState, file_language, is_a_notebook -from databricks.labs.ucx.source_code.linters.python_ast import Tree +from databricks.labs.ucx.source_code.python.python_ast import Tree from databricks.labs.ucx.source_code.notebooks.loaders import NotebookLoader from databricks.labs.ucx.source_code.notebooks.sources import FileLinter from databricks.labs.ucx.source_code.path_lookup import PathLookup diff --git a/src/databricks/labs/ucx/source_code/linters/imports.py b/src/databricks/labs/ucx/source_code/linters/imports.py index 26a1258dff..0cbb79e5a1 100644 --- 
a/src/databricks/labs/ucx/source_code/linters/imports.py +++ b/src/databricks/labs/ucx/source_code/linters/imports.py @@ -18,8 +18,8 @@ ) from databricks.labs.ucx.source_code.base import Advice, Advisory, CurrentSessionState, PythonLinter -from databricks.labs.ucx.source_code.linters.python_ast import Tree, NodeBase, TreeVisitor -from databricks.labs.ucx.source_code.linters.python_infer import InferredValue +from databricks.labs.ucx.source_code.python.python_ast import Tree, NodeBase, TreeVisitor +from databricks.labs.ucx.source_code.python.python_infer import InferredValue from databricks.labs.ucx.source_code.path_lookup import PathLookup logger = logging.getLogger(__name__) diff --git a/src/databricks/labs/ucx/source_code/linters/pyspark.py b/src/databricks/labs/ucx/source_code/linters/pyspark.py index 1b3e51d67b..f30e50cb91 100644 --- a/src/databricks/labs/ucx/source_code/linters/pyspark.py +++ b/src/databricks/labs/ucx/source_code/linters/pyspark.py @@ -12,9 +12,9 @@ CurrentSessionState, PythonLinter, ) -from databricks.labs.ucx.source_code.linters.python_infer import InferredValue +from databricks.labs.ucx.source_code.python.python_infer import InferredValue from databricks.labs.ucx.source_code.queries import FromTableSqlLinter -from databricks.labs.ucx.source_code.linters.python_ast import Tree, TreeHelper +from databricks.labs.ucx.source_code.python.python_ast import Tree, TreeHelper @dataclass diff --git a/src/databricks/labs/ucx/source_code/linters/spark_connect.py b/src/databricks/labs/ucx/source_code/linters/spark_connect.py index f02e254406..1d79e0cb29 100644 --- a/src/databricks/labs/ucx/source_code/linters/spark_connect.py +++ b/src/databricks/labs/ucx/source_code/linters/spark_connect.py @@ -11,7 +11,7 @@ ) from databricks.sdk.service.compute import DataSecurityMode -from databricks.labs.ucx.source_code.linters.python_ast import Tree, TreeHelper +from databricks.labs.ucx.source_code.python.python_ast import Tree, TreeHelper @dataclass diff --git a/src/databricks/labs/ucx/source_code/linters/table_creation.py b/src/databricks/labs/ucx/source_code/linters/table_creation.py index 4720944a99..4c27865016 100644 --- a/src/databricks/labs/ucx/source_code/linters/table_creation.py +++ b/src/databricks/labs/ucx/source_code/linters/table_creation.py @@ -9,7 +9,7 @@ Advice, PythonLinter, ) -from databricks.labs.ucx.source_code.linters.python_ast import Tree, TreeHelper +from databricks.labs.ucx.source_code.python.python_ast import Tree, TreeHelper @dataclass diff --git a/src/databricks/labs/ucx/source_code/notebooks/cells.py b/src/databricks/labs/ucx/source_code/notebooks/cells.py index 5d8aa5ac56..dc0d90870a 100644 --- a/src/databricks/labs/ucx/source_code/notebooks/cells.py +++ b/src/databricks/labs/ucx/source_code/notebooks/cells.py @@ -30,7 +30,7 @@ NotebookRunCall, UnresolvedPath, ) -from databricks.labs.ucx.source_code.linters.python_ast import Tree, NodeBase +from databricks.labs.ucx.source_code.python.python_ast import Tree, NodeBase # use a specific logger for sqlglot warnings so we can disable them selectively sqlglot_logger = logging.getLogger(f"{__name__}.sqlglot") diff --git a/src/databricks/labs/ucx/source_code/notebooks/sources.py b/src/databricks/labs/ucx/source_code/notebooks/sources.py index 2c17476931..c2cffde9c4 100644 --- a/src/databricks/labs/ucx/source_code/notebooks/sources.py +++ b/src/databricks/labs/ucx/source_code/notebooks/sources.py @@ -35,7 +35,7 @@ SysPathChange, UnresolvedPath, ) -from databricks.labs.ucx.source_code.linters.python_ast import Tree, 
NodeBase +from databricks.labs.ucx.source_code.python.python_ast import Tree, NodeBase from databricks.labs.ucx.source_code.notebooks.cells import ( CellLanguage, Cell, diff --git a/src/databricks/labs/ucx/source_code/python/__init__.py b/src/databricks/labs/ucx/source_code/python/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/src/databricks/labs/ucx/source_code/linters/python_ast.py b/src/databricks/labs/ucx/source_code/python/python_ast.py similarity index 100% rename from src/databricks/labs/ucx/source_code/linters/python_ast.py rename to src/databricks/labs/ucx/source_code/python/python_ast.py diff --git a/src/databricks/labs/ucx/source_code/linters/python_infer.py b/src/databricks/labs/ucx/source_code/python/python_infer.py similarity index 99% rename from src/databricks/labs/ucx/source_code/linters/python_infer.py rename to src/databricks/labs/ucx/source_code/python/python_infer.py index 073ab362a6..2ed7929260 100644 --- a/src/databricks/labs/ucx/source_code/linters/python_infer.py +++ b/src/databricks/labs/ucx/source_code/python/python_infer.py @@ -21,7 +21,7 @@ from astroid.exceptions import InferenceError # type: ignore from databricks.labs.ucx.source_code.base import CurrentSessionState -from databricks.labs.ucx.source_code.linters.python_ast import Tree +from databricks.labs.ucx.source_code.python.python_ast import Tree logger = logging.getLogger(__name__) diff --git a/tests/integration/source_code/message_codes.py b/tests/integration/source_code/message_codes.py index f07a049cb2..f118b4e871 100644 --- a/tests/integration/source_code/message_codes.py +++ b/tests/integration/source_code/message_codes.py @@ -2,7 +2,7 @@ from databricks.labs.blueprint.wheels import ProductInfo from databricks.labs.ucx.source_code.base import Advice -from databricks.labs.ucx.source_code.linters.python_ast import Tree +from databricks.labs.ucx.source_code.python.python_ast import Tree def main(): diff --git a/tests/unit/source_code/linters/test_pyspark.py b/tests/unit/source_code/linters/test_pyspark.py index cda1997e06..e176a4ae9e 100644 --- a/tests/unit/source_code/linters/test_pyspark.py +++ b/tests/unit/source_code/linters/test_pyspark.py @@ -3,7 +3,7 @@ from astroid import Call, Const, Expr # type: ignore from databricks.labs.ucx.source_code.base import Deprecation, CurrentSessionState -from databricks.labs.ucx.source_code.linters.python_ast import Tree, TreeHelper +from databricks.labs.ucx.source_code.python.python_ast import Tree, TreeHelper from databricks.labs.ucx.source_code.linters.pyspark import TableNameMatcher, SparkSqlPyLinter from databricks.labs.ucx.source_code.queries import FromTableSqlLinter diff --git a/tests/unit/source_code/linters/test_python_imports.py b/tests/unit/source_code/linters/test_python_imports.py index 1c51edadb4..27d44a3482 100644 --- a/tests/unit/source_code/linters/test_python_imports.py +++ b/tests/unit/source_code/linters/test_python_imports.py @@ -9,7 +9,7 @@ from databricks.labs.ucx.source_code.linters.files import FileLoader from databricks.labs.ucx.source_code.linters.imports import DbutilsPyLinter, ImportSource, SysPathChange -from databricks.labs.ucx.source_code.linters.python_ast import Tree +from databricks.labs.ucx.source_code.python.python_ast import Tree from databricks.labs.ucx.source_code.notebooks.cells import PythonCodeAnalyzer diff --git a/tests/unit/source_code/linters/test_spark_connect.py b/tests/unit/source_code/linters/test_spark_connect.py index 6e4b0e75e1..68c9048c06 100644 --- 
a/tests/unit/source_code/linters/test_spark_connect.py +++ b/tests/unit/source_code/linters/test_spark_connect.py @@ -4,7 +4,7 @@ from databricks.labs.ucx.source_code.base import Failure, CurrentSessionState -from databricks.labs.ucx.source_code.linters.python_ast import Tree +from databricks.labs.ucx.source_code.python.python_ast import Tree from databricks.labs.ucx.source_code.linters.spark_connect import LoggingMatcher, SparkConnectPyLinter from databricks.sdk.service.compute import DataSecurityMode diff --git a/tests/unit/source_code/notebooks/test_cells.py b/tests/unit/source_code/notebooks/test_cells.py index 8d24bbb924..779099576c 100644 --- a/tests/unit/source_code/notebooks/test_cells.py +++ b/tests/unit/source_code/notebooks/test_cells.py @@ -8,7 +8,7 @@ from databricks.labs.ucx.source_code.base import CurrentSessionState from databricks.labs.ucx.source_code.graph import Dependency, DependencyGraph, DependencyResolver, DependencyProblem from databricks.labs.ucx.source_code.linters.files import FileLoader, ImportFileResolver -from databricks.labs.ucx.source_code.linters.python_ast import Tree +from databricks.labs.ucx.source_code.python.python_ast import Tree from databricks.labs.ucx.source_code.notebooks.cells import ( CellLanguage, PipCell, diff --git a/tests/unit/source_code/python/__init__.py b/tests/unit/source_code/python/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/tests/unit/source_code/linters/test_python_ast.py b/tests/unit/source_code/python/test_python_ast.py similarity index 97% rename from tests/unit/source_code/linters/test_python_ast.py rename to tests/unit/source_code/python/test_python_ast.py index 4b8c8e7152..c80abb5ceb 100644 --- a/tests/unit/source_code/linters/test_python_ast.py +++ b/tests/unit/source_code/python/test_python_ast.py @@ -1,8 +1,8 @@ import pytest from astroid import Assign, AstroidSyntaxError, Attribute, Call, Const, Expr, Name # type: ignore -from databricks.labs.ucx.source_code.linters.python_ast import Tree, TreeHelper -from databricks.labs.ucx.source_code.linters.python_infer import InferredValue +from databricks.labs.ucx.source_code.python.python_ast import Tree, TreeHelper +from databricks.labs.ucx.source_code.python.python_infer import InferredValue def test_extracts_root(): diff --git a/tests/unit/source_code/linters/test_python_infer.py b/tests/unit/source_code/python/test_python_infer.py similarity index 97% rename from tests/unit/source_code/linters/test_python_infer.py rename to tests/unit/source_code/python/test_python_infer.py index f838cb7154..38f3b4cb6f 100644 --- a/tests/unit/source_code/linters/test_python_infer.py +++ b/tests/unit/source_code/python/test_python_infer.py @@ -1,8 +1,8 @@ from astroid import Assign # type: ignore from databricks.labs.ucx.source_code.base import CurrentSessionState -from databricks.labs.ucx.source_code.linters.python_ast import Tree -from databricks.labs.ucx.source_code.linters.python_infer import InferredValue +from databricks.labs.ucx.source_code.python.python_ast import Tree +from databricks.labs.ucx.source_code.python.python_infer import InferredValue def test_infers_empty_list(): diff --git a/tests/unit/source_code/test_notebook.py b/tests/unit/source_code/test_notebook.py index 5b9fffa34a..9e2945a10d 100644 --- a/tests/unit/source_code/test_notebook.py +++ b/tests/unit/source_code/test_notebook.py @@ -9,7 +9,7 @@ from databricks.labs.ucx.source_code.known import KnownList from databricks.labs.ucx.source_code.linters.files import ImportFileResolver, FileLoader 
from databricks.labs.ucx.source_code.linters.imports import DbutilsPyLinter -from databricks.labs.ucx.source_code.linters.python_ast import Tree +from databricks.labs.ucx.source_code.python.python_ast import Tree from databricks.labs.ucx.source_code.notebooks.sources import Notebook from databricks.labs.ucx.source_code.notebooks.loaders import ( NotebookResolver, From 472c417d13b658fd0a052656c35088aed5f707f3 Mon Sep 17 00:00:00 2001 From: Eric Vergnaud Date: Mon, 2 Sep 2024 20:19:50 +0200 Subject: [PATCH 13/80] move PythnCodeAnalyzer to dedicated file --- .../labs/ucx/source_code/notebooks/cells.py | 265 ++--------------- .../labs/ucx/source_code/notebooks/sources.py | 2 +- .../ucx/source_code/python/python_analyzer.py | 275 ++++++++++++++++++ .../linters/test_python_imports.py | 2 +- .../unit/source_code/notebooks/test_cells.py | 2 +- 5 files changed, 300 insertions(+), 246 deletions(-) create mode 100644 src/databricks/labs/ucx/source_code/python/python_analyzer.py diff --git a/src/databricks/labs/ucx/source_code/notebooks/cells.py b/src/databricks/labs/ucx/source_code/notebooks/cells.py index dc0d90870a..58e5bd763e 100644 --- a/src/databricks/labs/ucx/source_code/notebooks/cells.py +++ b/src/databricks/labs/ucx/source_code/notebooks/cells.py @@ -5,13 +5,10 @@ import shlex from abc import ABC, abstractmethod from ast import parse as parse_python -from collections.abc import Callable, Iterable from enum import Enum from pathlib import Path -from typing import TypeVar, cast -from astroid import Call, Const, ImportFrom, Name, NodeNG, Try # type: ignore -from astroid.exceptions import AstroidSyntaxError # type: ignore +from astroid import NodeNG # type: ignore from sqlglot import parse as parse_sql, ParseError as SQLParseError from databricks.sdk.service.workspace import Language @@ -23,14 +20,12 @@ DependencyGraphContext, InheritedContext, ) -from databricks.labs.ucx.source_code.linters.imports import ( - SysPathChange, - DbutilsPyLinter, - ImportSource, - NotebookRunCall, - UnresolvedPath, +from databricks.labs.ucx.source_code.python.python_analyzer import ( + PythonCodeAnalyzer, + MagicCommand, + MagicNode, + register_magic_command_factory, ) -from databricks.labs.ucx.source_code.python.python_ast import Tree, NodeBase # use a specific logger for sqlglot warnings so we can disable them selectively sqlglot_logger = logging.getLogger(f"{__name__}.sqlglot") @@ -403,242 +398,14 @@ def wrap_with_magic(self, code: str, cell_language: CellLanguage) -> str: return "\n".join(lines) -class PythonCodeAnalyzer: - - def __init__(self, context: DependencyGraphContext, python_code: str): - self._context = context - self._python_code = python_code - - def build_graph(self) -> list[DependencyProblem]: - """Check python code for dependency-related problems. - - Returns: - A list of dependency problems; position information is relative to the python code itself. - """ - problems: list[DependencyProblem] = [] - try: - _, nodes, parse_problems = self._parse_and_extract_nodes() - problems.extend(parse_problems) - except AstroidSyntaxError as e: - logger.debug(f"Could not parse Python code: {self._python_code}", exc_info=True) - problems.append(DependencyProblem('parse-error', f"Could not parse Python code: {e}")) - return problems - for base_node in nodes: - for problem in self._build_graph_from_node(base_node): - # Astroid line numbers are 1-based. 
- problem = problem.replace( - start_line=base_node.node.lineno - 1, - start_col=base_node.node.col_offset, - end_line=(base_node.node.end_lineno or 1) - 1, - end_col=base_node.node.end_col_offset or 0, - ) - problems.append(problem) - return problems - - def build_inherited_context(self, child_path: Path) -> InheritedContext: - try: - tree, nodes, _ = self._parse_and_extract_nodes() - except AstroidSyntaxError: - logger.debug(f"Could not parse Python code: {self._python_code}", exc_info=True) - return InheritedContext(None, False) - if len(nodes) == 0: - return InheritedContext(tree, False) - context = InheritedContext(Tree.new_module(), False) - last_line = -1 - for base_node in nodes: - # append nodes - node_line = base_node.node.lineno - nodes = tree.nodes_between(last_line + 1, node_line - 1) - context.tree.append_nodes(nodes) - globs = tree.globals_between(last_line + 1, node_line - 1) - context.tree.append_globals(globs) - last_line = node_line - # process node - child_context = self._build_inherited_context_from_node(base_node, child_path) - context = context.append(child_context, True) - if context.found: - return context - line_count = tree.line_count() - if last_line < line_count: - nodes = tree.nodes_between(last_line + 1, line_count) - context.tree.append_nodes(nodes) - globs = tree.globals_between(last_line + 1, line_count) - context.tree.append_globals(globs) - return context - - def _parse_and_extract_nodes(self) -> tuple[Tree, list[NodeBase], Iterable[DependencyProblem]]: - problems: list[DependencyProblem] = [] - tree = Tree.normalize_and_parse(self._python_code) - syspath_changes = SysPathChange.extract_from_tree(self._context.session_state, tree) - run_calls = DbutilsPyLinter.list_dbutils_notebook_run_calls(tree) - import_sources: list[ImportSource] - import_problems: list[DependencyProblem] - import_sources, import_problems = ImportSource.extract_from_tree(tree, DependencyProblem.from_node) - problems.extend(import_problems) - magic_lines, command_problems = MagicLine.extract_from_tree(tree, DependencyProblem.from_node) - problems.extend(command_problems) - # need to evaluate things in intertwined sequence so concat and sort them - nodes: list[NodeBase] = cast(list[NodeBase], syspath_changes + run_calls + import_sources + magic_lines) - nodes = sorted(nodes, key=lambda node: (node.node.lineno, node.node.col_offset)) - return tree, nodes, problems - - def _build_graph_from_node(self, base_node: NodeBase) -> Iterable[DependencyProblem]: - if isinstance(base_node, SysPathChange): - yield from self._mutate_path_lookup(base_node) - elif isinstance(base_node, NotebookRunCall): - yield from self._register_notebook(base_node) - elif isinstance(base_node, ImportSource): - yield from self._register_import(base_node) - elif isinstance(base_node, MagicLine): - yield from base_node.build_dependency_graph(self._context.parent) - else: - logger.warning(f"Can't build graph for node {NodeBase.__name__} of type {type(base_node).__name__}") - - def _build_inherited_context_from_node(self, base_node: NodeBase, child_path: Path) -> InheritedContext: - if isinstance(base_node, SysPathChange): - self._mutate_path_lookup(base_node) - return InheritedContext(None, False) - if isinstance(base_node, ImportSource): - # nothing to do, Astroid takes care of imports - return InheritedContext(None, False) - if isinstance(base_node, NotebookRunCall): - # nothing to do, dbutils.notebook.run uses a dedicated context - return InheritedContext(None, False) - if isinstance(base_node, MagicLine): - return 
base_node.build_inherited_context(self._context, child_path) - logger.warning(f"Can't build inherited context for node {NodeBase.__name__} of type {type(base_node).__name__}") - return InheritedContext(None, False) - - def _register_import(self, base_node: ImportSource) -> Iterable[DependencyProblem]: - prefix = "" - if isinstance(base_node.node, ImportFrom) and base_node.node.level is not None: - prefix = "." * base_node.node.level - name = base_node.name or "" - problems = self._context.parent.register_import(prefix + name) - for problem in problems: - prob = self._filter_import_problem_in_try_except(problem, base_node) - if prob is not None: - yield prob - - @classmethod - def _filter_import_problem_in_try_except( - cls, problem: DependencyProblem, base_node: ImportSource - ) -> DependencyProblem | None: - if problem.code != 'import-not-found': - return problem - # is base_node in a try-except clause ? - node = base_node.node.parent - while node and not isinstance(node, Try): - node = node.parent - if cls._is_try_except_import_error(node): - return None - return problem - - @classmethod - def _is_try_except_import_error(cls, node: Try | None) -> bool: - if not isinstance(node, Try): - return False - for handler in node.handlers: - if isinstance(handler.type, Name): - if handler.type.name == "ImportError": - return True - return False - - def _register_notebook(self, base_node: NotebookRunCall) -> Iterable[DependencyProblem]: - has_unresolved, paths = base_node.get_notebook_paths(self._context.session_state) - if has_unresolved: - yield DependencyProblem( - 'dependency-cannot-compute-value', - f"Can't check dependency from {base_node.node.as_string()} because the expression cannot be computed", - ) - for path in paths: - # notebooks ran via dbutils.notebook.run do not inherit or propagate context - yield from self._context.parent.register_notebook(Path(path), False) - - def _mutate_path_lookup(self, change: SysPathChange) -> Iterable[DependencyProblem]: - if isinstance(change, UnresolvedPath): - yield DependencyProblem( - 'sys-path-cannot-compute-value', - f"Can't update sys.path from {change.node.as_string()} because the expression cannot be computed", - ) - return - change.apply_to(self._context.path_lookup) - - -T = TypeVar("T") - - -class MagicLine(NodeBase): - - @classmethod - def extract_from_tree( - cls, tree: Tree, problem_factory: Callable[[str, str, NodeNG], T] - ) -> tuple[list[MagicLine], list[T]]: - problems: list[T] = [] - commands: list[MagicLine] = [] - try: - nodes = tree.locate(Call, [("magic_command", Name)]) - for command in cls._make_commands_for_magic_command_call_nodes(nodes): - commands.append(command) - except Exception as e: # pylint: disable=broad-except - logger.debug(f"Internal error while checking magic commands in tree: {tree.root}", exc_info=True) - problem = problem_factory('internal-error', f"While checking magic commands: {e}", tree.root) - problems.append(problem) - return commands, problems +class RunCommand(MagicCommand): @classmethod - def _make_commands_for_magic_command_call_nodes(cls, nodes: list[Call]): - for node in nodes: - arg = node.args[0] - if isinstance(arg, Const): - yield MagicLine(node, arg.value) - - def __init__(self, node: NodeNG, command: bytes): - super().__init__(node) - self._command = command.decode() - - def as_magic(self) -> MagicCommand | None: - if self._command.startswith("%pip") or self._command.startswith("!pip"): - return PipCommand(self.node, self._command) - if self._command.startswith("%run"): - return 
RunCommand(self.node, self._command) + def factory(cls, command: str, node: NodeNG) -> MagicCommand | None: + if command.startswith("%run"): + return RunCommand(node, command) return None - def build_dependency_graph(self, parent: DependencyGraph) -> list[DependencyProblem]: - magic = self.as_magic() - if magic is not None: - return magic.build_dependency_graph(parent) - problem = DependencyProblem.from_node( - code='unsupported-magic-line', message=f"magic line '{self._command}' is not supported yet", node=self.node - ) - return [problem] - - def build_inherited_context(self, context: DependencyGraphContext, child_path: Path) -> InheritedContext: - magic = self.as_magic() - if magic is not None: - return magic.build_inherited_context(context, child_path) - return InheritedContext(None, False) - - -class MagicNode(NodeNG): - pass - - -class MagicCommand(ABC): - - def __init__(self, node: NodeNG, code: str): - self._node = node - self._code = code - - @abstractmethod - def build_dependency_graph(self, parent: DependencyGraph) -> list[DependencyProblem]: ... - - def build_inherited_context(self, _context: DependencyGraphContext, _child_path: Path) -> InheritedContext: - return InheritedContext(None, False) - - -class RunCommand(MagicCommand): - def build_dependency_graph(self, parent: DependencyGraph) -> list[DependencyProblem]: path = self.notebook_path if path is not None: @@ -676,8 +443,17 @@ def build_inherited_context(self, context: DependencyGraphContext, child_path: P return container.build_inherited_context(context.parent, child_path) +register_magic_command_factory(RunCommand.factory) + + class PipCommand(MagicCommand): + @classmethod + def factory(cls, command: str, node: NodeNG) -> MagicCommand | None: + if command.startswith("%pip") or command.startswith("!pip"): + return PipCommand(node, command) + return None + def build_dependency_graph(self, parent: DependencyGraph) -> list[DependencyProblem]: argv = self._split(self._code) if len(argv) == 1: @@ -716,3 +492,6 @@ def _split(cls, code: str) -> list[str]: code = code.replace("\\\n", " ") lexer = shlex.split(code, posix=True) return list(lexer) + + +register_magic_command_factory(PipCommand.factory) diff --git a/src/databricks/labs/ucx/source_code/notebooks/sources.py b/src/databricks/labs/ucx/source_code/notebooks/sources.py index c2cffde9c4..ab7a51cf3c 100644 --- a/src/databricks/labs/ucx/source_code/notebooks/sources.py +++ b/src/databricks/labs/ucx/source_code/notebooks/sources.py @@ -35,6 +35,7 @@ SysPathChange, UnresolvedPath, ) +from databricks.labs.ucx.source_code.python.python_analyzer import MagicLine from databricks.labs.ucx.source_code.python.python_ast import Tree, NodeBase from databricks.labs.ucx.source_code.notebooks.cells import ( CellLanguage, @@ -43,7 +44,6 @@ NOTEBOOK_HEADER, RunCell, PythonCell, - MagicLine, RunCommand, ) from databricks.labs.ucx.source_code.path_lookup import PathLookup diff --git a/src/databricks/labs/ucx/source_code/python/python_analyzer.py b/src/databricks/labs/ucx/source_code/python/python_analyzer.py new file mode 100644 index 0000000000..05d27709f9 --- /dev/null +++ b/src/databricks/labs/ucx/source_code/python/python_analyzer.py @@ -0,0 +1,275 @@ +from __future__ import annotations + +import logging +from abc import ABC, abstractmethod +from collections.abc import Iterable, Callable +from pathlib import Path +from typing import cast, TypeVar + +from astroid import AstroidSyntaxError, Call, Const, ImportFrom, NodeNG, Try, Name # type: ignore + +from 
databricks.labs.ucx.source_code.graph import ( + DependencyGraphContext, + DependencyProblem, + InheritedContext, + DependencyGraph, +) +from databricks.labs.ucx.source_code.linters.imports import ( + SysPathChange, + DbutilsPyLinter, + ImportSource, + NotebookRunCall, + UnresolvedPath, +) +from databricks.labs.ucx.source_code.python.python_ast import Tree, NodeBase + +logger = logging.getLogger(__name__) + + +class PythonCodeAnalyzer: + + def __init__(self, context: DependencyGraphContext, python_code: str): + self._context = context + self._python_code = python_code + + def build_graph(self) -> list[DependencyProblem]: + """Check python code for dependency-related problems. + + Returns: + A list of dependency problems; position information is relative to the python code itself. + """ + problems: list[DependencyProblem] = [] + try: + _, nodes, parse_problems = self._parse_and_extract_nodes() + problems.extend(parse_problems) + except AstroidSyntaxError as e: + logger.debug(f"Could not parse Python code: {self._python_code}", exc_info=True) + problems.append(DependencyProblem('parse-error', f"Could not parse Python code: {e}")) + return problems + for base_node in nodes: + for problem in self._build_graph_from_node(base_node): + # Astroid line numbers are 1-based. + problem = problem.replace( + start_line=base_node.node.lineno - 1, + start_col=base_node.node.col_offset, + end_line=(base_node.node.end_lineno or 1) - 1, + end_col=base_node.node.end_col_offset or 0, + ) + problems.append(problem) + return problems + + def build_inherited_context(self, child_path: Path) -> InheritedContext: + try: + tree, nodes, _ = self._parse_and_extract_nodes() + except AstroidSyntaxError: + logger.debug(f"Could not parse Python code: {self._python_code}", exc_info=True) + return InheritedContext(None, False) + if len(nodes) == 0: + return InheritedContext(tree, False) + context = InheritedContext(Tree.new_module(), False) + last_line = -1 + for base_node in nodes: + # append nodes + node_line = base_node.node.lineno + nodes = tree.nodes_between(last_line + 1, node_line - 1) + context.tree.append_nodes(nodes) + globs = tree.globals_between(last_line + 1, node_line - 1) + context.tree.append_globals(globs) + last_line = node_line + # process node + child_context = self._build_inherited_context_from_node(base_node, child_path) + context = context.append(child_context, True) + if context.found: + return context + line_count = tree.line_count() + if last_line < line_count: + nodes = tree.nodes_between(last_line + 1, line_count) + context.tree.append_nodes(nodes) + globs = tree.globals_between(last_line + 1, line_count) + context.tree.append_globals(globs) + return context + + def _build_full_tree(self, inherited_context: Tree | None) -> Tree: + full_tree = Tree.new_module() + if inherited_context is not None: + full_tree = full_tree.append_tree(inherited_context) + full_tree = full_tree.renumber(-1) + tree = Tree.normalize_and_parse(self._python_code) + return full_tree.append_tree(tree) + + def _parse_and_extract_nodes(self) -> tuple[Tree, list[NodeBase], Iterable[DependencyProblem]]: + problems: list[DependencyProblem] = [] + tree = Tree.normalize_and_parse(self._python_code) + syspath_changes = SysPathChange.extract_from_tree(self._context.session_state, tree) + run_calls = DbutilsPyLinter.list_dbutils_notebook_run_calls(tree) + import_sources: list[ImportSource] + import_problems: list[DependencyProblem] + import_sources, import_problems = ImportSource.extract_from_tree(tree, DependencyProblem.from_node) + 
problems.extend(import_problems) + magic_lines, command_problems = MagicLine.extract_from_tree(tree, DependencyProblem.from_node) + problems.extend(command_problems) + # need to evaluate things in intertwined sequence so concat and sort them + nodes: list[NodeBase] = cast(list[NodeBase], syspath_changes + run_calls + import_sources + magic_lines) + nodes = sorted(nodes, key=lambda node: (node.node.lineno, node.node.col_offset)) + return tree, nodes, problems + + def _build_graph_from_node(self, base_node: NodeBase) -> Iterable[DependencyProblem]: + if isinstance(base_node, SysPathChange): + yield from self._mutate_path_lookup(base_node) + elif isinstance(base_node, NotebookRunCall): + yield from self._register_notebook(base_node) + elif isinstance(base_node, ImportSource): + yield from self._register_import(base_node) + elif isinstance(base_node, MagicLine): + yield from base_node.build_dependency_graph(self._context.parent) + else: + logger.warning(f"Can't build graph for node {NodeBase.__name__} of type {type(base_node).__name__}") + + def _build_inherited_context_from_node(self, base_node: NodeBase, child_path: Path) -> InheritedContext: + if isinstance(base_node, SysPathChange): + self._mutate_path_lookup(base_node) + return InheritedContext(None, False) + if isinstance(base_node, ImportSource): + # nothing to do, Astroid takes care of imports + return InheritedContext(None, False) + if isinstance(base_node, NotebookRunCall): + # nothing to do, dbutils.notebook.run uses a dedicated context + return InheritedContext(None, False) + if isinstance(base_node, MagicLine): + return base_node.build_inherited_context(self._context, child_path) + logger.warning(f"Can't build inherited context for node {NodeBase.__name__} of type {type(base_node).__name__}") + return InheritedContext(None, False) + + def _register_import(self, base_node: ImportSource) -> Iterable[DependencyProblem]: + prefix = "" + if isinstance(base_node.node, ImportFrom) and base_node.node.level is not None: + prefix = "." * base_node.node.level + name = base_node.name or "" + problems = self._context.parent.register_import(prefix + name) + for problem in problems: + prob = self._filter_import_problem_in_try_except(problem, base_node) + if prob is not None: + yield prob + + @classmethod + def _filter_import_problem_in_try_except( + cls, problem: DependencyProblem, base_node: ImportSource + ) -> DependencyProblem | None: + if problem.code != 'import-not-found': + return problem + # is base_node in a try-except clause ? 
+ node = base_node.node.parent + while node and not isinstance(node, Try): + node = node.parent + if cls._is_try_except_import_error(node): + return None + return problem + + @classmethod + def _is_try_except_import_error(cls, node: Try | None) -> bool: + if not isinstance(node, Try): + return False + for handler in node.handlers: + if isinstance(handler.type, Name): + if handler.type.name == "ImportError": + return True + return False + + def _register_notebook(self, base_node: NotebookRunCall) -> Iterable[DependencyProblem]: + has_unresolved, paths = base_node.get_notebook_paths(self._context.session_state) + if has_unresolved: + yield DependencyProblem( + 'dependency-cannot-compute-value', + f"Can't check dependency from {base_node.node.as_string()} because the expression cannot be computed", + ) + for path in paths: + # notebooks ran via dbutils.notebook.run do not inherit or propagate context + yield from self._context.parent.register_notebook(Path(path), False) + + def _mutate_path_lookup(self, change: SysPathChange) -> Iterable[DependencyProblem]: + if isinstance(change, UnresolvedPath): + yield DependencyProblem( + 'sys-path-cannot-compute-value', + f"Can't update sys.path from {change.node.as_string()} because the expression cannot be computed", + ) + return + change.apply_to(self._context.path_lookup) + + +T = TypeVar("T") + + +class MagicLine(NodeBase): + + @classmethod + def extract_from_tree( + cls, tree: Tree, problem_factory: Callable[[str, str, NodeNG], T] + ) -> tuple[list[MagicLine], list[T]]: + problems: list[T] = [] + commands: list[MagicLine] = [] + try: + nodes = tree.locate(Call, [("magic_command", Name)]) + for command in cls._make_commands_for_magic_command_call_nodes(nodes): + commands.append(command) + except Exception as e: # pylint: disable=broad-except + logger.debug(f"Internal error while checking magic commands in tree: {tree.root}", exc_info=True) + problem = problem_factory('internal-error', f"While checking magic commands: {e}", tree.root) + problems.append(problem) + return commands, problems + + @classmethod + def _make_commands_for_magic_command_call_nodes(cls, nodes: list[Call]): + for node in nodes: + arg = node.args[0] + if isinstance(arg, Const): + yield MagicLine(node, arg.value) + + def __init__(self, node: NodeNG, command: bytes): + super().__init__(node) + self._command = command.decode() + + def as_magic(self) -> MagicCommand | None: + for factory in _FACTORIES: + command = factory(self._command, self.node) + if command is not None: + return command + return None + + def build_dependency_graph(self, parent: DependencyGraph) -> list[DependencyProblem]: + magic = self.as_magic() + if magic is not None: + return magic.build_dependency_graph(parent) + problem = DependencyProblem.from_node( + code='unsupported-magic-line', message=f"magic line '{self._command}' is not supported yet", node=self.node + ) + return [problem] + + def build_inherited_context(self, context: DependencyGraphContext, child_path: Path) -> InheritedContext: + magic = self.as_magic() + if magic is not None: + return magic.build_inherited_context(context, child_path) + return InheritedContext(None, False) + + +class MagicNode(NodeNG): + pass + + +class MagicCommand(ABC): + + def __init__(self, node: NodeNG, code: str): + self._node = node + self._code = code + + @abstractmethod + def build_dependency_graph(self, parent: DependencyGraph) -> list[DependencyProblem]: ... 
+ + def build_inherited_context(self, _context: DependencyGraphContext, _child_path: Path) -> InheritedContext: + return InheritedContext(None, False) + + +_FACTORIES: list[Callable[[str, NodeNG], MagicCommand | None]] = [] + + +def register_magic_command_factory(factory: Callable[[str, NodeNG], MagicCommand | None]): + _FACTORIES.append(factory) diff --git a/tests/unit/source_code/linters/test_python_imports.py b/tests/unit/source_code/linters/test_python_imports.py index 27d44a3482..c40eedff84 100644 --- a/tests/unit/source_code/linters/test_python_imports.py +++ b/tests/unit/source_code/linters/test_python_imports.py @@ -9,8 +9,8 @@ from databricks.labs.ucx.source_code.linters.files import FileLoader from databricks.labs.ucx.source_code.linters.imports import DbutilsPyLinter, ImportSource, SysPathChange +from databricks.labs.ucx.source_code.python.python_analyzer import PythonCodeAnalyzer from databricks.labs.ucx.source_code.python.python_ast import Tree -from databricks.labs.ucx.source_code.notebooks.cells import PythonCodeAnalyzer def test_linter_returns_empty_list_of_dbutils_notebook_run_calls(): diff --git a/tests/unit/source_code/notebooks/test_cells.py b/tests/unit/source_code/notebooks/test_cells.py index 779099576c..25f794f2eb 100644 --- a/tests/unit/source_code/notebooks/test_cells.py +++ b/tests/unit/source_code/notebooks/test_cells.py @@ -8,6 +8,7 @@ from databricks.labs.ucx.source_code.base import CurrentSessionState from databricks.labs.ucx.source_code.graph import Dependency, DependencyGraph, DependencyResolver, DependencyProblem from databricks.labs.ucx.source_code.linters.files import FileLoader, ImportFileResolver +from databricks.labs.ucx.source_code.python.python_analyzer import MagicLine from databricks.labs.ucx.source_code.python.python_ast import Tree from databricks.labs.ucx.source_code.notebooks.cells import ( CellLanguage, @@ -16,7 +17,6 @@ PipCommand, PythonCodeAnalyzer, ) -from databricks.labs.ucx.source_code.notebooks.cells import MagicLine from databricks.labs.ucx.source_code.notebooks.loaders import ( NotebookResolver, NotebookLoader, From 3e256be0a48c78f6aaba80e8df5f84df2f359a6f Mon Sep 17 00:00:00 2001 From: Eric Vergnaud Date: Tue, 3 Sep 2024 11:07:42 +0200 Subject: [PATCH 14/80] fix merge issues --- src/databricks/labs/ucx/source_code/linters/dfsa.py | 4 ++-- src/databricks/labs/ucx/source_code/linters/pyspark.py | 1 + 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/src/databricks/labs/ucx/source_code/linters/dfsa.py b/src/databricks/labs/ucx/source_code/linters/dfsa.py index a1c131c7f8..a95ffeae97 100644 --- a/src/databricks/labs/ucx/source_code/linters/dfsa.py +++ b/src/databricks/labs/ucx/source_code/linters/dfsa.py @@ -14,8 +14,8 @@ PythonLinter, SqlLinter, ) -from databricks.labs.ucx.source_code.linters.python_ast import Tree, TreeVisitor -from databricks.labs.ucx.source_code.linters.python_infer import InferredValue +from databricks.labs.ucx.source_code.python.python_ast import Tree, TreeVisitor +from databricks.labs.ucx.source_code.python.python_infer import InferredValue logger = logging.getLogger(__name__) diff --git a/src/databricks/labs/ucx/source_code/linters/pyspark.py b/src/databricks/labs/ucx/source_code/linters/pyspark.py index feb8debe1e..a537757add 100644 --- a/src/databricks/labs/ucx/source_code/linters/pyspark.py +++ b/src/databricks/labs/ucx/source_code/linters/pyspark.py @@ -13,6 +13,7 @@ CurrentSessionState, PythonLinter, ) +from databricks.labs.ucx.source_code.linters.dfsa import DFSA_PATTERNS from 
databricks.labs.ucx.source_code.python.python_infer import InferredValue from databricks.labs.ucx.source_code.queries import FromTableSqlLinter from databricks.labs.ucx.source_code.python.python_ast import Tree, TreeHelper From d1d251f7be6410ea8b647fa71bc4af3c2c29949f Mon Sep 17 00:00:00 2001 From: Eric Vergnaud Date: Mon, 2 Sep 2024 18:11:47 +0200 Subject: [PATCH 15/80] merge from stale branch --- src/databricks/labs/ucx/source_code/base.py | 27 +++++++++++++++++++ .../labs/ucx/source_code/linters/dfsa.py | 9 +------ tests/unit/source_code/test_dfsa_crawler.py | 14 ++++++++++ 3 files changed, 42 insertions(+), 8 deletions(-) create mode 100644 tests/unit/source_code/test_dfsa_crawler.py diff --git a/src/databricks/labs/ucx/source_code/base.py b/src/databricks/labs/ucx/source_code/base.py index e00bc86847..2c93ea8d13 100644 --- a/src/databricks/labs/ucx/source_code/base.py +++ b/src/databricks/labs/ucx/source_code/base.py @@ -334,3 +334,30 @@ def is_a_notebook(path: Path, content: str | None = None) -> bool: logger.warning(f"Could not read file {path}") return False return file_header == magic_header + + +@dataclass +class DFSA: + """A DFSA is a record describing a Direct File System Access""" + + UNKNOWN = "unknown" + + source_type: str + source_id: str + path: str + is_read: bool + is_write: bool + + @property + def key(self) -> str: + return f"{self.source_type}.{self.source_id}.{self.path}".lower() # TODO for now + + @property + def safe_sql_key(self) -> str: + return escape_sql_identifier(self.key) + + def __hash__(self) -> int: + return hash(self.key) + + def __eq__(self, other) -> bool: + return isinstance(other, DFSA) and self.key == other.key diff --git a/src/databricks/labs/ucx/source_code/linters/dfsa.py b/src/databricks/labs/ucx/source_code/linters/dfsa.py index a95ffeae97..4fe867b37a 100644 --- a/src/databricks/labs/ucx/source_code/linters/dfsa.py +++ b/src/databricks/labs/ucx/source_code/linters/dfsa.py @@ -12,7 +12,7 @@ Deprecation, CurrentSessionState, PythonLinter, - SqlLinter, + SqlLinter, DFSA, ) from databricks.labs.ucx.source_code.python.python_ast import Tree, TreeVisitor from databricks.labs.ucx.source_code.python.python_infer import InferredValue @@ -56,13 +56,6 @@ def _matches_allowed_root(self, value: str): ] -@dataclass -class DFSA: - """A DFSA is a record describing a Direct File System Access""" - - path: str - - @dataclass class DFSANode: dfsa: DFSA diff --git a/tests/unit/source_code/test_dfsa_crawler.py b/tests/unit/source_code/test_dfsa_crawler.py new file mode 100644 index 0000000000..e7095eb0e1 --- /dev/null +++ b/tests/unit/source_code/test_dfsa_crawler.py @@ -0,0 +1,14 @@ +from databricks.labs.lsql.backends import MockBackend + +from databricks.labs.ucx.source_code.base import DFSA + + +def test_crawler_appends_dfsas(): + backend = MockBackend() + crawler = DfsaCrawler(backend, "schema") + for path in ("a", "b", "c"): + dfsa = DFSA(source_type="SOURCE", source_id="ID", path=path, is_read=False, is_write=False) + crawler.append(dfsa) + rows = backend.rows_written_for(crawler.full_name, "append") + assert len(rows) == 3 + From 13ea1e6d084bdf8dd4891885236563c46c58d46f Mon Sep 17 00:00:00 2001 From: Eric Vergnaud Date: Mon, 2 Sep 2024 18:00:21 +0200 Subject: [PATCH 16/80] more tests --- .../functional/file-access/create_cloud_files.sql | 13 +++++++++++++ .../functional/file-access/create_location.py | 13 +++++++++++++ .../functional/file-access/create_location.sql | 2 ++ .../functional/file-access/select_format.sql | 2 ++ 
.../functional/file-access/select_read_files.sql | 2 ++ .../file-access/spark_read_format_load.py | 3 +++ tests/unit/source_code/test_dfsa.py | 14 ++++++++++++++ 7 files changed, 49 insertions(+) create mode 100644 tests/unit/source_code/samples/functional/file-access/create_cloud_files.sql create mode 100644 tests/unit/source_code/samples/functional/file-access/create_location.py create mode 100644 tests/unit/source_code/samples/functional/file-access/create_location.sql create mode 100644 tests/unit/source_code/samples/functional/file-access/select_format.sql create mode 100644 tests/unit/source_code/samples/functional/file-access/select_read_files.sql create mode 100644 tests/unit/source_code/samples/functional/file-access/spark_read_format_load.py create mode 100644 tests/unit/source_code/test_dfsa.py diff --git a/tests/unit/source_code/samples/functional/file-access/create_cloud_files.sql b/tests/unit/source_code/samples/functional/file-access/create_cloud_files.sql new file mode 100644 index 0000000000..0f19edf368 --- /dev/null +++ b/tests/unit/source_code/samples/functional/file-access/create_cloud_files.sql @@ -0,0 +1,13 @@ +-- Databricks notebook source +CREATE OR REFRESH STREAMING LIVE TABLE pcmd_stream_bronze +COMMENT "PCMD Stream - Bronze" +AS SELECT * + FROM cloud_files( + "s3a://db-gtm-industry-solutions/data/CME/telco/PCMD", + "json", + map( + "header", "false", + "mergeSchema", "true", + "cloudFiles.inferColumnTypes", "true" + ) + ) diff --git a/tests/unit/source_code/samples/functional/file-access/create_location.py b/tests/unit/source_code/samples/functional/file-access/create_location.py new file mode 100644 index 0000000000..940640f7ae --- /dev/null +++ b/tests/unit/source_code/samples/functional/file-access/create_location.py @@ -0,0 +1,13 @@ +# Databricks notebook source + +a = 12 + +# COMMAND ---------- + +# MAGIC %sql +# MAGIC CREATE TABLE hive_metastore.indices_historical_data.sp_500 LOCATION 's3a://db-gtm-industry-solutions/data/fsi/capm/sp_500/' + +# COMMAND ---------- + +# MAGIC %sql +# MAGIC CREATE TABLE hive_metastore.indices_historical_data.sp_550 LOCATION 's3a://db-gtm-industry-solutions/data/fsi/capm/sp_550/' diff --git a/tests/unit/source_code/samples/functional/file-access/create_location.sql b/tests/unit/source_code/samples/functional/file-access/create_location.sql new file mode 100644 index 0000000000..4f90fd669d --- /dev/null +++ b/tests/unit/source_code/samples/functional/file-access/create_location.sql @@ -0,0 +1,2 @@ +-- Databricks notebook source +CREATE TABLE hive_metastore.indices_historical_data.sp_500 LOCATION 's3a://db-gtm-industry-solutions/data/fsi/capm/sp_500/' diff --git a/tests/unit/source_code/samples/functional/file-access/select_format.sql b/tests/unit/source_code/samples/functional/file-access/select_format.sql new file mode 100644 index 0000000000..76d91894f2 --- /dev/null +++ b/tests/unit/source_code/samples/functional/file-access/select_format.sql @@ -0,0 +1,2 @@ +-- Databricks notebook source +SELECT * FROM parquet.`hdfs://examples/src/main/resources/users.parquet` diff --git a/tests/unit/source_code/samples/functional/file-access/select_read_files.sql b/tests/unit/source_code/samples/functional/file-access/select_read_files.sql new file mode 100644 index 0000000000..e326eec5f5 --- /dev/null +++ b/tests/unit/source_code/samples/functional/file-access/select_read_files.sql @@ -0,0 +1,2 @@ +-- Databricks notebook source +SELECT * FROM read_files("s3a://db-gtm-industry-solutions/data/fsi/capm/sp_500/file.csv") LIMIT 10 diff --git 
a/tests/unit/source_code/samples/functional/file-access/spark_read_format_load.py b/tests/unit/source_code/samples/functional/file-access/spark_read_format_load.py new file mode 100644 index 0000000000..16e0c9b5a7 --- /dev/null +++ b/tests/unit/source_code/samples/functional/file-access/spark_read_format_load.py @@ -0,0 +1,3 @@ +# Databricks notebook source +# ucx[direct-filesystem-access:+1:0:+1:61] The use of direct filesystem references is deprecated: s3a://prefix/some_file.csv +spark.read.format("delta").load("s3a://prefix/some_file.csv") diff --git a/tests/unit/source_code/test_dfsa.py b/tests/unit/source_code/test_dfsa.py new file mode 100644 index 0000000000..18caa46ff1 --- /dev/null +++ b/tests/unit/source_code/test_dfsa.py @@ -0,0 +1,14 @@ +from databricks.labs.lsql.backends import MockBackend + +from databricks.labs.ucx.source_code.linters.dfsa import DFSA + + +def test_crawler_appends_dfsas(): + backend = MockBackend() + crawler = DfsaCrawler(backend, "schema") + for path in ("a", "b", "c"): + dfsa = DFSA(source_type="SOURCE", source_id="ID", path=path, is_read=False, is_write=False) + crawler.append(dfsa) + rows = backend.rows_written_for(crawler.full_name, "append") + assert len(rows) == 3 + From 93d194f0aa3a040f3a6cd877a7d38d017e9dff6d Mon Sep 17 00:00:00 2001 From: Eric Vergnaud Date: Mon, 2 Sep 2024 18:23:38 +0200 Subject: [PATCH 17/80] merge from stale branch --- src/databricks/labs/ucx/source_code/base.py | 2 ++ .../labs/ucx/source_code/dfsa_crawler.py | 19 +++++++++++++++++++ .../labs/ucx/source_code/linters/dfsa.py | 3 ++- tests/unit/source_code/test_dfsa.py | 14 -------------- tests/unit/source_code/test_dfsa_crawler.py | 2 +- 5 files changed, 24 insertions(+), 16 deletions(-) create mode 100644 src/databricks/labs/ucx/source_code/dfsa_crawler.py delete mode 100644 tests/unit/source_code/test_dfsa.py diff --git a/src/databricks/labs/ucx/source_code/base.py b/src/databricks/labs/ucx/source_code/base.py index 2c93ea8d13..775a72a1c6 100644 --- a/src/databricks/labs/ucx/source_code/base.py +++ b/src/databricks/labs/ucx/source_code/base.py @@ -15,6 +15,8 @@ from databricks.sdk.service.workspace import Language from databricks.labs.blueprint.paths import WorkspacePath + +from databricks.labs.ucx.framework.utils import escape_sql_identifier from databricks.labs.ucx.source_code.python.python_ast import Tree # Code mapping between LSP, PyLint, and our own diagnostics: diff --git a/src/databricks/labs/ucx/source_code/dfsa_crawler.py b/src/databricks/labs/ucx/source_code/dfsa_crawler.py new file mode 100644 index 0000000000..8a9a1d879c --- /dev/null +++ b/src/databricks/labs/ucx/source_code/dfsa_crawler.py @@ -0,0 +1,19 @@ +from databricks.labs.ucx.framework.crawlers import CrawlerBase +from databricks.labs.ucx.source_code.base import DFSA +from databricks.labs.lsql.backends import SqlBackend + + +class DfsaCrawler(CrawlerBase): + + def __init__(self, backend: SqlBackend, schema: str): + """ + Initializes a DFSACrawler instance. + + Args: + backend (SqlBackend): The SQL Execution Backend abstraction (either REST API or Spark) + schema: The schema name for the inventory persistence. 
+ """ + super().__init__(backend, "hive_metastore", schema, "direct_file_system_access", DFSA) + + def append(self, dfsa: DFSA): + self._append_records([dfsa]) diff --git a/src/databricks/labs/ucx/source_code/linters/dfsa.py b/src/databricks/labs/ucx/source_code/linters/dfsa.py index 4fe867b37a..19e3f5fd70 100644 --- a/src/databricks/labs/ucx/source_code/linters/dfsa.py +++ b/src/databricks/labs/ucx/source_code/linters/dfsa.py @@ -12,7 +12,8 @@ Deprecation, CurrentSessionState, PythonLinter, - SqlLinter, DFSA, + SqlLinter, + DFSA, ) from databricks.labs.ucx.source_code.python.python_ast import Tree, TreeVisitor from databricks.labs.ucx.source_code.python.python_infer import InferredValue diff --git a/tests/unit/source_code/test_dfsa.py b/tests/unit/source_code/test_dfsa.py deleted file mode 100644 index 18caa46ff1..0000000000 --- a/tests/unit/source_code/test_dfsa.py +++ /dev/null @@ -1,14 +0,0 @@ -from databricks.labs.lsql.backends import MockBackend - -from databricks.labs.ucx.source_code.linters.dfsa import DFSA - - -def test_crawler_appends_dfsas(): - backend = MockBackend() - crawler = DfsaCrawler(backend, "schema") - for path in ("a", "b", "c"): - dfsa = DFSA(source_type="SOURCE", source_id="ID", path=path, is_read=False, is_write=False) - crawler.append(dfsa) - rows = backend.rows_written_for(crawler.full_name, "append") - assert len(rows) == 3 - diff --git a/tests/unit/source_code/test_dfsa_crawler.py b/tests/unit/source_code/test_dfsa_crawler.py index e7095eb0e1..93652c4aee 100644 --- a/tests/unit/source_code/test_dfsa_crawler.py +++ b/tests/unit/source_code/test_dfsa_crawler.py @@ -1,6 +1,7 @@ from databricks.labs.lsql.backends import MockBackend from databricks.labs.ucx.source_code.base import DFSA +from databricks.labs.ucx.source_code.dfsa_crawler import DfsaCrawler def test_crawler_appends_dfsas(): @@ -11,4 +12,3 @@ def test_crawler_appends_dfsas(): crawler.append(dfsa) rows = backend.rows_written_for(crawler.full_name, "append") assert len(rows) == 3 - From f4bc0b888ec50a6f5bfb80bd087d8740c5fbd1da Mon Sep 17 00:00:00 2001 From: Eric Vergnaud Date: Tue, 3 Sep 2024 10:55:55 +0200 Subject: [PATCH 18/80] merge from stale branch --- .../labs/ucx/source_code/linters/dfsa.py | 21 ++- .../labs/ucx/source_code/linters/pyspark.py | 158 ++++++++++++------ 2 files changed, 116 insertions(+), 63 deletions(-) diff --git a/src/databricks/labs/ucx/source_code/linters/dfsa.py b/src/databricks/labs/ucx/source_code/linters/dfsa.py index 19e3f5fd70..afc0fbc5bf 100644 --- a/src/databricks/labs/ucx/source_code/linters/dfsa.py +++ b/src/databricks/labs/ucx/source_code/linters/dfsa.py @@ -69,11 +69,11 @@ class _DetectDfsaVisitor(TreeVisitor): against a list of known deprecated paths. 
""" - def __init__(self, session_state: CurrentSessionState, allow_spark_duplicates: bool) -> None: + def __init__(self, session_state: CurrentSessionState, prevent_spark_duplicates: bool) -> None: self._session_state = session_state self._dfsa_nodes: list[DFSANode] = [] self._reported_locations: set[tuple[int, int]] = set() - self._allow_spark_duplicates = allow_spark_duplicates + self._prevent_spark_duplicates = prevent_spark_duplicates def visit_call(self, node: Call): for arg in node.args: @@ -98,11 +98,16 @@ def _check_str_constant(self, source_node, inferred: InferredValue): if self._already_reported(source_node, inferred): return # avoid duplicate advices that are reported by SparkSqlPyLinter - if Tree(source_node).is_from_module("spark") and not self._allow_spark_duplicates: + if self._prevent_spark_duplicates and Tree(source_node).is_from_module("spark"): return value = inferred.as_string() - if any(pattern.matches(value) for pattern in DFSA_PATTERNS): - self._dfsa_nodes.append(DFSANode(DFSA(value), source_node)) + for pattern in DFSA_PATTERNS: + if not pattern.matches(value): + continue + # since we're normally filtering out spark calls, we're dealing with dfsas we know little about + # notable we don't know is_read or is_write + dfsa = DFSA(source_type=DFSA.UNKNOWN, source_id=DFSA.UNKNOWN, path=value, is_read=True, is_write=False) + self._dfsa_nodes.append(DFSANode(dfsa, source_node)) self._reported_locations.add((source_node.lineno, source_node.col_offset)) def _already_reported(self, source_node: NodeNG, inferred: InferredValue): @@ -116,9 +121,9 @@ def dfsa_nodes(self): class DfsaPyLinter(PythonLinter): - def __init__(self, session_state: CurrentSessionState, allow_spark_duplicates=False): + def __init__(self, session_state: CurrentSessionState, prevent_spark_duplicates=True): self._session_state = session_state - self._allow_spark_duplicates = allow_spark_duplicates + self._prevent_spark_duplicates = prevent_spark_duplicates @staticmethod def name() -> str: @@ -131,7 +136,7 @@ def lint_tree(self, tree: Tree) -> Iterable[Advice]: """ Lints the code looking for file system paths that are deprecated """ - visitor = _DetectDfsaVisitor(self._session_state, self._allow_spark_duplicates) + visitor = _DetectDfsaVisitor(self._session_state, self._prevent_spark_duplicates) visitor.visit(tree.node) for dfsa_node in visitor.dfsa_nodes: advisory = Deprecation.from_node( diff --git a/src/databricks/labs/ucx/source_code/linters/pyspark.py b/src/databricks/labs/ucx/source_code/linters/pyspark.py index a537757add..394b72164f 100644 --- a/src/databricks/labs/ucx/source_code/linters/pyspark.py +++ b/src/databricks/labs/ucx/source_code/linters/pyspark.py @@ -2,6 +2,7 @@ from abc import ABC, abstractmethod from collections.abc import Iterable, Iterator from dataclasses import dataclass +from typing import TypeVar from astroid import Attribute, Call, Const, InferenceError, NodeNG # type: ignore from databricks.labs.ucx.hive_metastore.migration_status import MigrationIndex @@ -12,9 +13,10 @@ Fixer, CurrentSessionState, PythonLinter, + DFSA, ) -from databricks.labs.ucx.source_code.linters.dfsa import DFSA_PATTERNS from databricks.labs.ucx.source_code.python.python_infer import InferredValue +from databricks.labs.ucx.source_code.linters.dfsa import DFSA_PATTERNS, DFSANode from databricks.labs.ucx.source_code.queries import FromTableSqlLinter from databricks.labs.ucx.source_code.python.python_ast import Tree, TreeHelper @@ -31,6 +33,8 @@ class Matcher(ABC): table_arg_name: str | None = None 
call_context: dict[str, set[str]] | None = None session_state: CurrentSessionState | None = None + is_read: bool | None = None + is_write: bool | None = None def matches(self, node: NodeNG): return ( @@ -123,6 +127,8 @@ def lint( self, from_table: FromTableSqlLinter, index: MigrationIndex, session_state: CurrentSessionState, node: Call ) -> Iterator[Advice]: table_arg = self._get_table_arg(node) + if table_arg is None: + return table_name = table_arg.as_string().strip("'").strip('"') for inferred in InferredValue.infer_from_node(table_arg, session_state): if not inferred.is_inferred(): @@ -181,6 +187,9 @@ def apply(self, from_table: FromTableSqlLinter, index: MigrationIndex, node: Cal return +T = TypeVar("T") + + @dataclass class DirectFilesystemAccessMatcher(Matcher): @@ -195,18 +204,34 @@ def matches(self, node: NodeNG): def lint( self, from_table: FromTableSqlLinter, index: MigrationIndex, session_state: CurrentSessionState, node: NodeNG ) -> Iterator[Advice]: - table_arg = self._get_table_arg(node) - for inferred in InferredValue.infer_from_node(table_arg): + + for dfsa_node in self._for_table_arg(node): + yield Deprecation.from_node( + code='direct-filesystem-access', + message=f"The use of direct filesystem references is deprecated: {dfsa_node.dfsa.path}", + node=dfsa_node.node, + ) + + def _for_table_arg(self, node: NodeNG) -> Iterable[DFSANode]: + if not isinstance(node, Call): + return + table_arg_node = self._get_table_arg(node) + for inferred in InferredValue.infer_from_node(table_arg_node): if not inferred.is_inferred(): - logger.debug(f"Could not infer value of {table_arg.as_string()}") continue - value = inferred.as_string() - if any(pattern.matches(value) for pattern in DFSA_PATTERNS): - yield Deprecation.from_node( - code='direct-filesystem-access', - message=f"The use of direct filesystem references is deprecated: {value}", - node=node, + table_arg = inferred.as_string() + if not table_arg: + continue + if any(pattern.matches(table_arg) for pattern in DFSA_PATTERNS): + dfsa = DFSA( + source_type=DFSA.UNKNOWN, + source_id=DFSA.UNKNOWN, + path=table_arg, + is_read=self.is_read or False, + is_write=self.is_write or False, ) + yield DFSANode(dfsa, node) + continue def apply(self, from_table: FromTableSqlLinter, index: MigrationIndex, node: Call) -> None: # No transformations to apply @@ -215,12 +240,64 @@ def apply(self, from_table: FromTableSqlLinter, index: MigrationIndex, node: Cal class SparkMatchers: - def __init__(self): + def __init__(self, dfsa_matchers_only: bool): + + spark_dfsa_matchers: list[Matcher] = [ + DirectFilesystemAccessMatcher( + "ls", 1, 1, 0, call_context={"ls": {"dbutils.fs.ls"}}, is_read=True, is_write=False + ), + DirectFilesystemAccessMatcher( + "cp", 1, 2, 0, call_context={"cp": {"dbutils.fs.cp"}}, is_read=True, is_write=True + ), + DirectFilesystemAccessMatcher("rm", 1, 1, 0, call_context={"rm": {"dbutils.fs.rm"}}, is_write=True), + DirectFilesystemAccessMatcher( + "head", 1, 1, 0, call_context={"head": {"dbutils.fs.head"}}, is_read=True, is_write=False + ), + DirectFilesystemAccessMatcher( + "put", 1, 2, 0, call_context={"put": {"dbutils.fs.put"}}, is_read=False, is_write=True + ), + DirectFilesystemAccessMatcher( + "mkdirs", 1, 1, 0, call_context={"mkdirs": {"dbutils.fs.mkdirs"}}, is_read=False, is_write=True + ), + DirectFilesystemAccessMatcher( + "mv", 1, 2, 0, call_context={"mv": {"dbutils.fs.mv"}}, is_read=False, is_write=True + ), + DirectFilesystemAccessMatcher("text", 1, 3, 0, is_read=True, is_write=False), + 
DirectFilesystemAccessMatcher("csv", 1, 1000, 0, is_read=True, is_write=False), + DirectFilesystemAccessMatcher("json", 1, 1000, 0, is_read=True, is_write=False), + DirectFilesystemAccessMatcher("orc", 1, 1000, 0, is_read=True, is_write=False), + DirectFilesystemAccessMatcher("parquet", 1, 1000, 0, is_read=True, is_write=False), + DirectFilesystemAccessMatcher("save", 0, 1000, 0, "path", is_read=False, is_write=True), + DirectFilesystemAccessMatcher("load", 0, 1000, 0, "path", is_read=True, is_write=False), + DirectFilesystemAccessMatcher( + "option", 1, 1000, 1, is_read=True, is_write=False + ), # Only .option("path", "xxx://bucket/path") will hit + DirectFilesystemAccessMatcher("addFile", 1, 3, 0, is_read=False, is_write=True), + DirectFilesystemAccessMatcher("binaryFiles", 1, 2, 0, is_read=True, is_write=False), + DirectFilesystemAccessMatcher("binaryRecords", 1, 2, 0, is_read=True, is_write=False), + DirectFilesystemAccessMatcher("dump_profiles", 1, 1, 0, is_read=True, is_write=False), + DirectFilesystemAccessMatcher("hadoopFile", 1, 8, 0, is_read=True, is_write=False), + DirectFilesystemAccessMatcher("newAPIHadoopFile", 1, 8, 0, is_read=True, is_write=False), + DirectFilesystemAccessMatcher("pickleFile", 1, 3, 0, is_read=True, is_write=False), + DirectFilesystemAccessMatcher("saveAsHadoopFile", 1, 8, 0, is_read=False, is_write=True), + DirectFilesystemAccessMatcher("saveAsNewAPIHadoopFile", 1, 7, 0, is_read=False, is_write=True), + DirectFilesystemAccessMatcher("saveAsPickleFile", 1, 2, 0, is_read=False, is_write=True), + DirectFilesystemAccessMatcher("saveAsSequenceFile", 1, 2, 0, is_read=False, is_write=True), + DirectFilesystemAccessMatcher("saveAsTextFile", 1, 2, 0, is_read=False, is_write=True), + DirectFilesystemAccessMatcher("load_from_path", 1, 1, 0, is_read=True, is_write=False), + ] + if dfsa_matchers_only: + self._make_matchers(spark_dfsa_matchers) + return + # see https://spark.apache.org/docs/latest/api/python/reference/pyspark.sql/api/pyspark.sql.SparkSession.html - spark_session_matchers = [QueryMatcher("sql", 1, 1000, 0, "sqlQuery"), TableNameMatcher("table", 1, 1, 0)] + spark_session_matchers: list[Matcher] = [ + QueryMatcher("sql", 1, 1000, 0, "sqlQuery"), + TableNameMatcher("table", 1, 1, 0), + ] # see https://spark.apache.org/docs/latest/api/python/reference/pyspark.sql/api/pyspark.sql.Catalog.html - spark_catalog_matchers = [ + spark_catalog_matchers: list[Matcher] = [ TableNameMatcher("cacheTable", 1, 2, 0, "tableName"), TableNameMatcher("createTable", 1, 1000, 0, "tableName"), TableNameMatcher("createExternalTable", 1, 1000, 0, "tableName"), @@ -235,7 +312,7 @@ def __init__(self): ] # see https://spark.apache.org/docs/latest/api/python/reference/pyspark.sql/api/pyspark.sql.DataFrame.html - spark_dataframe_matchers = [ + spark_dataframe_matchers: list[Matcher] = [ TableNameMatcher("writeTo", 1, 1, 0), ] @@ -249,12 +326,12 @@ def __init__(self): # nothing to migrate in Window, see https://spark.apache.org/docs/latest/api/python/reference/pyspark.sql/api/pyspark.sql.Window.html # see https://spark.apache.org/docs/latest/api/python/reference/pyspark.sql/api/pyspark.sql.DataFrameReader.html - spark_dataframereader_matchers = [ + spark_dataframereader_matchers: list[Matcher] = [ TableNameMatcher("table", 1, 1, 0), # TODO good example of collision, see spark_session_calls ] # see https://spark.apache.org/docs/latest/api/python/reference/pyspark.sql/api/pyspark.sql.DataFrameWriter.html - spark_dataframewriter_matchers = [ + spark_dataframewriter_matchers: list[Matcher] 
= [ TableNameMatcher("insertInto", 1, 2, 0, "tableName"), # TODO jdbc: could the url be a databricks url, raise warning ? TableNameMatcher("saveAsTable", 1, 4, 0, "name"), @@ -263,48 +340,20 @@ def __init__(self): # nothing to migrate in DataFrameWriterV2, see https://spark.apache.org/docs/latest/api/python/reference/pyspark.sql/api/pyspark.sql.DataFrameWriterV2.html # nothing to migrate in UDFRegistration, see https://spark.apache.org/docs/latest/api/python/reference/pyspark.sql/api/pyspark.sql.UDFRegistration.html - direct_fs_access_matchers = [ - DirectFilesystemAccessMatcher("ls", 1, 1, 0, call_context={"ls": {"dbutils.fs.ls"}}), - DirectFilesystemAccessMatcher("cp", 1, 2, 0, call_context={"cp": {"dbutils.fs.cp"}}), - DirectFilesystemAccessMatcher("rm", 1, 1, 0, call_context={"rm": {"dbutils.fs.rm"}}), - DirectFilesystemAccessMatcher("head", 1, 1, 0, call_context={"head": {"dbutils.fs.head"}}), - DirectFilesystemAccessMatcher("put", 1, 2, 0, call_context={"put": {"dbutils.fs.put"}}), - DirectFilesystemAccessMatcher("mkdirs", 1, 1, 0, call_context={"mkdirs": {"dbutils.fs.mkdirs"}}), - DirectFilesystemAccessMatcher("mv", 1, 2, 0, call_context={"mv": {"dbutils.fs.mv"}}), - DirectFilesystemAccessMatcher("text", 1, 3, 0), - DirectFilesystemAccessMatcher("csv", 1, 1000, 0), - DirectFilesystemAccessMatcher("json", 1, 1000, 0), - DirectFilesystemAccessMatcher("orc", 1, 1000, 0), - DirectFilesystemAccessMatcher("parquet", 1, 1000, 0), - DirectFilesystemAccessMatcher("save", 0, 1000, 0, "path"), - DirectFilesystemAccessMatcher("load", 0, 1000, 0, "path"), - DirectFilesystemAccessMatcher("option", 1, 1000, 1), # Only .option("path", "xxx://bucket/path") will hit - DirectFilesystemAccessMatcher("addFile", 1, 3, 0), - DirectFilesystemAccessMatcher("binaryFiles", 1, 2, 0), - DirectFilesystemAccessMatcher("binaryRecords", 1, 2, 0), - DirectFilesystemAccessMatcher("dump_profiles", 1, 1, 0), - DirectFilesystemAccessMatcher("hadoopFile", 1, 8, 0), - DirectFilesystemAccessMatcher("newAPIHadoopFile", 1, 8, 0), - DirectFilesystemAccessMatcher("pickleFile", 1, 3, 0), - DirectFilesystemAccessMatcher("saveAsHadoopFile", 1, 8, 0), - DirectFilesystemAccessMatcher("saveAsNewAPIHadoopFile", 1, 7, 0), - DirectFilesystemAccessMatcher("saveAsPickleFile", 1, 2, 0), - DirectFilesystemAccessMatcher("saveAsSequenceFile", 1, 2, 0), - DirectFilesystemAccessMatcher("saveAsTextFile", 1, 2, 0), - DirectFilesystemAccessMatcher("load_from_path", 1, 1, 0), - ] - # nothing to migrate in UserDefinedFunction, see https://spark.apache.org/docs/latest/api/python/reference/pyspark.sql/api/pyspark.sql.UserDefinedFunction.html # nothing to migrate in UserDefinedTableFunction, see https://spark.apache.org/docs/latest/api/python/reference/pyspark.sql/api/pyspark.sql.UserDefinedTableFunction.html - self._matchers = {} - for matcher in ( - spark_session_matchers + self._make_matchers( + spark_dfsa_matchers + + spark_session_matchers + spark_catalog_matchers + spark_dataframe_matchers + spark_dataframereader_matchers + spark_dataframewriter_matchers - + direct_fs_access_matchers - ): + ) + + def _make_matchers(self, matchers: list[Matcher]): + self._matchers = {} + for matcher in matchers: self._matchers[matcher.method_name] = matcher @property @@ -314,12 +363,11 @@ def matchers(self): class SparkSqlPyLinter(PythonLinter, Fixer): - _spark_matchers = SparkMatchers() - def __init__(self, from_table: FromTableSqlLinter, index: MigrationIndex, session_state): self._from_table = from_table self._index = index self._session_state = session_state 
+ self._spark_matchers = SparkMatchers(False).matchers def name(self) -> str: # this is the same fixer, just in a different language context @@ -349,7 +397,7 @@ def _find_matcher(self, node: NodeNG): return None if not isinstance(node.func, Attribute): return None - matcher = self._spark_matchers.matchers.get(node.func.attrname, None) + matcher = self._spark_matchers.get(node.func.attrname, None) if matcher is None: return None return matcher if matcher.matches(node) else None From 7ae9adc76745b000a2c01bc9efcd9a9c781c6b84 Mon Sep 17 00:00:00 2001 From: Eric Vergnaud Date: Tue, 3 Sep 2024 11:40:47 +0200 Subject: [PATCH 19/80] fix failing tests --- src/databricks/labs/ucx/source_code/linters/dfsa.py | 2 +- tests/unit/source_code/linters/test_dfsa.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/databricks/labs/ucx/source_code/linters/dfsa.py b/src/databricks/labs/ucx/source_code/linters/dfsa.py index afc0fbc5bf..4a54818456 100644 --- a/src/databricks/labs/ucx/source_code/linters/dfsa.py +++ b/src/databricks/labs/ucx/source_code/linters/dfsa.py @@ -52,7 +52,7 @@ def _matches_allowed_root(self, value: str): DFSAPattern("abfs:/", []), DFSAPattern("abfss:/", []), DFSAPattern("hdfs:/", []), - DFSAPattern("/mnt/", []), + # "/mnt/" is detected by the below pattern, RootPattern("/", ["Volumes/", "Workspace/", "tmp/"]), ] diff --git a/tests/unit/source_code/linters/test_dfsa.py b/tests/unit/source_code/linters/test_dfsa.py index 11cb5a1d08..db8a4ac096 100644 --- a/tests/unit/source_code/linters/test_dfsa.py +++ b/tests/unit/source_code/linters/test_dfsa.py @@ -33,7 +33,7 @@ def test_matches_dfsa_pattern(path, matches): ], ) def test_detects_dfsa_paths(code, expected): - linter = DfsaPyLinter(CurrentSessionState(), allow_spark_duplicates=True) + linter = DfsaPyLinter(CurrentSessionState(), prevent_spark_duplicates=False) advices = list(linter.lint(code)) for advice in advices: assert isinstance(advice, Advice) @@ -63,7 +63,7 @@ def test_detects_dfsa_paths(code, expected): ], ) def test_dfsa_usage_linter(code, expected): - linter = DfsaPyLinter(CurrentSessionState(), allow_spark_duplicates=True) + linter = DfsaPyLinter(CurrentSessionState(), prevent_spark_duplicates=False) advices = linter.lint(code) count = 0 for advice in advices: From fdf7a39991a2f92ae3a67814a8041e5b81fc06fd Mon Sep 17 00:00:00 2001 From: Eric Vergnaud Date: Tue, 3 Sep 2024 11:44:10 +0200 Subject: [PATCH 20/80] rename ctor arg --- src/databricks/labs/ucx/source_code/jobs.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/databricks/labs/ucx/source_code/jobs.py b/src/databricks/labs/ucx/source_code/jobs.py index feab7bc1d0..dc09d2e12c 100644 --- a/src/databricks/labs/ucx/source_code/jobs.py +++ b/src/databricks/labs/ucx/source_code/jobs.py @@ -419,13 +419,13 @@ class LintingWalker(DependencyGraphWalker[LocatedAdvice]): def __init__( self, graph: DependencyGraph, - linted_paths: set[Path], + walked_paths: set[Path], path_lookup: PathLookup, key: str, session_state: CurrentSessionState, migration_index: MigrationIndex, ): - super().__init__(graph, linted_paths, path_lookup) + super().__init__(graph, walked_paths, path_lookup) self._key = key self._session_state = session_state self._migration_index = migration_index From 054f847f5c9e44e3d37106bd51e69f8c6b28e572 Mon Sep 17 00:00:00 2001 From: Eric Vergnaud Date: Tue, 3 Sep 2024 12:36:32 +0200 Subject: [PATCH 21/80] fix infinite recursion with unknown ASTs --- .../labs/ucx/source_code/python/python_ast.py | 70 ++++++++++++++----- 1 
file changed, 52 insertions(+), 18 deletions(-) diff --git a/src/databricks/labs/ucx/source_code/python/python_ast.py b/src/databricks/labs/ucx/source_code/python/python_ast.py index 7ce7228dd9..57dc87d21e 100644 --- a/src/databricks/labs/ucx/source_code/python/python_ast.py +++ b/src/databricks/labs/ucx/source_code/python/python_ast.py @@ -141,7 +141,7 @@ def append_tree(self, tree: Tree) -> Tree: # because each node points to the correct parent (practically, the tree is now only a list of statements) return tree - def append_globals(self, globs: dict[str, list[NodeNG]]) -> None: + def append_globals(self, globs: dict[str, list[Expr]]) -> None: if not isinstance(self.node, Module): raise NotImplementedError(f"Can't append globals to {type(self.node).__name__}") self_module: Module = cast(Module, self.node) @@ -161,28 +161,56 @@ def append_nodes(self, nodes: list[NodeNG]) -> None: self_module.body.append(node) def is_from_module(self, module_name: str) -> bool: - # if this is the call's root node, check it against the required module - if isinstance(self._node, Name): - if self._node.name == module_name: - return True - root = self.root - if not isinstance(root, Module): - return False - for value in root.globals.get(self._node.name, []): - if not isinstance(value, AssignName) or not isinstance(value.parent, Assign): - continue - if Tree(value.parent.value).is_from_module(module_name): - return True + return self._is_from_module(module_name, set()) + + def _is_from_module(self, module_name: str, visited: set[NodeNG]) -> bool: + if self._node in visited: + logger.debug(f"Recursion encountered while traversing node {self._node.as_string()}") return False - # walk up intermediate calls such as spark.range(...) + visited.add(self._node) + return self._node_is_from_module(module_name, visited) + + def _node_is_from_module(self, module_name: str, visited: set[NodeNG]) -> bool: + if isinstance(self._node, Name): + return self._name_is_from_module(module_name, visited) if isinstance(self._node, Call): - return isinstance(self._node.func, Attribute) and Tree(self._node.func.expr).is_from_module(module_name) + return self._call_is_from_module(module_name, visited) if isinstance(self._node, Attribute): - return Tree(self._node.expr).is_from_module(module_name) + return self._attribute_is_from_module(module_name, visited) if isinstance(self._node, Const): - return Tree(self._node.parent).is_from_module(module_name) + return self._const_is_from_module(module_name, visited) + return False + + def _name_is_from_module(self, module_name: str, visited: set[NodeNG]) -> bool: + assert isinstance(self._node, Name) + # if this is the call's root node, check it against the required module + if self._node.name == module_name: + return True + root = self.root + if not isinstance(root, Module): + return False + for value in root.globals.get(self._node.name, []): + if not isinstance(value, AssignName) or not isinstance(value.parent, Assign): + continue + if _LocalTree(value.parent.value).is_from_module_visited(module_name, visited): + return True return False + def _call_is_from_module(self, module_name: str, visited: set[NodeNG]) -> bool: + assert isinstance(self._node, Call) + # walk up intermediate calls such as spark.range(...) 
+ return isinstance(self._node.func, Attribute) and _LocalTree(self._node.func.expr).is_from_module_visited( + module_name, visited + ) + + def _attribute_is_from_module(self, module_name: str, visited: set[NodeNG]) -> bool: + assert isinstance(self._node, Attribute) + return _LocalTree(self._node.expr).is_from_module_visited(module_name, visited) + + def _const_is_from_module(self, module_name: str, visited: set[NodeNG]) -> bool: + assert isinstance(self._node, Const) + return _LocalTree(self._node.parent).is_from_module_visited(module_name, visited) + def has_global(self, name: str) -> bool: if not isinstance(self.node, Module): return False @@ -230,7 +258,7 @@ def renumber(self, start: int) -> Tree: assert start != 0 if not isinstance(self.node, Module): raise NotImplementedError(f"Can't renumber {type(self.node).__name__}") - root: Module = self.node + root: Module = cast(Module, self.node) # for now renumber in place to avoid the complexity of rebuilding the tree with clones def renumber_node(node: NodeNG, offset: int) -> None: @@ -249,6 +277,12 @@ def renumber_node(node: NodeNG, offset: int) -> None: return self +class _LocalTree(Tree): + + def is_from_module_visited(self, name: str, visited_nodes: set[NodeNG]) -> bool: + return self._is_from_module(name, visited_nodes) + + class TreeHelper(ABC): @classmethod From 9c4a5bfd8c9156eb7f61c04ba77de6d29d29d3f7 Mon Sep 17 00:00:00 2001 From: Eric Vergnaud Date: Tue, 3 Sep 2024 12:36:32 +0200 Subject: [PATCH 22/80] fix infinite recursion with unknown ASTs --- .../labs/ucx/source_code/python/python_ast.py | 70 ++++++++++++++----- 1 file changed, 52 insertions(+), 18 deletions(-) diff --git a/src/databricks/labs/ucx/source_code/python/python_ast.py b/src/databricks/labs/ucx/source_code/python/python_ast.py index 7ce7228dd9..57dc87d21e 100644 --- a/src/databricks/labs/ucx/source_code/python/python_ast.py +++ b/src/databricks/labs/ucx/source_code/python/python_ast.py @@ -141,7 +141,7 @@ def append_tree(self, tree: Tree) -> Tree: # because each node points to the correct parent (practically, the tree is now only a list of statements) return tree - def append_globals(self, globs: dict[str, list[NodeNG]]) -> None: + def append_globals(self, globs: dict[str, list[Expr]]) -> None: if not isinstance(self.node, Module): raise NotImplementedError(f"Can't append globals to {type(self.node).__name__}") self_module: Module = cast(Module, self.node) @@ -161,28 +161,56 @@ def append_nodes(self, nodes: list[NodeNG]) -> None: self_module.body.append(node) def is_from_module(self, module_name: str) -> bool: - # if this is the call's root node, check it against the required module - if isinstance(self._node, Name): - if self._node.name == module_name: - return True - root = self.root - if not isinstance(root, Module): - return False - for value in root.globals.get(self._node.name, []): - if not isinstance(value, AssignName) or not isinstance(value.parent, Assign): - continue - if Tree(value.parent.value).is_from_module(module_name): - return True + return self._is_from_module(module_name, set()) + + def _is_from_module(self, module_name: str, visited: set[NodeNG]) -> bool: + if self._node in visited: + logger.debug(f"Recursion encountered while traversing node {self._node.as_string()}") return False - # walk up intermediate calls such as spark.range(...) 
+ visited.add(self._node) + return self._node_is_from_module(module_name, visited) + + def _node_is_from_module(self, module_name: str, visited: set[NodeNG]) -> bool: + if isinstance(self._node, Name): + return self._name_is_from_module(module_name, visited) if isinstance(self._node, Call): - return isinstance(self._node.func, Attribute) and Tree(self._node.func.expr).is_from_module(module_name) + return self._call_is_from_module(module_name, visited) if isinstance(self._node, Attribute): - return Tree(self._node.expr).is_from_module(module_name) + return self._attribute_is_from_module(module_name, visited) if isinstance(self._node, Const): - return Tree(self._node.parent).is_from_module(module_name) + return self._const_is_from_module(module_name, visited) + return False + + def _name_is_from_module(self, module_name: str, visited: set[NodeNG]) -> bool: + assert isinstance(self._node, Name) + # if this is the call's root node, check it against the required module + if self._node.name == module_name: + return True + root = self.root + if not isinstance(root, Module): + return False + for value in root.globals.get(self._node.name, []): + if not isinstance(value, AssignName) or not isinstance(value.parent, Assign): + continue + if _LocalTree(value.parent.value).is_from_module_visited(module_name, visited): + return True return False + def _call_is_from_module(self, module_name: str, visited: set[NodeNG]) -> bool: + assert isinstance(self._node, Call) + # walk up intermediate calls such as spark.range(...) + return isinstance(self._node.func, Attribute) and _LocalTree(self._node.func.expr).is_from_module_visited( + module_name, visited + ) + + def _attribute_is_from_module(self, module_name: str, visited: set[NodeNG]) -> bool: + assert isinstance(self._node, Attribute) + return _LocalTree(self._node.expr).is_from_module_visited(module_name, visited) + + def _const_is_from_module(self, module_name: str, visited: set[NodeNG]) -> bool: + assert isinstance(self._node, Const) + return _LocalTree(self._node.parent).is_from_module_visited(module_name, visited) + def has_global(self, name: str) -> bool: if not isinstance(self.node, Module): return False @@ -230,7 +258,7 @@ def renumber(self, start: int) -> Tree: assert start != 0 if not isinstance(self.node, Module): raise NotImplementedError(f"Can't renumber {type(self.node).__name__}") - root: Module = self.node + root: Module = cast(Module, self.node) # for now renumber in place to avoid the complexity of rebuilding the tree with clones def renumber_node(node: NodeNG, offset: int) -> None: @@ -249,6 +277,12 @@ def renumber_node(node: NodeNG, offset: int) -> None: return self +class _LocalTree(Tree): + + def is_from_module_visited(self, name: str, visited_nodes: set[NodeNG]) -> bool: + return self._is_from_module(name, visited_nodes) + + class TreeHelper(ABC): @classmethod From 688985270374487279acdfdf61b38188fe02feef Mon Sep 17 00:00:00 2001 From: Eric Vergnaud Date: Tue, 3 Sep 2024 12:57:22 +0200 Subject: [PATCH 23/80] make register_magic_command a decorator --- .../labs/ucx/source_code/notebooks/cells.py | 17 +++++++---------- .../ucx/source_code/python/python_analyzer.py | 10 ++++++++-- 2 files changed, 15 insertions(+), 12 deletions(-) diff --git a/src/databricks/labs/ucx/source_code/notebooks/cells.py b/src/databricks/labs/ucx/source_code/notebooks/cells.py index 58e5bd763e..bd38b35333 100644 --- a/src/databricks/labs/ucx/source_code/notebooks/cells.py +++ b/src/databricks/labs/ucx/source_code/notebooks/cells.py @@ -24,7 +24,7 @@ 
PythonCodeAnalyzer, MagicCommand, MagicNode, - register_magic_command_factory, + magic_command_factory, ) # use a specific logger for sqlglot warnings so we can disable them selectively @@ -400,8 +400,9 @@ def wrap_with_magic(self, code: str, cell_language: CellLanguage) -> str: class RunCommand(MagicCommand): - @classmethod - def factory(cls, command: str, node: NodeNG) -> MagicCommand | None: + @staticmethod + @magic_command_factory + def factory(command: str, node: NodeNG) -> MagicCommand | None: if command.startswith("%run"): return RunCommand(node, command) return None @@ -443,13 +444,11 @@ def build_inherited_context(self, context: DependencyGraphContext, child_path: P return container.build_inherited_context(context.parent, child_path) -register_magic_command_factory(RunCommand.factory) - - class PipCommand(MagicCommand): - @classmethod - def factory(cls, command: str, node: NodeNG) -> MagicCommand | None: + @staticmethod + @magic_command_factory + def factory(command: str, node: NodeNG) -> MagicCommand | None: if command.startswith("%pip") or command.startswith("!pip"): return PipCommand(node, command) return None @@ -493,5 +492,3 @@ def _split(cls, code: str) -> list[str]: lexer = shlex.split(code, posix=True) return list(lexer) - -register_magic_command_factory(PipCommand.factory) diff --git a/src/databricks/labs/ucx/source_code/python/python_analyzer.py b/src/databricks/labs/ucx/source_code/python/python_analyzer.py index 05d27709f9..78e6de34d3 100644 --- a/src/databricks/labs/ucx/source_code/python/python_analyzer.py +++ b/src/databricks/labs/ucx/source_code/python/python_analyzer.py @@ -271,5 +271,11 @@ def build_inherited_context(self, _context: DependencyGraphContext, _child_path: _FACTORIES: list[Callable[[str, NodeNG], MagicCommand | None]] = [] -def register_magic_command_factory(factory: Callable[[str, NodeNG], MagicCommand | None]): - _FACTORIES.append(factory) +def magic_command_factory(func: Callable[[str, NodeNG], MagicCommand | None]): + _FACTORIES.append(func) + + def inner(command: str, node: NodeNG) -> MagicCommand | None: + return func(command, node) + + return inner + From 779219997cf52268ebf2f772f8bfd458bed949e3 Mon Sep 17 00:00:00 2001 From: Eric Vergnaud Date: Tue, 3 Sep 2024 12:57:55 +0200 Subject: [PATCH 24/80] formatting --- src/databricks/labs/ucx/source_code/notebooks/cells.py | 1 - src/databricks/labs/ucx/source_code/python/python_analyzer.py | 1 - 2 files changed, 2 deletions(-) diff --git a/src/databricks/labs/ucx/source_code/notebooks/cells.py b/src/databricks/labs/ucx/source_code/notebooks/cells.py index bd38b35333..a2b994972a 100644 --- a/src/databricks/labs/ucx/source_code/notebooks/cells.py +++ b/src/databricks/labs/ucx/source_code/notebooks/cells.py @@ -491,4 +491,3 @@ def _split(cls, code: str) -> list[str]: code = code.replace("\\\n", " ") lexer = shlex.split(code, posix=True) return list(lexer) - diff --git a/src/databricks/labs/ucx/source_code/python/python_analyzer.py b/src/databricks/labs/ucx/source_code/python/python_analyzer.py index 78e6de34d3..d2a2454b2f 100644 --- a/src/databricks/labs/ucx/source_code/python/python_analyzer.py +++ b/src/databricks/labs/ucx/source_code/python/python_analyzer.py @@ -278,4 +278,3 @@ def inner(command: str, node: NodeNG) -> MagicCommand | None: return func(command, node) return inner - From b4ba5ae3e425bf1606853f22a756bf98c441c62a Mon Sep 17 00:00:00 2001 From: Eric Vergnaud Date: Tue, 3 Sep 2024 17:04:12 +0200 Subject: [PATCH 25/80] integrate with WorkflowLinter --- 
.../labs/ucx/source_code/dfsa_crawler.py | 6 +- src/databricks/labs/ucx/source_code/jobs.py | 88 ++++++++++++++++++- .../labs/ucx/source_code/linters/dfsa.py | 82 ++++++++++++++--- tests/unit/source_code/conftest.py | 8 ++ .../functional/file-access/create_location.py | 2 + tests/unit/source_code/test_dfsa_crawler.py | 7 +- tests/unit/source_code/test_functional.py | 2 +- tests/unit/source_code/test_jobs.py | 6 +- 8 files changed, 177 insertions(+), 24 deletions(-) diff --git a/src/databricks/labs/ucx/source_code/dfsa_crawler.py b/src/databricks/labs/ucx/source_code/dfsa_crawler.py index 8a9a1d879c..fa3ce36f8d 100644 --- a/src/databricks/labs/ucx/source_code/dfsa_crawler.py +++ b/src/databricks/labs/ucx/source_code/dfsa_crawler.py @@ -1,3 +1,5 @@ +from collections.abc import Sequence + from databricks.labs.ucx.framework.crawlers import CrawlerBase from databricks.labs.ucx.source_code.base import DFSA from databricks.labs.lsql.backends import SqlBackend @@ -15,5 +17,5 @@ def __init__(self, backend: SqlBackend, schema: str): """ super().__init__(backend, "hive_metastore", schema, "direct_file_system_access", DFSA) - def append(self, dfsa: DFSA): - self._append_records([dfsa]) + def append(self, dfsas: Sequence[DFSA]): + self._append_records(dfsas) diff --git a/src/databricks/labs/ucx/source_code/jobs.py b/src/databricks/labs/ucx/source_code/jobs.py index dc09d2e12c..b8c534adb6 100644 --- a/src/databricks/labs/ucx/source_code/jobs.py +++ b/src/databricks/labs/ucx/source_code/jobs.py @@ -16,11 +16,20 @@ from databricks.sdk import WorkspaceClient from databricks.sdk.errors import NotFound from databricks.sdk.service import compute, jobs +from databricks.sdk.service.workspace import Language from databricks.labs.ucx.assessment.crawlers import runtime_version_tuple from databricks.labs.ucx.hive_metastore.migration_status import MigrationIndex from databricks.labs.ucx.mixins.cached_workspace_path import WorkspaceCache -from databricks.labs.ucx.source_code.base import CurrentSessionState, LocatedAdvice +from databricks.labs.ucx.source_code.base import ( + CurrentSessionState, + LocatedAdvice, + DFSA, + is_a_notebook, + file_language, + guess_encoding, +) +from databricks.labs.ucx.source_code.dfsa_crawler import DfsaCrawler from databricks.labs.ucx.source_code.graph import ( Dependency, DependencyGraph, @@ -31,8 +40,9 @@ DependencyGraphWalker, ) from databricks.labs.ucx.source_code.linters.context import LinterContext +from databricks.labs.ucx.source_code.linters.dfsa import DfsaSqlLinter, DfsaPyLinter from databricks.labs.ucx.source_code.python.python_ast import Tree -from databricks.labs.ucx.source_code.notebooks.sources import FileLinter +from databricks.labs.ucx.source_code.notebooks.sources import FileLinter, Notebook from databricks.labs.ucx.source_code.path_lookup import PathLookup logger = logging.getLogger(__name__) @@ -320,12 +330,14 @@ def __init__( resolver: DependencyResolver, path_lookup: PathLookup, migration_index: MigrationIndex, + dfsa_crawler: DfsaCrawler, include_job_ids: list[int] | None = None, ): self._ws = ws self._resolver = resolver self._path_lookup = path_lookup self._migration_index = migration_index + self._dfsa_crawler = dfsa_crawler self._include_job_ids = include_job_ids def refresh_report(self, sql_backend: SqlBackend, inventory_database: str): @@ -412,6 +424,9 @@ def _lint_task(self, task: jobs.Task, job: jobs.Job, linted_paths: set[Path]) -> graph, linted_paths, self._path_lookup, task.task_key, session_state, self._migration_index ) yield from walker + collector 
= DfsaCollector(graph, set(), self._path_lookup, session_state) + dfsas = list(dfsa for dfsa in collector) + self._dfsa_crawler.append(dfsas) class LintingWalker(DependencyGraphWalker[LocatedAdvice]): @@ -441,3 +456,72 @@ def _process_dependency( linter = FileLinter(ctx, path_lookup, self._session_state, dependency.path, inherited_tree) for advice in linter.lint(): yield LocatedAdvice(advice, dependency.path) + + +class DfsaCollector(DependencyGraphWalker[DFSA]): + + def __init__( + self, + graph: DependencyGraph, + walked_paths: set[Path], + path_lookup: PathLookup, + session_state: CurrentSessionState, + ): + super().__init__(graph, walked_paths, path_lookup) + self._session_state = session_state + + def _process_dependency( + self, dependency: Dependency, path_lookup: PathLookup, inherited_tree: Tree | None + ) -> Iterable[DFSA]: + language = file_language(dependency.path) + if not language: + logger.warning(f"Unknown language for {dependency.path}") + return + source = dependency.path.read_text(guess_encoding(dependency.path)) + if is_a_notebook(dependency.path): + yield from self._collect_from_notebook(source, language, dependency.path, inherited_tree) + elif dependency.path.is_file(): + yield from self._collect_from_source(source, language, dependency.path, inherited_tree) + + def _collect_from_notebook( + self, source: str, language: Language, path: Path, inherited_tree: Tree | None + ) -> Iterable[DFSA]: + notebook = Notebook.parse(path, source, language) + for cell in notebook.cells: + for dfsa in self._collect_from_source(cell.original_code, cell.language.language, path, inherited_tree): + yield DFSA( + source_type="NOTEBOOK", + source_id=str(path), + path=dfsa.path, + is_read=dfsa.is_read, + is_write=dfsa.is_write, + ) + if cell.language.language is Language.PYTHON: + if inherited_tree is None: + inherited_tree = Tree.new_module() + tree = Tree.normalize_and_parse(cell.original_code) + inherited_tree.append_tree(tree) + + def _collect_from_source( + self, source: str, language: Language, path: Path, inherited_tree: Tree | None + ) -> Iterable[DFSA]: + iterable: Iterable[DFSA] | None = None + if language is Language.SQL: + iterable = self._collect_from_sql(source) + if language is Language.PYTHON: + iterable = self._collect_from_python(source, inherited_tree) + if iterable is None: + logger.warning(f"Language {language.name} not supported yet!") + return + for dfsa in iterable: + yield DFSA( + source_type="FILE", source_id=str(path), path=dfsa.path, is_read=dfsa.is_read, is_write=dfsa.is_write + ) + + def _collect_from_python(self, source: str, inherited_tree: Tree | None) -> Iterable[DFSA]: + linter = DfsaPyLinter(self._session_state, prevent_spark_duplicates=False) + yield from linter.collect_dfsas(source, inherited_tree) + + def _collect_from_sql(self, source: str) -> Iterable[DFSA]: + linter = DfsaSqlLinter() + yield from linter.collect_dfsas(source) diff --git a/src/databricks/labs/ucx/source_code/linters/dfsa.py b/src/databricks/labs/ucx/source_code/linters/dfsa.py index 4a54818456..6fd87e41c1 100644 --- a/src/databricks/labs/ucx/source_code/linters/dfsa.py +++ b/src/databricks/labs/ucx/source_code/linters/dfsa.py @@ -4,8 +4,8 @@ from collections.abc import Iterable from astroid import Call, Const, InferenceError, NodeNG # type: ignore -from sqlglot import Expression -from sqlglot.expressions import Table +from sqlglot import Expression as SqlExpression, parse as parse_sql, ParseError as SqlParseError +from sqlglot.expressions import Alter, Create, Delete, Drop, Identifier, 
Insert, Literal, Select from databricks.labs.ucx.source_code.base import ( Advice, @@ -146,6 +146,15 @@ def lint_tree(self, tree: Tree) -> Iterable[Advice]: ) yield advisory + def collect_dfsas(self, python_code: str, inherited_tree: Tree | None) -> Iterable[DFSA]: + tree = Tree.new_module() + if inherited_tree: + tree.append_tree(inherited_tree) + tree.append_tree(Tree.normalize_and_parse(python_code)) + visitor = _DetectDfsaVisitor(self._session_state, self._prevent_spark_duplicates) + visitor.visit(tree.node) + yield from visitor.dfsa_nodes + class DfsaSqlLinter(SqlLinter): @@ -153,23 +162,68 @@ class DfsaSqlLinter(SqlLinter): def name() -> str: return 'dfsa-query' - def lint_expression(self, expression: Expression): - for table in expression.find_all(Table): - # Check table names for direct file system access - yield from self._check_dfsa(table) - - def _check_dfsa(self, table: Table) -> Iterable[Advice]: - """ - Check if the table is a DBFS table or reference in some way - and yield a deprecation message if it is - """ - if any(pattern.matches(table.name) for pattern in DFSA_PATTERNS): + def lint_expression(self, expression: SqlExpression): + for dfsa in self._collect_dfsas(expression): yield Deprecation( code='direct-filesystem-access-in-sql-query', - message=f"The use of direct filesystem references is deprecated: {table.name}", + message=f"The use of direct filesystem references is deprecated: {dfsa.path}", # SQLGlot does not propagate tokens yet. See https://github.com/tobymao/sqlglot/issues/3159 start_line=0, start_col=0, end_line=0, end_col=1024, ) + + def collect_dfsas(self, sql_code: str): + try: + expressions = parse_sql(sql_code, read='databricks') + for expression in expressions: + if not expression: + continue + yield from self._collect_dfsas(expression) + except SqlParseError as e: + logger.debug(f"Failed to parse SQL: {sql_code}", exc_info=e) + + @classmethod + def _collect_dfsas(cls, expression: SqlExpression) -> Iterable[DFSA]: + yield from cls._collect_dfsas_from_literals(expression) + yield from cls._collect_dfsas_from_identifiers(expression) + + @classmethod + def _collect_dfsas_from_literals(cls, expression: SqlExpression) -> Iterable[DFSA]: + for literal in expression.find_all(Literal): + if not isinstance(literal.this, str): + logger.warning(f"Can't interpret {type(literal.this).__name__}") + yield from cls._collect_dfsa_from_node(literal, literal.this) + + @classmethod + def _collect_dfsas_from_identifiers(cls, expression: SqlExpression) -> Iterable[DFSA]: + for identifier in expression.find_all(Identifier): + if not isinstance(identifier.this, str): + logger.warning(f"Can't interpret {type(identifier.this).__name__}") + yield from cls._collect_dfsa_from_node(identifier, identifier.this) + + @classmethod + def _collect_dfsa_from_node(cls, expression: SqlExpression, path: str) -> Iterable[DFSA]: + if any(pattern.matches(path) for pattern in DFSA_PATTERNS): + is_read = cls._is_read(expression) + is_write = cls._is_write(expression) + yield DFSA(source_type=DFSA.UNKNOWN, source_id=DFSA.UNKNOWN, path=path, is_read=is_read, is_write=is_write) + + @classmethod + def _is_read(cls, expression: SqlExpression | None) -> bool: + expression = cls._walk_up(expression) + return isinstance(expression, Select) + + @classmethod + def _is_write(cls, expression: SqlExpression | None) -> bool: + expression = cls._walk_up(expression) + return isinstance(expression, (Create, Alter, Drop, Insert, Delete)) + + @classmethod + def _walk_up(cls, expression: SqlExpression | None) -> 
SqlExpression | None: + if expression is None: + return None + if isinstance(expression, (Create, Alter, Drop, Insert, Delete, Select)): + return expression + return cls._walk_up(expression.parent) diff --git a/tests/unit/source_code/conftest.py b/tests/unit/source_code/conftest.py index df70470041..24c6020077 100644 --- a/tests/unit/source_code/conftest.py +++ b/tests/unit/source_code/conftest.py @@ -1,9 +1,12 @@ import pytest +from databricks.labs.lsql.backends import MockBackend + from databricks.labs.ucx.hive_metastore.migration_status import ( MigrationStatus, ) from databricks.labs.ucx.hive_metastore.migration_status import MigrationIndex +from databricks.labs.ucx.source_code.dfsa_crawler import DfsaCrawler from databricks.labs.ucx.source_code.graph import DependencyResolver from databricks.labs.ucx.source_code.known import KnownList from databricks.labs.ucx.source_code.linters.files import ImportFileResolver, FileLoader @@ -55,3 +58,8 @@ def simple_dependency_resolver(mock_path_lookup: PathLookup) -> DependencyResolv notebook_resolver = NotebookResolver(NotebookLoader()) import_resolver = ImportFileResolver(FileLoader(), allow_list) return DependencyResolver(library_resolver, notebook_resolver, import_resolver, import_resolver, mock_path_lookup) + + +@pytest.fixture +def mock_dfsa_crawler() -> DfsaCrawler: + return DfsaCrawler(MockBackend(), "schema") diff --git a/tests/unit/source_code/samples/functional/file-access/create_location.py b/tests/unit/source_code/samples/functional/file-access/create_location.py index 940640f7ae..178ddf4c62 100644 --- a/tests/unit/source_code/samples/functional/file-access/create_location.py +++ b/tests/unit/source_code/samples/functional/file-access/create_location.py @@ -4,10 +4,12 @@ # COMMAND ---------- +# ucx[direct-filesystem-access-in-sql-query:+0:0:+0:1024] The use of direct filesystem references is deprecated: s3a://db-gtm-industry-solutions/data/fsi/capm/sp_500/ # MAGIC %sql # MAGIC CREATE TABLE hive_metastore.indices_historical_data.sp_500 LOCATION 's3a://db-gtm-industry-solutions/data/fsi/capm/sp_500/' # COMMAND ---------- +# ucx[direct-filesystem-access-in-sql-query:+0:0:+0:1024] The use of direct filesystem references is deprecated: s3a://db-gtm-industry-solutions/data/fsi/capm/sp_550/ # MAGIC %sql # MAGIC CREATE TABLE hive_metastore.indices_historical_data.sp_550 LOCATION 's3a://db-gtm-industry-solutions/data/fsi/capm/sp_550/' diff --git a/tests/unit/source_code/test_dfsa_crawler.py b/tests/unit/source_code/test_dfsa_crawler.py index 93652c4aee..d442805c40 100644 --- a/tests/unit/source_code/test_dfsa_crawler.py +++ b/tests/unit/source_code/test_dfsa_crawler.py @@ -7,8 +7,9 @@ def test_crawler_appends_dfsas(): backend = MockBackend() crawler = DfsaCrawler(backend, "schema") - for path in ("a", "b", "c"): - dfsa = DFSA(source_type="SOURCE", source_id="ID", path=path, is_read=False, is_write=False) - crawler.append(dfsa) + dfsas = list( + DFSA(source_type="SOURCE", source_id="ID", path=path, is_read=False, is_write=False) for path in ("a", "b", "c") + ) + crawler.append(dfsas) rows = backend.rows_written_for(crawler.full_name, "append") assert len(rows) == 3 diff --git a/tests/unit/source_code/test_functional.py b/tests/unit/source_code/test_functional.py index ed2f6fe6dc..d4d5250058 100644 --- a/tests/unit/source_code/test_functional.py +++ b/tests/unit/source_code/test_functional.py @@ -249,7 +249,7 @@ def test_functional_with_parent( @pytest.mark.skip(reason="Used for troubleshooting failing tests") def 
test_one_functional(mock_path_lookup, simple_dependency_resolver, extended_test_index): - path = mock_path_lookup.resolve(Path("functional/table-migration/table-migration-notebook.sql")) + path = mock_path_lookup.resolve(Path("functional/file-access/create_location.py")) path_lookup = mock_path_lookup.change_directory(path.parent) sample = Functional(path) sample.verify(path_lookup, simple_dependency_resolver, extended_test_index) diff --git a/tests/unit/source_code/test_jobs.py b/tests/unit/source_code/test_jobs.py index 437649b8e6..e21af86cfa 100644 --- a/tests/unit/source_code/test_jobs.py +++ b/tests/unit/source_code/test_jobs.py @@ -229,11 +229,13 @@ def test_workflow_task_container_builds_dependency_graph_spark_python_task( assert registered_notebooks == [expected_path_instance] -def test_workflow_linter_lint_job_logs_problems(dependency_resolver, mock_path_lookup, empty_index, caplog): +def test_workflow_linter_lint_job_logs_problems( + dependency_resolver, mock_path_lookup, empty_index, mock_dfsa_crawler, caplog +): expected_message = "Found job problems:\nUNKNOWN:-1 [library-install-failed] 'pip --disable-pip-version-check install unknown-library" ws = create_autospec(WorkspaceClient) - linter = WorkflowLinter(ws, dependency_resolver, mock_path_lookup, empty_index) + linter = WorkflowLinter(ws, dependency_resolver, mock_path_lookup, empty_index, mock_dfsa_crawler) libraries = [compute.Library(pypi=compute.PythonPyPiLibrary(package="unknown-library-name"))] task = jobs.Task(task_key="test-task", libraries=libraries) From 2646accefed35acc2c23da2a222bc1ba3a1356dd Mon Sep 17 00:00:00 2001 From: Eric Vergnaud Date: Tue, 3 Sep 2024 18:27:30 +0200 Subject: [PATCH 26/80] fix failing tests --- tests/integration/source_code/test_jobs.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/tests/integration/source_code/test_jobs.py b/tests/integration/source_code/test_jobs.py index 2484d15a5c..e0f49d4a2b 100644 --- a/tests/integration/source_code/test_jobs.py +++ b/tests/integration/source_code/test_jobs.py @@ -122,10 +122,8 @@ def test_job_task_linter_library_installed_cluster( def test_job_linter_some_notebook_graph_with_problems(simple_ctx, ws, make_job, make_notebook, make_random, caplog): expected_messages = { - 'second_notebook:3 [dbfs-usage] Deprecated file system path: /mnt/something', - 'second_notebook:3 [implicit-dbfs-usage] The use of default dbfs: references is deprecated: /mnt/something', - 'some_file.py:0 [dbfs-usage] Deprecated file system path: /mnt/foo/bar', - 'some_file.py:0 [implicit-dbfs-usage] The use of default dbfs: references is deprecated: /mnt/foo/bar', + 'some_file.py:0 [direct-filesystem-access] The use of direct filesystem references is deprecated: /mnt/foo/bar', + 'second_notebook:3 [direct-filesystem-access] The use of direct filesystem references is deprecated: /mnt/something', } entrypoint = WorkspacePath(ws, f"~/linter-{make_random(4)}-{get_purge_suffix()}").expanduser() @@ -149,7 +147,8 @@ def test_job_linter_some_notebook_graph_with_problems(simple_ctx, ws, make_job, with caplog.at_level(logging.WARNING, logger="databricks.labs.ucx.source_code.jobs"): problems = simple_ctx.workflow_linter.lint_job(j.job_id) - messages = {replace(p, path=Path(p.path).relative_to(entrypoint)).as_message() for p in problems} + root = Path(entrypoint.as_posix()) + messages = {replace(p, path=Path(p.path).relative_to(root)).as_message() for p in problems} assert messages == expected_messages last_messages = caplog.messages[-1].split("\n") From 
a5416c01fd541f0accaf7581267828f964ef4cb6 Mon Sep 17 00:00:00 2001 From: Eric Vergnaud Date: Wed, 4 Sep 2024 11:39:36 +0200 Subject: [PATCH 27/80] finalize integration --- .../labs/ucx/contexts/application.py | 6 +++++ src/databricks/labs/ucx/source_code/jobs.py | 24 ++++++++++--------- .../labs/ucx/source_code/linters/dfsa.py | 2 +- 3 files changed, 20 insertions(+), 12 deletions(-) diff --git a/src/databricks/labs/ucx/contexts/application.py b/src/databricks/labs/ucx/contexts/application.py index 3d9e45aa61..c3b40e039b 100644 --- a/src/databricks/labs/ucx/contexts/application.py +++ b/src/databricks/labs/ucx/contexts/application.py @@ -15,6 +15,7 @@ from databricks.labs.ucx.recon.metadata_retriever import DatabricksTableMetadataRetriever from databricks.labs.ucx.recon.migration_recon import MigrationRecon from databricks.labs.ucx.recon.schema_comparator import StandardSchemaComparator +from databricks.labs.ucx.source_code.dfsa_crawler import DfsaCrawler from databricks.labs.ucx.source_code.python_libraries import PythonLibraryResolver from databricks.sdk import AccountClient, WorkspaceClient, core from databricks.sdk.errors import ResourceDoesNotExist @@ -425,9 +426,14 @@ def workflow_linter(self): self.dependency_resolver, self.path_lookup, MigrationIndex([]), # TODO: bring back self.tables_migrator.index() + self.dfsa_crawler, self.config.include_job_ids, ) + @cached_property + def dfsa_crawler(self): + return DfsaCrawler(self.sql_backend, self.inventory_database) + @cached_property def redash(self): return Redash( diff --git a/src/databricks/labs/ucx/source_code/jobs.py b/src/databricks/labs/ucx/source_code/jobs.py index b8c534adb6..acff9be8e7 100644 --- a/src/databricks/labs/ucx/source_code/jobs.py +++ b/src/databricks/labs/ucx/source_code/jobs.py @@ -16,7 +16,6 @@ from databricks.sdk import WorkspaceClient from databricks.sdk.errors import NotFound from databricks.sdk.service import compute, jobs -from databricks.sdk.service.workspace import Language from databricks.labs.ucx.assessment.crawlers import runtime_version_tuple from databricks.labs.ucx.hive_metastore.migration_status import MigrationIndex @@ -41,6 +40,7 @@ ) from databricks.labs.ucx.source_code.linters.context import LinterContext from databricks.labs.ucx.source_code.linters.dfsa import DfsaSqlLinter, DfsaPyLinter +from databricks.labs.ucx.source_code.notebooks.cells import CellLanguage from databricks.labs.ucx.source_code.python.python_ast import Tree from databricks.labs.ucx.source_code.notebooks.sources import FileLinter, Notebook from databricks.labs.ucx.source_code.path_lookup import PathLookup @@ -477,18 +477,19 @@ def _process_dependency( if not language: logger.warning(f"Unknown language for {dependency.path}") return + cell_language = CellLanguage.of_language(language) source = dependency.path.read_text(guess_encoding(dependency.path)) if is_a_notebook(dependency.path): - yield from self._collect_from_notebook(source, language, dependency.path, inherited_tree) + yield from self._collect_from_notebook(source, cell_language, dependency.path, inherited_tree) elif dependency.path.is_file(): - yield from self._collect_from_source(source, language, dependency.path, inherited_tree) + yield from self._collect_from_source(source, cell_language, dependency.path, inherited_tree) def _collect_from_notebook( - self, source: str, language: Language, path: Path, inherited_tree: Tree | None + self, source: str, language: CellLanguage, path: Path, inherited_tree: Tree | None ) -> Iterable[DFSA]: - notebook = Notebook.parse(path, 
source, language) + notebook = Notebook.parse(path, source, language.language) for cell in notebook.cells: - for dfsa in self._collect_from_source(cell.original_code, cell.language.language, path, inherited_tree): + for dfsa in self._collect_from_source(cell.original_code, cell.language, path, inherited_tree): yield DFSA( source_type="NOTEBOOK", source_id=str(path), @@ -496,19 +497,19 @@ def _collect_from_notebook( is_read=dfsa.is_read, is_write=dfsa.is_write, ) - if cell.language.language is Language.PYTHON: + if cell.language is CellLanguage.PYTHON: if inherited_tree is None: inherited_tree = Tree.new_module() tree = Tree.normalize_and_parse(cell.original_code) inherited_tree.append_tree(tree) def _collect_from_source( - self, source: str, language: Language, path: Path, inherited_tree: Tree | None + self, source: str, language: CellLanguage, path: Path, inherited_tree: Tree | None ) -> Iterable[DFSA]: iterable: Iterable[DFSA] | None = None - if language is Language.SQL: + if language is CellLanguage.SQL: iterable = self._collect_from_sql(source) - if language is Language.PYTHON: + if language is CellLanguage.PYTHON: iterable = self._collect_from_python(source, inherited_tree) if iterable is None: logger.warning(f"Language {language.name} not supported yet!") @@ -520,7 +521,8 @@ def _collect_from_source( def _collect_from_python(self, source: str, inherited_tree: Tree | None) -> Iterable[DFSA]: linter = DfsaPyLinter(self._session_state, prevent_spark_duplicates=False) - yield from linter.collect_dfsas(source, inherited_tree) + for dfsa_node in linter.collect_dfsas(source, inherited_tree): + yield dfsa_node.dfsa def _collect_from_sql(self, source: str) -> Iterable[DFSA]: linter = DfsaSqlLinter() diff --git a/src/databricks/labs/ucx/source_code/linters/dfsa.py b/src/databricks/labs/ucx/source_code/linters/dfsa.py index 6fd87e41c1..9bd7caa6ff 100644 --- a/src/databricks/labs/ucx/source_code/linters/dfsa.py +++ b/src/databricks/labs/ucx/source_code/linters/dfsa.py @@ -146,7 +146,7 @@ def lint_tree(self, tree: Tree) -> Iterable[Advice]: ) yield advisory - def collect_dfsas(self, python_code: str, inherited_tree: Tree | None) -> Iterable[DFSA]: + def collect_dfsas(self, python_code: str, inherited_tree: Tree | None) -> Iterable[DFSANode]: tree = Tree.new_module() if inherited_tree: tree.append_tree(inherited_tree) From bba91c9db3c1a8e31aa2f6ff19545f34cf6b2cb8 Mon Sep 17 00:00:00 2001 From: Eric Vergnaud Date: Wed, 4 Sep 2024 11:59:28 +0200 Subject: [PATCH 28/80] add logs --- src/databricks/labs/ucx/source_code/dfsa_crawler.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/src/databricks/labs/ucx/source_code/dfsa_crawler.py b/src/databricks/labs/ucx/source_code/dfsa_crawler.py index fa3ce36f8d..f2b4109b3b 100644 --- a/src/databricks/labs/ucx/source_code/dfsa_crawler.py +++ b/src/databricks/labs/ucx/source_code/dfsa_crawler.py @@ -1,9 +1,12 @@ +import logging from collections.abc import Sequence from databricks.labs.ucx.framework.crawlers import CrawlerBase from databricks.labs.ucx.source_code.base import DFSA from databricks.labs.lsql.backends import SqlBackend +from databricks.sdk.errors import DatabricksError +logger = logging.getLogger(__name__) class DfsaCrawler(CrawlerBase): @@ -18,4 +21,8 @@ def __init__(self, backend: SqlBackend, schema: str): super().__init__(backend, "hive_metastore", schema, "direct_file_system_access", DFSA) def append(self, dfsas: Sequence[DFSA]): - self._append_records(dfsas) + try: + self._append_records(dfsas) + except DatabricksError 
as e:
+            logger.error("Failed to store DFSAs", exc_info=e)
+

From 2da934e79cc1b08c3fc0aae95fd1385758b9d58a Mon Sep 17 00:00:00 2001
From: Eric Vergnaud
Date: Wed, 4 Sep 2024 12:18:05 +0200
Subject: [PATCH 29/80] enhance integration test for checking stored DFSAs

---
 src/databricks/labs/ucx/source_code/dfsa_crawler.py | 6 +++++-
 tests/integration/source_code/test_jobs.py | 3 +++
 2 files changed, 8 insertions(+), 1 deletion(-)

diff --git a/src/databricks/labs/ucx/source_code/dfsa_crawler.py b/src/databricks/labs/ucx/source_code/dfsa_crawler.py
index f2b4109b3b..e4abaa81c7 100644
--- a/src/databricks/labs/ucx/source_code/dfsa_crawler.py
+++ b/src/databricks/labs/ucx/source_code/dfsa_crawler.py
@@ -1,5 +1,5 @@
 import logging
-from collections.abc import Sequence
+from collections.abc import Sequence, Iterable
 from databricks.labs.ucx.framework.crawlers import CrawlerBase
 from databricks.labs.ucx.source_code.base import DFSA
 from databricks.labs.lsql.backends import SqlBackend
@@ -26,3 +26,7 @@ def append(self, dfsas: Sequence[DFSA]):
         except DatabricksError as e:
             logger.error("Failed to store DFSAs", exc_info=e)
 
+    def snapshot(self) -> Iterable[DFSA]:
+        sql = f"SELECT * FROM {self.full_name}"
+        yield from self._backend.fetch(sql)
+
diff --git a/tests/integration/source_code/test_jobs.py b/tests/integration/source_code/test_jobs.py
index e0f49d4a2b..6a4bd9d6ab 100644
--- a/tests/integration/source_code/test_jobs.py
+++ b/tests/integration/source_code/test_jobs.py
@@ -154,6 +154,9 @@ def test_job_linter_some_notebook_graph_with_problems(simple_ctx, ws, make_job,
     last_messages = caplog.messages[-1].split("\n")
     assert all(any(message.endswith(expected) for message in last_messages) for expected in expected_messages)
 
+    dfsas = simple_ctx.dfsa_crawler.snapshot()
+    assert len(list(dfsas)) == 2
+
 
 def test_workflow_linter_lints_job_with_import_pypi_library(
     simple_ctx,

From 5c444ffef0ac7323cb56de0f7e8d9ac86383d366 Mon Sep 17 00:00:00 2001
From: Eric Vergnaud
Date: Wed, 4 Sep 2024 16:24:36 +0200
Subject: [PATCH 30/80] move 'magic'-related stuff to dedicated file

---
 .../labs/ucx/source_code/notebooks/cells.py | 8 +-
 .../labs/ucx/source_code/notebooks/magic.py | 103 ++++++++++++++++++
 .../labs/ucx/source_code/notebooks/sources.py | 2 +-
 .../ucx/source_code/python/python_analyzer.py | 93 +---------------
 .../unit/source_code/notebooks/test_cells.py | 2 +-
 5 files changed, 111 insertions(+), 97 deletions(-)
 create mode 100644 src/databricks/labs/ucx/source_code/notebooks/magic.py

diff --git a/src/databricks/labs/ucx/source_code/notebooks/cells.py b/src/databricks/labs/ucx/source_code/notebooks/cells.py
index a2b994972a..0447d4c513 100644
--- a/src/databricks/labs/ucx/source_code/notebooks/cells.py
+++ b/src/databricks/labs/ucx/source_code/notebooks/cells.py
@@ -20,12 +20,8 @@
     DependencyGraphContext,
     InheritedContext,
 )
-from databricks.labs.ucx.source_code.python.python_analyzer import (
-    PythonCodeAnalyzer,
-    MagicCommand,
-    MagicNode,
-    magic_command_factory,
-)
+from databricks.labs.ucx.source_code.python.python_analyzer import PythonCodeAnalyzer
+from databricks.labs.ucx.source_code.notebooks.magic import MagicNode, MagicCommand, magic_command_factory
 
 # use a specific logger for sqlglot warnings so we can disable them selectively
 sqlglot_logger = logging.getLogger(f"{__name__}.sqlglot")
diff --git a/src/databricks/labs/ucx/source_code/notebooks/magic.py b/src/databricks/labs/ucx/source_code/notebooks/magic.py
new file mode 100644
index 0000000000..fd728faa8b
--- /dev/null
+++ b/src/databricks/labs/ucx/source_code/notebooks/magic.py
@@ -0,0 +1,103 @@
+from 
__future__ import annotations + +import logging +from abc import ABC, abstractmethod +from collections.abc import Callable +from pathlib import Path +from typing import TypeVar + +from astroid import NodeNG, Call, Name, Const # type: ignore + +from databricks.labs.ucx.source_code.graph import ( + DependencyGraph, + DependencyProblem, + DependencyGraphContext, + InheritedContext, +) +from databricks.labs.ucx.source_code.python.python_ast import NodeBase, Tree + + +logger = logging.getLogger(__name__) + +T = TypeVar("T") + + +class MagicLine(NodeBase): + + @classmethod + def extract_from_tree( + cls, tree: Tree, problem_factory: Callable[[str, str, NodeNG], T] + ) -> tuple[list[MagicLine], list[T]]: + problems: list[T] = [] + commands: list[MagicLine] = [] + try: + nodes = tree.locate(Call, [("magic_command", Name)]) + for command in cls._make_commands_for_magic_command_call_nodes(nodes): + commands.append(command) + except Exception as e: # pylint: disable=broad-except + logger.debug(f"Internal error while checking magic commands in tree: {tree.root}", exc_info=True) + problem = problem_factory('internal-error', f"While checking magic commands: {e}", tree.root) + problems.append(problem) + return commands, problems + + @classmethod + def _make_commands_for_magic_command_call_nodes(cls, nodes: list[Call]): + for node in nodes: + arg = node.args[0] + if isinstance(arg, Const): + yield MagicLine(node, arg.value) + + def __init__(self, node: NodeNG, command: bytes): + super().__init__(node) + self._command = command.decode() + + def as_magic(self) -> MagicCommand | None: + for factory in _FACTORIES: + command = factory(self._command, self.node) + if command is not None: + return command + return None + + def build_dependency_graph(self, parent: DependencyGraph) -> list[DependencyProblem]: + magic = self.as_magic() + if magic is not None: + return magic.build_dependency_graph(parent) + problem = DependencyProblem.from_node( + code='unsupported-magic-line', message=f"magic line '{self._command}' is not supported yet", node=self.node + ) + return [problem] + + def build_inherited_context(self, context: DependencyGraphContext, child_path: Path) -> InheritedContext: + magic = self.as_magic() + if magic is not None: + return magic.build_inherited_context(context, child_path) + return InheritedContext(None, False) + + +class MagicNode(NodeNG): + pass + + +class MagicCommand(ABC): + + def __init__(self, node: NodeNG, code: str): + self._node = node + self._code = code + + @abstractmethod + def build_dependency_graph(self, parent: DependencyGraph) -> list[DependencyProblem]: ... 
+ + def build_inherited_context(self, _context: DependencyGraphContext, _child_path: Path) -> InheritedContext: + return InheritedContext(None, False) + + +_FACTORIES: list[Callable[[str, NodeNG], MagicCommand | None]] = [] + + +def magic_command_factory(func: Callable[[str, NodeNG], MagicCommand | None]): + _FACTORIES.append(func) + + def inner(command: str, node: NodeNG) -> MagicCommand | None: + return func(command, node) + + return inner diff --git a/src/databricks/labs/ucx/source_code/notebooks/sources.py b/src/databricks/labs/ucx/source_code/notebooks/sources.py index ab7a51cf3c..c17f937cbe 100644 --- a/src/databricks/labs/ucx/source_code/notebooks/sources.py +++ b/src/databricks/labs/ucx/source_code/notebooks/sources.py @@ -35,7 +35,7 @@ SysPathChange, UnresolvedPath, ) -from databricks.labs.ucx.source_code.python.python_analyzer import MagicLine +from databricks.labs.ucx.source_code.notebooks.magic import MagicLine from databricks.labs.ucx.source_code.python.python_ast import Tree, NodeBase from databricks.labs.ucx.source_code.notebooks.cells import ( CellLanguage, diff --git a/src/databricks/labs/ucx/source_code/python/python_analyzer.py b/src/databricks/labs/ucx/source_code/python/python_analyzer.py index d2a2454b2f..f2c52114ae 100644 --- a/src/databricks/labs/ucx/source_code/python/python_analyzer.py +++ b/src/databricks/labs/ucx/source_code/python/python_analyzer.py @@ -1,18 +1,16 @@ from __future__ import annotations import logging -from abc import ABC, abstractmethod -from collections.abc import Iterable, Callable +from collections.abc import Iterable from pathlib import Path -from typing import cast, TypeVar +from typing import cast -from astroid import AstroidSyntaxError, Call, Const, ImportFrom, NodeNG, Try, Name # type: ignore +from astroid import AstroidSyntaxError, ImportFrom, Try, Name # type: ignore from databricks.labs.ucx.source_code.graph import ( DependencyGraphContext, DependencyProblem, InheritedContext, - DependencyGraph, ) from databricks.labs.ucx.source_code.linters.imports import ( SysPathChange, @@ -21,6 +19,7 @@ NotebookRunCall, UnresolvedPath, ) +from databricks.labs.ucx.source_code.notebooks.magic import MagicLine from databricks.labs.ucx.source_code.python.python_ast import Tree, NodeBase logger = logging.getLogger(__name__) @@ -194,87 +193,3 @@ def _mutate_path_lookup(self, change: SysPathChange) -> Iterable[DependencyProbl ) return change.apply_to(self._context.path_lookup) - - -T = TypeVar("T") - - -class MagicLine(NodeBase): - - @classmethod - def extract_from_tree( - cls, tree: Tree, problem_factory: Callable[[str, str, NodeNG], T] - ) -> tuple[list[MagicLine], list[T]]: - problems: list[T] = [] - commands: list[MagicLine] = [] - try: - nodes = tree.locate(Call, [("magic_command", Name)]) - for command in cls._make_commands_for_magic_command_call_nodes(nodes): - commands.append(command) - except Exception as e: # pylint: disable=broad-except - logger.debug(f"Internal error while checking magic commands in tree: {tree.root}", exc_info=True) - problem = problem_factory('internal-error', f"While checking magic commands: {e}", tree.root) - problems.append(problem) - return commands, problems - - @classmethod - def _make_commands_for_magic_command_call_nodes(cls, nodes: list[Call]): - for node in nodes: - arg = node.args[0] - if isinstance(arg, Const): - yield MagicLine(node, arg.value) - - def __init__(self, node: NodeNG, command: bytes): - super().__init__(node) - self._command = command.decode() - - def as_magic(self) -> MagicCommand | None: - for 
factory in _FACTORIES: - command = factory(self._command, self.node) - if command is not None: - return command - return None - - def build_dependency_graph(self, parent: DependencyGraph) -> list[DependencyProblem]: - magic = self.as_magic() - if magic is not None: - return magic.build_dependency_graph(parent) - problem = DependencyProblem.from_node( - code='unsupported-magic-line', message=f"magic line '{self._command}' is not supported yet", node=self.node - ) - return [problem] - - def build_inherited_context(self, context: DependencyGraphContext, child_path: Path) -> InheritedContext: - magic = self.as_magic() - if magic is not None: - return magic.build_inherited_context(context, child_path) - return InheritedContext(None, False) - - -class MagicNode(NodeNG): - pass - - -class MagicCommand(ABC): - - def __init__(self, node: NodeNG, code: str): - self._node = node - self._code = code - - @abstractmethod - def build_dependency_graph(self, parent: DependencyGraph) -> list[DependencyProblem]: ... - - def build_inherited_context(self, _context: DependencyGraphContext, _child_path: Path) -> InheritedContext: - return InheritedContext(None, False) - - -_FACTORIES: list[Callable[[str, NodeNG], MagicCommand | None]] = [] - - -def magic_command_factory(func: Callable[[str, NodeNG], MagicCommand | None]): - _FACTORIES.append(func) - - def inner(command: str, node: NodeNG) -> MagicCommand | None: - return func(command, node) - - return inner diff --git a/tests/unit/source_code/notebooks/test_cells.py b/tests/unit/source_code/notebooks/test_cells.py index 25f794f2eb..9d1988cd12 100644 --- a/tests/unit/source_code/notebooks/test_cells.py +++ b/tests/unit/source_code/notebooks/test_cells.py @@ -8,7 +8,7 @@ from databricks.labs.ucx.source_code.base import CurrentSessionState from databricks.labs.ucx.source_code.graph import Dependency, DependencyGraph, DependencyResolver, DependencyProblem from databricks.labs.ucx.source_code.linters.files import FileLoader, ImportFileResolver -from databricks.labs.ucx.source_code.python.python_analyzer import MagicLine +from databricks.labs.ucx.source_code.notebooks.magic import MagicLine from databricks.labs.ucx.source_code.python.python_ast import Tree from databricks.labs.ucx.source_code.notebooks.cells import ( CellLanguage, From 7f93d107623d122f61ef892b6eadb94d509ac96b Mon Sep 17 00:00:00 2001 From: Eric Vergnaud Date: Wed, 4 Sep 2024 16:44:56 +0200 Subject: [PATCH 31/80] formatting --- src/databricks/labs/ucx/mixins/fixtures.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/databricks/labs/ucx/mixins/fixtures.py b/src/databricks/labs/ucx/mixins/fixtures.py index 2e26293eef..ac06c8b6b7 100644 --- a/src/databricks/labs/ucx/mixins/fixtures.py +++ b/src/databricks/labs/ucx/mixins/fixtures.py @@ -1248,7 +1248,9 @@ def create() -> Wait[ServingEndpointDetailed]: endpoint_name, EndpointCoreConfigInput( served_models=[ - ServedModelInput(model.name, "1", ServedModelInputWorkloadSize.SMALL, scale_to_zero_enabled=True) + ServedModelInput( + model.name, "1", workload_size=ServedModelInputWorkloadSize.SMALL, scale_to_zero_enabled=True + ) ] ), ) From 7192e851a6f7c8225f931f21dd7fe83ea19e26c7 Mon Sep 17 00:00:00 2001 From: Eric Vergnaud Date: Wed, 4 Sep 2024 16:57:13 +0200 Subject: [PATCH 32/80] fix failing tests --- .../samples/functional/file-access/create_location.sql | 1 + .../source_code/samples/functional/file-access/select_format.sql | 1 + .../samples/functional/file-access/select_read_files.sql | 1 + 3 files changed, 3 insertions(+) diff 
--git a/tests/unit/source_code/samples/functional/file-access/create_location.sql b/tests/unit/source_code/samples/functional/file-access/create_location.sql index 4f90fd669d..2b6b4b3aeb 100644 --- a/tests/unit/source_code/samples/functional/file-access/create_location.sql +++ b/tests/unit/source_code/samples/functional/file-access/create_location.sql @@ -1,2 +1,3 @@ -- Databricks notebook source +-- ucx[direct-filesystem-access-in-sql-query:+0:0:+0:1024] The use of direct filesystem references is deprecated: s3a://db-gtm-industry-solutions/data/fsi/capm/sp_500/ CREATE TABLE hive_metastore.indices_historical_data.sp_500 LOCATION 's3a://db-gtm-industry-solutions/data/fsi/capm/sp_500/' diff --git a/tests/unit/source_code/samples/functional/file-access/select_format.sql b/tests/unit/source_code/samples/functional/file-access/select_format.sql index 76d91894f2..d64358a23d 100644 --- a/tests/unit/source_code/samples/functional/file-access/select_format.sql +++ b/tests/unit/source_code/samples/functional/file-access/select_format.sql @@ -1,2 +1,3 @@ -- Databricks notebook source +-- ucx[direct-filesystem-access-in-sql-query:+0:0:+0:1024] The use of direct filesystem references is deprecated: hdfs://examples/src/main/resources/users.parquet SELECT * FROM parquet.`hdfs://examples/src/main/resources/users.parquet` diff --git a/tests/unit/source_code/samples/functional/file-access/select_read_files.sql b/tests/unit/source_code/samples/functional/file-access/select_read_files.sql index e326eec5f5..cd2c86cbe1 100644 --- a/tests/unit/source_code/samples/functional/file-access/select_read_files.sql +++ b/tests/unit/source_code/samples/functional/file-access/select_read_files.sql @@ -1,2 +1,3 @@ -- Databricks notebook source +-- ucx[direct-filesystem-access-in-sql-query:+0:0:+0:1024] The use of direct filesystem references is deprecated: s3a://db-gtm-industry-solutions/data/fsi/capm/sp_500/file.csv SELECT * FROM read_files("s3a://db-gtm-industry-solutions/data/fsi/capm/sp_500/file.csv") LIMIT 10 From d4be072df9cf007cc2b4af37e93992dd497cf708 Mon Sep 17 00:00:00 2001 From: Eric Vergnaud Date: Wed, 4 Sep 2024 18:18:33 +0200 Subject: [PATCH 33/80] formatting --- src/databricks/labs/ucx/source_code/dfsa_crawler.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/databricks/labs/ucx/source_code/dfsa_crawler.py b/src/databricks/labs/ucx/source_code/dfsa_crawler.py index e4abaa81c7..c552628dbe 100644 --- a/src/databricks/labs/ucx/source_code/dfsa_crawler.py +++ b/src/databricks/labs/ucx/source_code/dfsa_crawler.py @@ -8,6 +8,7 @@ logger = logging.getLogger(__name__) + class DfsaCrawler(CrawlerBase): def __init__(self, backend: SqlBackend, schema: str): @@ -29,4 +30,3 @@ def append(self, dfsas: Sequence[DFSA]): def snapshot(self) -> Iterable[DFSA]: sql = f"SELECT * FROM {self.full_name}" yield from self._backend.fetch(sql) - From e552e3efd6fec09607ab2ac0e568c95f61337cb0 Mon Sep 17 00:00:00 2001 From: Eric Vergnaud Date: Thu, 5 Sep 2024 11:13:57 +0200 Subject: [PATCH 34/80] rename dfsa -> directfs --- .../labs/ucx/source_code/linters/context.py | 6 +- .../linters/{dfsa.py => directfs.py} | 58 +++++++++---------- .../labs/ucx/source_code/linters/pyspark.py | 4 +- .../{test_dfsa.py => test_directfs.py} | 18 +++--- 4 files changed, 43 insertions(+), 43 deletions(-) rename src/databricks/labs/ucx/source_code/linters/{dfsa.py => directfs.py} (80%) rename tests/unit/source_code/linters/{test_dfsa.py => test_directfs.py} (88%) diff --git a/src/databricks/labs/ucx/source_code/linters/context.py 
b/src/databricks/labs/ucx/source_code/linters/context.py index 1106b85612..9cec44b2eb 100644 --- a/src/databricks/labs/ucx/source_code/linters/context.py +++ b/src/databricks/labs/ucx/source_code/linters/context.py @@ -12,7 +12,7 @@ PythonLinter, SqlLinter, ) -from databricks.labs.ucx.source_code.linters.dfsa import DfsaPyLinter, DfsaSqlLinter +from databricks.labs.ucx.source_code.linters.directfs import DirectFsPyLinter, DirectFsSqlLinter from databricks.labs.ucx.source_code.linters.imports import DbutilsPyLinter from databricks.labs.ucx.source_code.linters.pyspark import SparkSqlPyLinter @@ -40,12 +40,12 @@ def __init__(self, index: MigrationIndex | None = None, session_state: CurrentSe python_fixers.append(SparkSqlPyLinter(from_table, index, session_state)) python_linters += [ - DfsaPyLinter(session_state), + DirectFsPyLinter(session_state), DBRv8d0PyLinter(dbr_version=session_state.dbr_version), SparkConnectPyLinter(session_state), DbutilsPyLinter(session_state), ] - sql_linters.append(DfsaSqlLinter()) + sql_linters.append(DirectFsSqlLinter()) self._linters: dict[Language, list[SqlLinter] | list[PythonLinter]] = { Language.PYTHON: python_linters, diff --git a/src/databricks/labs/ucx/source_code/linters/dfsa.py b/src/databricks/labs/ucx/source_code/linters/directfs.py similarity index 80% rename from src/databricks/labs/ucx/source_code/linters/dfsa.py rename to src/databricks/labs/ucx/source_code/linters/directfs.py index da6b343252..b44465ddd6 100644 --- a/src/databricks/labs/ucx/source_code/linters/dfsa.py +++ b/src/databricks/labs/ucx/source_code/linters/directfs.py @@ -20,7 +20,7 @@ logger = logging.getLogger(__name__) -class DFSAPattern(ABC): +class DirectFsPattern(ABC): def __init__(self, prefix: str, allowed_roots: list[str]): self._prefix = prefix @@ -33,43 +33,43 @@ def _matches_allowed_root(self, value: str): return any(value.startswith(f"{self._prefix}/{root}") for root in self._allowed_roots) -class RootPattern(DFSAPattern): +class RootPattern(DirectFsPattern): def _matches_allowed_root(self, value: str): return any(value.startswith(f"/{root}") for root in self._allowed_roots) # the below aims to implement https://docs.databricks.com/en/files/index.html -DFSA_PATTERNS = [ - DFSAPattern("dbfs:/", []), - DFSAPattern("file:/", ["Workspace/", "tmp/"]), - DFSAPattern("s3:/", []), - DFSAPattern("s3n:/", []), - DFSAPattern("s3a:/", []), - DFSAPattern("wasb:/", []), - DFSAPattern("wasbs:/", []), - DFSAPattern("abfs:/", []), - DFSAPattern("abfss:/", []), - DFSAPattern("hdfs:/", []), +DIRECT_FS_PATTERNS = [ + DirectFsPattern("dbfs:/", []), + DirectFsPattern("file:/", ["Workspace/", "tmp/"]), + DirectFsPattern("s3:/", []), + DirectFsPattern("s3n:/", []), + DirectFsPattern("s3a:/", []), + DirectFsPattern("wasb:/", []), + DirectFsPattern("wasbs:/", []), + DirectFsPattern("abfs:/", []), + DirectFsPattern("abfss:/", []), + DirectFsPattern("hdfs:/", []), # "/mnt/" is detected by the below pattern, RootPattern("/", ["Volumes/", "Workspace/", "tmp/"]), ] @dataclass -class DFSA: +class DirectFsAccess: """A DFSA is a record describing a Direct File System Access""" path: str @dataclass -class DFSANode: - dfsa: DFSA +class DirectFsNode: + dfsa: DirectFsAccess node: NodeNG -class _DetectDfsaVisitor(TreeVisitor): +class _DetectDirectFsVisitor(TreeVisitor): """ Visitor that detects file system paths in Python code and checks them against a list of known deprecated paths. 
@@ -77,7 +77,7 @@ class _DetectDfsaVisitor(TreeVisitor): def __init__(self, session_state: CurrentSessionState, allow_spark_duplicates: bool) -> None: self._session_state = session_state - self._dfsa_nodes: list[DFSANode] = [] + self._directfs_nodes: list[DirectFsNode] = [] self._reported_locations: set[tuple[int, int]] = set() self._allow_spark_duplicates = allow_spark_duplicates @@ -107,8 +107,8 @@ def _check_str_constant(self, source_node, inferred: InferredValue): if Tree(source_node).is_from_module("spark") and not self._allow_spark_duplicates: return value = inferred.as_string() - if any(pattern.matches(value) for pattern in DFSA_PATTERNS): - self._dfsa_nodes.append(DFSANode(DFSA(value), source_node)) + if any(pattern.matches(value) for pattern in DIRECT_FS_PATTERNS): + self._directfs_nodes.append(DirectFsNode(DirectFsAccess(value), source_node)) self._reported_locations.add((source_node.lineno, source_node.col_offset)) def _already_reported(self, source_node: NodeNG, inferred: InferredValue): @@ -116,11 +116,11 @@ def _already_reported(self, source_node: NodeNG, inferred: InferredValue): return any((node.lineno, node.col_offset) in self._reported_locations for node in all_nodes) @property - def dfsa_nodes(self): - return self._dfsa_nodes + def directfs_nodes(self): + return self._directfs_nodes -class DfsaPyLinter(PythonLinter): +class DirectFsPyLinter(PythonLinter): def __init__(self, session_state: CurrentSessionState, allow_spark_duplicates=False): self._session_state = session_state @@ -137,18 +137,18 @@ def lint_tree(self, tree: Tree) -> Iterable[Advice]: """ Lints the code looking for file system paths that are deprecated """ - visitor = _DetectDfsaVisitor(self._session_state, self._allow_spark_duplicates) + visitor = _DetectDirectFsVisitor(self._session_state, self._allow_spark_duplicates) visitor.visit(tree.node) - for dfsa_node in visitor.dfsa_nodes: + for directfs_node in visitor.directfs_nodes: advisory = Deprecation.from_node( code='direct-filesystem-access', - message=f"The use of direct filesystem references is deprecated: {dfsa_node.dfsa.path}", - node=dfsa_node.node, + message=f"The use of direct filesystem references is deprecated: {directfs_node.dfsa.path}", + node=directfs_node.node, ) yield advisory -class DfsaSqlLinter(SqlLinter): +class DirectFsSqlLinter(SqlLinter): @staticmethod def name() -> str: @@ -164,7 +164,7 @@ def _check_dfsa(self, table: Table) -> Iterable[Advice]: Check if the table is a DBFS table or reference in some way and yield a deprecation message if it is """ - if any(pattern.matches(table.name) for pattern in DFSA_PATTERNS): + if any(pattern.matches(table.name) for pattern in DIRECT_FS_PATTERNS): yield Deprecation( code='direct-filesystem-access-in-sql-query', message=f"The use of direct filesystem references is deprecated: {table.name}", diff --git a/src/databricks/labs/ucx/source_code/linters/pyspark.py b/src/databricks/labs/ucx/source_code/linters/pyspark.py index a537757add..4cbe11d506 100644 --- a/src/databricks/labs/ucx/source_code/linters/pyspark.py +++ b/src/databricks/labs/ucx/source_code/linters/pyspark.py @@ -13,7 +13,7 @@ CurrentSessionState, PythonLinter, ) -from databricks.labs.ucx.source_code.linters.dfsa import DFSA_PATTERNS +from databricks.labs.ucx.source_code.linters.directfs import DIRECT_FS_PATTERNS from databricks.labs.ucx.source_code.python.python_infer import InferredValue from databricks.labs.ucx.source_code.queries import FromTableSqlLinter from databricks.labs.ucx.source_code.python.python_ast import Tree, 
TreeHelper @@ -201,7 +201,7 @@ def lint( logger.debug(f"Could not infer value of {table_arg.as_string()}") continue value = inferred.as_string() - if any(pattern.matches(value) for pattern in DFSA_PATTERNS): + if any(pattern.matches(value) for pattern in DIRECT_FS_PATTERNS): yield Deprecation.from_node( code='direct-filesystem-access', message=f"The use of direct filesystem references is deprecated: {value}", diff --git a/tests/unit/source_code/linters/test_dfsa.py b/tests/unit/source_code/linters/test_directfs.py similarity index 88% rename from tests/unit/source_code/linters/test_dfsa.py rename to tests/unit/source_code/linters/test_directfs.py index 11cb5a1d08..44e38406e9 100644 --- a/tests/unit/source_code/linters/test_dfsa.py +++ b/tests/unit/source_code/linters/test_directfs.py @@ -1,7 +1,7 @@ import pytest from databricks.labs.ucx.source_code.base import Deprecation, Advice, CurrentSessionState, Failure -from databricks.labs.ucx.source_code.linters.dfsa import DfsaPyLinter, DfsaSqlLinter, DFSA_PATTERNS +from databricks.labs.ucx.source_code.linters.directfs import DIRECT_FS_PATTERNS, DirectFsPyLinter, DirectFsSqlLinter @pytest.mark.parametrize( @@ -17,7 +17,7 @@ ) def test_matches_dfsa_pattern(path, matches): """see https://github.com/databrickslabs/ucx/issues/2350""" - matched = any(pattern.matches(path) for pattern in DFSA_PATTERNS) + matched = any(pattern.matches(path) for pattern in DIRECT_FS_PATTERNS) assert matches == matched @@ -33,7 +33,7 @@ def test_matches_dfsa_pattern(path, matches): ], ) def test_detects_dfsa_paths(code, expected): - linter = DfsaPyLinter(CurrentSessionState(), allow_spark_duplicates=True) + linter = DirectFsPyLinter(CurrentSessionState(), allow_spark_duplicates=True) advices = list(linter.lint(code)) for advice in advices: assert isinstance(advice, Advice) @@ -63,7 +63,7 @@ def test_detects_dfsa_paths(code, expected): ], ) def test_dfsa_usage_linter(code, expected): - linter = DfsaPyLinter(CurrentSessionState(), allow_spark_duplicates=True) + linter = DirectFsPyLinter(CurrentSessionState(), allow_spark_duplicates=True) advices = linter.lint(code) count = 0 for advice in advices: @@ -73,7 +73,7 @@ def test_dfsa_usage_linter(code, expected): def test_dfsa_name(): - linter = DfsaPyLinter(CurrentSessionState()) + linter = DirectFsPyLinter(CurrentSessionState()) assert linter.name() == "dfsa-usage" @@ -87,7 +87,7 @@ def test_dfsa_name(): ], ) def test_non_dfsa_triggers_nothing(query): - ftf = DfsaSqlLinter() + ftf = DirectFsSqlLinter() assert not list(ftf.lint(query)) @@ -109,7 +109,7 @@ def test_non_dfsa_triggers_nothing(query): ], ) def test_dfsa_tables_trigger_messages_param(query: str, table: str): - ftf = DfsaSqlLinter() + ftf = DirectFsSqlLinter() actual = list(ftf.lint(query)) assert actual == [ Deprecation( @@ -130,7 +130,7 @@ def test_dfsa_tables_trigger_messages_param(query: str, table: str): ], ) def test_dfsa_queries_failure(query: str): - ftf = DfsaSqlLinter() + ftf = DirectFsSqlLinter() actual = list(ftf.lint(query)) assert actual == [ Failure( @@ -145,5 +145,5 @@ def test_dfsa_queries_failure(query: str): def test_dfsa_queries_name(): - ftf = DfsaSqlLinter() + ftf = DirectFsSqlLinter() assert ftf.name() == 'dfsa-query' From e30ccfa4a45fcafae2fce76d137eef7209472442 Mon Sep 17 00:00:00 2001 From: Eric Vergnaud Date: Thu, 5 Sep 2024 11:28:51 +0200 Subject: [PATCH 35/80] improve naming and drop /tmp/ exclusion --- .../labs/ucx/source_code/linters/context.py | 6 +- .../labs/ucx/source_code/linters/directfs.py | 57 ++++++++----------- 
.../labs/ucx/source_code/linters/pyspark.py | 4 +- .../unit/source_code/linters/test_directfs.py | 35 +++++------- .../unit/source_code/linters/test_pyspark.py | 15 ++++- 5 files changed, 54 insertions(+), 63 deletions(-) diff --git a/src/databricks/labs/ucx/source_code/linters/context.py b/src/databricks/labs/ucx/source_code/linters/context.py index 9cec44b2eb..7f9f589680 100644 --- a/src/databricks/labs/ucx/source_code/linters/context.py +++ b/src/databricks/labs/ucx/source_code/linters/context.py @@ -12,7 +12,7 @@ PythonLinter, SqlLinter, ) -from databricks.labs.ucx.source_code.linters.directfs import DirectFsPyLinter, DirectFsSqlLinter +from databricks.labs.ucx.source_code.linters.directfs import DirectFsAccessPyLinter, DirectFsAccessSqlLinter from databricks.labs.ucx.source_code.linters.imports import DbutilsPyLinter from databricks.labs.ucx.source_code.linters.pyspark import SparkSqlPyLinter @@ -40,12 +40,12 @@ def __init__(self, index: MigrationIndex | None = None, session_state: CurrentSe python_fixers.append(SparkSqlPyLinter(from_table, index, session_state)) python_linters += [ - DirectFsPyLinter(session_state), + DirectFsAccessPyLinter(session_state), DBRv8d0PyLinter(dbr_version=session_state.dbr_version), SparkConnectPyLinter(session_state), DbutilsPyLinter(session_state), ] - sql_linters.append(DirectFsSqlLinter()) + sql_linters.append(DirectFsAccessSqlLinter()) self._linters: dict[Language, list[SqlLinter] | list[PythonLinter]] = { Language.PYTHON: python_linters, diff --git a/src/databricks/labs/ucx/source_code/linters/directfs.py b/src/databricks/labs/ucx/source_code/linters/directfs.py index b44465ddd6..1ba4440b73 100644 --- a/src/databricks/labs/ucx/source_code/linters/directfs.py +++ b/src/databricks/labs/ucx/source_code/linters/directfs.py @@ -20,7 +20,7 @@ logger = logging.getLogger(__name__) -class DirectFsPattern(ABC): +class DirectFsAccessPattern(ABC): def __init__(self, prefix: str, allowed_roots: list[str]): self._prefix = prefix @@ -33,26 +33,26 @@ def _matches_allowed_root(self, value: str): return any(value.startswith(f"{self._prefix}/{root}") for root in self._allowed_roots) -class RootPattern(DirectFsPattern): +class RootPattern(DirectFsAccessPattern): def _matches_allowed_root(self, value: str): return any(value.startswith(f"/{root}") for root in self._allowed_roots) # the below aims to implement https://docs.databricks.com/en/files/index.html -DIRECT_FS_PATTERNS = [ - DirectFsPattern("dbfs:/", []), - DirectFsPattern("file:/", ["Workspace/", "tmp/"]), - DirectFsPattern("s3:/", []), - DirectFsPattern("s3n:/", []), - DirectFsPattern("s3a:/", []), - DirectFsPattern("wasb:/", []), - DirectFsPattern("wasbs:/", []), - DirectFsPattern("abfs:/", []), - DirectFsPattern("abfss:/", []), - DirectFsPattern("hdfs:/", []), +DIRECT_FS_ACCESS_PATTERNS = [ + DirectFsAccessPattern("dbfs:/", []), + DirectFsAccessPattern("file:/", ["Workspace/"]), + DirectFsAccessPattern("s3:/", []), + DirectFsAccessPattern("s3n:/", []), + DirectFsAccessPattern("s3a:/", []), + DirectFsAccessPattern("wasb:/", []), + DirectFsAccessPattern("wasbs:/", []), + DirectFsAccessPattern("abfs:/", []), + DirectFsAccessPattern("abfss:/", []), + DirectFsAccessPattern("hdfs:/", []), # "/mnt/" is detected by the below pattern, - RootPattern("/", ["Volumes/", "Workspace/", "tmp/"]), + RootPattern("/", ["Volumes/", "Workspace/"]), ] @@ -64,12 +64,12 @@ class DirectFsAccess: @dataclass -class DirectFsNode: +class DirectFsAccessNode: dfsa: DirectFsAccess node: NodeNG -class _DetectDirectFsVisitor(TreeVisitor): 
+class _DetectDirectFsAccessVisitor(TreeVisitor): """ Visitor that detects file system paths in Python code and checks them against a list of known deprecated paths. @@ -77,7 +77,7 @@ class _DetectDirectFsVisitor(TreeVisitor): def __init__(self, session_state: CurrentSessionState, allow_spark_duplicates: bool) -> None: self._session_state = session_state - self._directfs_nodes: list[DirectFsNode] = [] + self._directfs_nodes: list[DirectFsAccessNode] = [] self._reported_locations: set[tuple[int, int]] = set() self._allow_spark_duplicates = allow_spark_duplicates @@ -107,8 +107,8 @@ def _check_str_constant(self, source_node, inferred: InferredValue): if Tree(source_node).is_from_module("spark") and not self._allow_spark_duplicates: return value = inferred.as_string() - if any(pattern.matches(value) for pattern in DIRECT_FS_PATTERNS): - self._directfs_nodes.append(DirectFsNode(DirectFsAccess(value), source_node)) + if any(pattern.matches(value) for pattern in DIRECT_FS_ACCESS_PATTERNS): + self._directfs_nodes.append(DirectFsAccessNode(DirectFsAccess(value), source_node)) self._reported_locations.add((source_node.lineno, source_node.col_offset)) def _already_reported(self, source_node: NodeNG, inferred: InferredValue): @@ -120,24 +120,17 @@ def directfs_nodes(self): return self._directfs_nodes -class DirectFsPyLinter(PythonLinter): +class DirectFsAccessPyLinter(PythonLinter): def __init__(self, session_state: CurrentSessionState, allow_spark_duplicates=False): self._session_state = session_state self._allow_spark_duplicates = allow_spark_duplicates - @staticmethod - def name() -> str: - """ - Returns the name of the linter, for reporting etc - """ - return 'dfsa-usage' - def lint_tree(self, tree: Tree) -> Iterable[Advice]: """ Lints the code looking for file system paths that are deprecated """ - visitor = _DetectDirectFsVisitor(self._session_state, self._allow_spark_duplicates) + visitor = _DetectDirectFsAccessVisitor(self._session_state, self._allow_spark_duplicates) visitor.visit(tree.node) for directfs_node in visitor.directfs_nodes: advisory = Deprecation.from_node( @@ -148,11 +141,7 @@ def lint_tree(self, tree: Tree) -> Iterable[Advice]: yield advisory -class DirectFsSqlLinter(SqlLinter): - - @staticmethod - def name() -> str: - return 'dfsa-query' +class DirectFsAccessSqlLinter(SqlLinter): def lint_expression(self, expression: Expression): for table in expression.find_all(Table): @@ -164,7 +153,7 @@ def _check_dfsa(self, table: Table) -> Iterable[Advice]: Check if the table is a DBFS table or reference in some way and yield a deprecation message if it is """ - if any(pattern.matches(table.name) for pattern in DIRECT_FS_PATTERNS): + if any(pattern.matches(table.name) for pattern in DIRECT_FS_ACCESS_PATTERNS): yield Deprecation( code='direct-filesystem-access-in-sql-query', message=f"The use of direct filesystem references is deprecated: {table.name}", diff --git a/src/databricks/labs/ucx/source_code/linters/pyspark.py b/src/databricks/labs/ucx/source_code/linters/pyspark.py index 4cbe11d506..7de8cbfbb6 100644 --- a/src/databricks/labs/ucx/source_code/linters/pyspark.py +++ b/src/databricks/labs/ucx/source_code/linters/pyspark.py @@ -13,7 +13,7 @@ CurrentSessionState, PythonLinter, ) -from databricks.labs.ucx.source_code.linters.directfs import DIRECT_FS_PATTERNS +from databricks.labs.ucx.source_code.linters.directfs import DIRECT_FS_ACCESS_PATTERNS from databricks.labs.ucx.source_code.python.python_infer import InferredValue from databricks.labs.ucx.source_code.queries import 
FromTableSqlLinter from databricks.labs.ucx.source_code.python.python_ast import Tree, TreeHelper @@ -201,7 +201,7 @@ def lint( logger.debug(f"Could not infer value of {table_arg.as_string()}") continue value = inferred.as_string() - if any(pattern.matches(value) for pattern in DIRECT_FS_PATTERNS): + if any(pattern.matches(value) for pattern in DIRECT_FS_ACCESS_PATTERNS): yield Deprecation.from_node( code='direct-filesystem-access', message=f"The use of direct filesystem references is deprecated: {value}", diff --git a/tests/unit/source_code/linters/test_directfs.py b/tests/unit/source_code/linters/test_directfs.py index 44e38406e9..e14b87b242 100644 --- a/tests/unit/source_code/linters/test_directfs.py +++ b/tests/unit/source_code/linters/test_directfs.py @@ -1,7 +1,11 @@ import pytest from databricks.labs.ucx.source_code.base import Deprecation, Advice, CurrentSessionState, Failure -from databricks.labs.ucx.source_code.linters.directfs import DIRECT_FS_PATTERNS, DirectFsPyLinter, DirectFsSqlLinter +from databricks.labs.ucx.source_code.linters.directfs import ( + DIRECT_FS_ACCESS_PATTERNS, + DirectFsAccessPyLinter, + DirectFsAccessSqlLinter, +) @pytest.mark.parametrize( @@ -11,13 +15,13 @@ ("dbfs:/mnt/foo/bar", True), ("s3a://bucket1/folder1", True), ("/dbfs/mnt/foo/bar", True), - ("/tmp/foo", False), + ("/tmp/foo", True), ("table.we.know.nothing.about", False), ], ) def test_matches_dfsa_pattern(path, matches): """see https://github.com/databrickslabs/ucx/issues/2350""" - matched = any(pattern.matches(path) for pattern in DIRECT_FS_PATTERNS) + matched = any(pattern.matches(path) for pattern in DIRECT_FS_ACCESS_PATTERNS) assert matches == matched @@ -33,7 +37,7 @@ def test_matches_dfsa_pattern(path, matches): ], ) def test_detects_dfsa_paths(code, expected): - linter = DirectFsPyLinter(CurrentSessionState(), allow_spark_duplicates=True) + linter = DirectFsAccessPyLinter(CurrentSessionState(), allow_spark_duplicates=True) advices = list(linter.lint(code)) for advice in advices: assert isinstance(advice, Advice) @@ -63,7 +67,7 @@ def test_detects_dfsa_paths(code, expected): ], ) def test_dfsa_usage_linter(code, expected): - linter = DirectFsPyLinter(CurrentSessionState(), allow_spark_duplicates=True) + linter = DirectFsAccessPyLinter(CurrentSessionState(), allow_spark_duplicates=True) advices = linter.lint(code) count = 0 for advice in advices: @@ -72,11 +76,6 @@ def test_dfsa_usage_linter(code, expected): assert count == expected -def test_dfsa_name(): - linter = DirectFsPyLinter(CurrentSessionState()) - assert linter.name() == "dfsa-usage" - - @pytest.mark.parametrize( "query", [ @@ -87,7 +86,7 @@ def test_dfsa_name(): ], ) def test_non_dfsa_triggers_nothing(query): - ftf = DirectFsSqlLinter() + ftf = DirectFsAccessSqlLinter() assert not list(ftf.lint(query)) @@ -98,10 +97,7 @@ def test_non_dfsa_triggers_nothing(query): ("SELECT * FROM delta.`/mnt/...` WHERE foo > 6", "/mnt/..."), ("SELECT * FROM json.`/a/b/c` WHERE foo > 6", "/a/b/c"), ("DELETE FROM json.`/...` WHERE foo = 'bar'", "/..."), - ( - "MERGE INTO delta.`/dbfs/...` t USING source ON t.key = source.key WHEN MATCHED THEN DELETE", - "/dbfs/...", - ), + ("MERGE INTO delta.`/dbfs/...` t USING source ON t.key = source.key WHEN MATCHED THEN DELETE", "/dbfs/..."), ("SELECT * FROM json.`s3a://abc/d/e/f`", "s3a://abc/d/e/f"), ("SELECT * FROM delta.`s3a://abc/d/e/f` WHERE foo > 6", "s3a://abc/d/e/f"), ("SELECT * FROM delta.`s3a://foo/bar`", "s3a://foo/bar"), @@ -109,7 +105,7 @@ def test_non_dfsa_triggers_nothing(query): ], ) def 
test_dfsa_tables_trigger_messages_param(query: str, table: str): - ftf = DirectFsSqlLinter() + ftf = DirectFsAccessSqlLinter() actual = list(ftf.lint(query)) assert actual == [ Deprecation( @@ -130,7 +126,7 @@ def test_dfsa_tables_trigger_messages_param(query: str, table: str): ], ) def test_dfsa_queries_failure(query: str): - ftf = DirectFsSqlLinter() + ftf = DirectFsAccessSqlLinter() actual = list(ftf.lint(query)) assert actual == [ Failure( @@ -142,8 +138,3 @@ def test_dfsa_queries_failure(query: str): end_col=1024, ), ] - - -def test_dfsa_queries_name(): - ftf = DirectFsSqlLinter() - assert ftf.name() == 'dfsa-query' diff --git a/tests/unit/source_code/linters/test_pyspark.py b/tests/unit/source_code/linters/test_pyspark.py index 639763d549..bd50b9dc0f 100644 --- a/tests/unit/source_code/linters/test_pyspark.py +++ b/tests/unit/source_code/linters/test_pyspark.py @@ -559,12 +559,23 @@ def test_spark_cloud_direct_access(empty_index, code, expected): @pytest.mark.parametrize("fs_function", FS_FUNCTIONS) -def test_direct_cloud_access_to_tmp_reports_nothing(empty_index, fs_function): +def test_direct_cloud_access_to_workspace_reports_nothing(empty_index, fs_function): session_state = CurrentSessionState() ftf = FromTableSqlLinter(empty_index, session_state) sqf = SparkSqlPyLinter(ftf, empty_index, session_state) # ls function calls have to be from dbutils.fs, or we ignore them - code = f"""spark.{fs_function}("/tmp/bucket/path")""" + code = f"""spark.{fs_function}("/Workspace/bucket/path")""" + advisories = list(sqf.lint(code)) + assert not advisories + + +@pytest.mark.parametrize("fs_function", FS_FUNCTIONS) +def test_direct_cloud_access_to_volumes_reports_nothing(empty_index, fs_function): + session_state = CurrentSessionState() + ftf = FromTableSqlLinter(empty_index, session_state) + sqf = SparkSqlPyLinter(ftf, empty_index, session_state) + # ls function calls have to be from dbutils.fs, or we ignore them + code = f"""spark.{fs_function}("/Volumes/bucket/path")""" advisories = list(sqf.lint(code)) assert not advisories From 4c48951052069ca9b4718d4f7ec367ec44a97d0a Mon Sep 17 00:00:00 2001 From: Eric Vergnaud Date: Thu, 5 Sep 2024 11:43:06 +0200 Subject: [PATCH 36/80] Update docs --- README.md | 52 +++++++++++++++++----------------------------------- 1 file changed, 17 insertions(+), 35 deletions(-) diff --git a/README.md b/README.md index bb1292dc49..7045365640 100644 --- a/README.md +++ b/README.md @@ -64,12 +64,10 @@ See [contributing instructions](CONTRIBUTING.md) to help improve this project. * [`cannot-autofix-table-reference`](#cannot-autofix-table-reference) * [`catalog-api-in-shared-clusters`](#catalog-api-in-shared-clusters) * [`changed-result-format-in-uc`](#changed-result-format-in-uc) - * [`dbfs-read-from-sql-query`](#dbfs-read-from-sql-query) - * [`dbfs-usage`](#dbfs-usage) + * [`direct-filesystem-access`](#direct-filesystem-access) + * [`direct-filesystem-access-in-sql-query`](#direct-filesystem-access-in-sql-query) * [`default-format-changed-in-dbr8`](#default-format-changed-in-dbr8) * [`dependency-not-found`](#dependency-not-found) - * [`direct-filesystem-access`](#direct-filesystem-access) - * [`implicit-dbfs-usage`](#implicit-dbfs-usage) * [`jvm-access-in-shared-clusters`](#jvm-access-in-shared-clusters) * [`legacy-context-in-shared-clusters`](#legacy-context-in-shared-clusters) * [`not-supported`](#not-supported) @@ -766,24 +764,32 @@ you need to make sure that `do_stuff_with_table` can handle the new format. 
[[back to top](#databricks-labs-ucx)]
 
-#### `dbfs-read-from-sql-query`
+#### `direct-filesystem-access-in-sql-query`
 
-DBFS access is not allowed in Unity Catalog, so if you have code like this:
+Direct filesystem access is deprecated in Unity Catalog.
+DBFS is no longer supported, so if you have code like this:
 
 ```python
-df = spark.sql("SELECT * FROM parquet.`/mnt/foo/path/to/file`")
+df = spark.sql("SELECT * FROM parquet.`/mnt/foo/path/to/parquet.file`")
 ```
 
 you need to change it to use UC tables.
 
 [[back to top](#databricks-labs-ucx)]
 
-#### `dbfs-usage`
+#### `direct-filesystem-access`
 
-DBFS does not work in Unity Catalog, so if you have code like this:
+Direct filesystem access is deprecated in Unity Catalog.
+DBFS is no longer supported, so if you have code like this:
 
 ```python
-display(spark.read.csv('/mnt/things/e/f/g'))
+display(spark.read.csv('/mnt/things/data.csv'))
+```
+
+or this:
+
+```python
+display(spark.read.csv('s3:/bucket/folder/data.csv'))
 ```
 
 You need to change it to use UC tables or UC volumes.
@@ -798,31 +804,7 @@ means an error in the user code.
 
 [[back to top](#databricks-labs-ucx)]
 
-#### `direct-filesystem-access`
-
-It's not allowed to access the filesystem directly in Unity Catalog, so if you have code like this:
-
-```python
-spark.read.csv("s3://bucket/path")
-```
-
-you need to change it to use UC tables or UC volumes.
-
-[[back to top](#databricks-labs-ucx)]
-
-#### `implicit-dbfs-usage`
-
-The use of DBFS is not allowed in Unity Catalog, so if you have code like this:
-
-```python
-display(spark.read.csv('/mnt/things/e/f/g'))
-```
-
-you need to change it to use UC tables or UC volumes.
-
-[[back to top](#databricks-labs-ucx)]
-
-#### `jvm-access-in-shared-clusters`
+### `jvm-access-in-shared-clusters`
 
 You cannot access Spark Driver JVM on Unity Catalog clusters in Shared Access mode.
If you have code like this: From 6e07c9bb2290a324a0a4bec2b4013d549512cc80 Mon Sep 17 00:00:00 2001 From: Eric Vergnaud Date: Thu, 5 Sep 2024 11:51:34 +0200 Subject: [PATCH 37/80] move to functional test --- tests/unit/source_code/linters/test_directfs.py | 13 ++----------- .../samples/functional/file-access/direct-fs3.py | 8 ++++++++ 2 files changed, 10 insertions(+), 11 deletions(-) create mode 100644 tests/unit/source_code/samples/functional/file-access/direct-fs3.py diff --git a/tests/unit/source_code/linters/test_directfs.py b/tests/unit/source_code/linters/test_directfs.py index e14b87b242..931973a60b 100644 --- a/tests/unit/source_code/linters/test_directfs.py +++ b/tests/unit/source_code/linters/test_directfs.py @@ -55,18 +55,9 @@ def test_detects_dfsa_paths(code, expected): ('spark.read.parquet("dbfs:/mnt/foo/bar")', 1), ('spark.read.parquet("dbfs://mnt/foo/bar")', 1), ('DBFS="dbfs:/mnt/foo/bar"; spark.read.parquet(DBFS)', 1), - ( - """ -DBFS1="dbfs:/mnt/foo/bar1" -systems=[DBFS1, "dbfs:/mnt/foo/bar2"] -for system in systems: - spark.read.parquet(system) -""", - 2, - ), ], ) -def test_dfsa_usage_linter(code, expected): +def test_directfs_linter(code, expected): linter = DirectFsAccessPyLinter(CurrentSessionState(), allow_spark_duplicates=True) advices = linter.lint(code) count = 0 @@ -97,7 +88,7 @@ def test_non_dfsa_triggers_nothing(query): ("SELECT * FROM delta.`/mnt/...` WHERE foo > 6", "/mnt/..."), ("SELECT * FROM json.`/a/b/c` WHERE foo > 6", "/a/b/c"), ("DELETE FROM json.`/...` WHERE foo = 'bar'", "/..."), - ("MERGE INTO delta.`/dbfs/...` t USING source ON t.key = source.key WHEN MATCHED THEN DELETE", "/dbfs/..."), + ("MERGE INTO delta.`/dbfs/...` t USING src ON t.key = src.key WHEN MATCHED THEN DELETE", "/dbfs/..."), ("SELECT * FROM json.`s3a://abc/d/e/f`", "s3a://abc/d/e/f"), ("SELECT * FROM delta.`s3a://abc/d/e/f` WHERE foo > 6", "s3a://abc/d/e/f"), ("SELECT * FROM delta.`s3a://foo/bar`", "s3a://foo/bar"), diff --git a/tests/unit/source_code/samples/functional/file-access/direct-fs3.py b/tests/unit/source_code/samples/functional/file-access/direct-fs3.py new file mode 100644 index 0000000000..0db9d9a2f1 --- /dev/null +++ b/tests/unit/source_code/samples/functional/file-access/direct-fs3.py @@ -0,0 +1,8 @@ +# ucx[direct-filesystem-access:+1:6:+1:26] The use of direct filesystem references is deprecated: dbfs:/mnt/foo/bar1 +DBFS1="dbfs:/mnt/foo/bar1" +# ucx[direct-filesystem-access:+1:16:+1:36] The use of direct filesystem references is deprecated: dbfs:/mnt/foo/bar2 +systems=[DBFS1, "dbfs:/mnt/foo/bar2"] +for system in systems: + # ucx[direct-filesystem-access:+2:4:+2:30] The use of direct filesystem references is deprecated: dbfs:/mnt/foo/bar1 + # ucx[direct-filesystem-access:+1:4:+1:30] The use of direct filesystem references is deprecated: dbfs:/mnt/foo/bar2 + spark.read.parquet(system) From cd3b115534e533405c265cb04b3a3b64f7450497 Mon Sep 17 00:00:00 2001 From: Eric Vergnaud Date: Thu, 5 Sep 2024 11:56:12 +0200 Subject: [PATCH 38/80] update docs --- CONTRIBUTING.md | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index c165715eba..48b00fc086 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -286,12 +286,10 @@ $ python tests/integration/source_code/message_codes.py cannot-autofix-table-reference catalog-api-in-shared-clusters changed-result-format-in-uc -dbfs-read-from-sql-query -dbfs-usage +direct-filesystem-access +direct-filesystem-access-in-sql-query default-format-changed-in-dbr8 dependency-not-found 
-direct-filesystem-access -implicit-dbfs-usage jvm-access-in-shared-clusters legacy-context-in-shared-clusters not-supported From 8fea3eb0bf565ba75e87c9fba2493f93d98368d6 Mon Sep 17 00:00:00 2001 From: Eric Vergnaud Date: Thu, 5 Sep 2024 14:11:27 +0200 Subject: [PATCH 39/80] improve naming and comments --- .../labs/ucx/source_code/linters/directfs.py | 21 ++++++++++--------- .../unit/source_code/linters/test_directfs.py | 4 ++-- 2 files changed, 13 insertions(+), 12 deletions(-) diff --git a/src/databricks/labs/ucx/source_code/linters/directfs.py b/src/databricks/labs/ucx/source_code/linters/directfs.py index 1ba4440b73..2af6af8840 100644 --- a/src/databricks/labs/ucx/source_code/linters/directfs.py +++ b/src/databricks/labs/ucx/source_code/linters/directfs.py @@ -58,8 +58,6 @@ def _matches_allowed_root(self, value: str): @dataclass class DirectFsAccess: - """A DFSA is a record describing a Direct File System Access""" - path: str @@ -75,11 +73,11 @@ class _DetectDirectFsAccessVisitor(TreeVisitor): against a list of known deprecated paths. """ - def __init__(self, session_state: CurrentSessionState, allow_spark_duplicates: bool) -> None: + def __init__(self, session_state: CurrentSessionState, prevent_spark_duplicates: bool) -> None: self._session_state = session_state self._directfs_nodes: list[DirectFsAccessNode] = [] self._reported_locations: set[tuple[int, int]] = set() - self._allow_spark_duplicates = allow_spark_duplicates + self._prevent_spark_duplicates = prevent_spark_duplicates def visit_call(self, node: Call): for arg in node.args: @@ -104,11 +102,14 @@ def _check_str_constant(self, source_node, inferred: InferredValue): if self._already_reported(source_node, inferred): return # avoid duplicate advices that are reported by SparkSqlPyLinter - if Tree(source_node).is_from_module("spark") and not self._allow_spark_duplicates: + if self._prevent_spark_duplicates and Tree(source_node).is_from_module("spark"): return value = inferred.as_string() - if any(pattern.matches(value) for pattern in DIRECT_FS_ACCESS_PATTERNS): - self._directfs_nodes.append(DirectFsAccessNode(DirectFsAccess(value), source_node)) + for pattern in DIRECT_FS_ACCESS_PATTERNS: + if not pattern.matches(value): + continue + dfsa = DirectFsAccess(path=value) + self.directfs_nodes.append(DirectFsAccessNode(dfsa, source_node)) self._reported_locations.add((source_node.lineno, source_node.col_offset)) def _already_reported(self, source_node: NodeNG, inferred: InferredValue): @@ -122,15 +123,15 @@ def directfs_nodes(self): class DirectFsAccessPyLinter(PythonLinter): - def __init__(self, session_state: CurrentSessionState, allow_spark_duplicates=False): + def __init__(self, session_state: CurrentSessionState, prevent_spark_duplicates=True): self._session_state = session_state - self._allow_spark_duplicates = allow_spark_duplicates + self._prevent_spark_duplicates = prevent_spark_duplicates def lint_tree(self, tree: Tree) -> Iterable[Advice]: """ Lints the code looking for file system paths that are deprecated """ - visitor = _DetectDirectFsAccessVisitor(self._session_state, self._allow_spark_duplicates) + visitor = _DetectDirectFsAccessVisitor(self._session_state, self._prevent_spark_duplicates) visitor.visit(tree.node) for directfs_node in visitor.directfs_nodes: advisory = Deprecation.from_node( diff --git a/tests/unit/source_code/linters/test_directfs.py b/tests/unit/source_code/linters/test_directfs.py index 931973a60b..70b933ea00 100644 --- a/tests/unit/source_code/linters/test_directfs.py +++ 
b/tests/unit/source_code/linters/test_directfs.py @@ -37,7 +37,7 @@ def test_matches_dfsa_pattern(path, matches): ], ) def test_detects_dfsa_paths(code, expected): - linter = DirectFsAccessPyLinter(CurrentSessionState(), allow_spark_duplicates=True) + linter = DirectFsAccessPyLinter(CurrentSessionState(), prevent_spark_duplicates=False) advices = list(linter.lint(code)) for advice in advices: assert isinstance(advice, Advice) @@ -58,7 +58,7 @@ def test_detects_dfsa_paths(code, expected): ], ) def test_directfs_linter(code, expected): - linter = DirectFsAccessPyLinter(CurrentSessionState(), allow_spark_duplicates=True) + linter = DirectFsAccessPyLinter(CurrentSessionState(), prevent_spark_duplicates=False) advices = linter.lint(code) count = 0 for advice in advices: From 64de1e0eaa535ce7872c76285218595bf624abcc Mon Sep 17 00:00:00 2001 From: Eric Vergnaud Date: Thu, 5 Sep 2024 14:56:07 +0200 Subject: [PATCH 40/80] fix failing test --- tests/integration/source_code/test_jobs.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/integration/source_code/test_jobs.py b/tests/integration/source_code/test_jobs.py index e82384ea09..2c989753ef 100644 --- a/tests/integration/source_code/test_jobs.py +++ b/tests/integration/source_code/test_jobs.py @@ -154,7 +154,7 @@ def test_job_linter_some_notebook_graph_with_problems(simple_ctx, ws, make_job, last_messages = caplog.messages[-1].split("\n") assert all(any(message.endswith(expected) for message in last_messages) for expected in expected_messages) - dfsas = simple_ctx.dfsa_crawler.snapshot() + dfsas = simple_ctx.directfs_access_crawler.snapshot() assert len(list(dfsas)) == 2 From 6ce8f862381ae320bf4fcdc8d0f3f5029ad77252 Mon Sep 17 00:00:00 2001 From: Eric Vergnaud Date: Fri, 6 Sep 2024 10:29:25 +0200 Subject: [PATCH 41/80] Merge branch 'main' into store-dfsa-records # Conflicts: # README.md # src/databricks/labs/ucx/source_code/base.py # src/databricks/labs/ucx/source_code/jobs.py # src/databricks/labs/ucx/source_code/linters/directfs.py # src/databricks/labs/ucx/source_code/linters/pyspark.py # tests/unit/source_code/linters/test_directfs.py --- src/databricks/labs/ucx/source_code/linters/directfs.py | 5 ----- src/databricks/labs/ucx/source_code/linters/pyspark.py | 3 +-- tests/unit/source_code/linters/test_directfs.py | 1 - 3 files changed, 1 insertion(+), 8 deletions(-) diff --git a/src/databricks/labs/ucx/source_code/linters/directfs.py b/src/databricks/labs/ucx/source_code/linters/directfs.py index a553d69e5b..1043b8b6fc 100644 --- a/src/databricks/labs/ucx/source_code/linters/directfs.py +++ b/src/databricks/labs/ucx/source_code/linters/directfs.py @@ -57,11 +57,6 @@ def _matches_allowed_root(self, value: str): ] -@dataclass -class DirectFsAccess: - path: str - - @dataclass class DirectFsAccessNode: dfsa: DirectFsAccess diff --git a/src/databricks/labs/ucx/source_code/linters/pyspark.py b/src/databricks/labs/ucx/source_code/linters/pyspark.py index 8c30341afb..fd94ed513a 100644 --- a/src/databricks/labs/ucx/source_code/linters/pyspark.py +++ b/src/databricks/labs/ucx/source_code/linters/pyspark.py @@ -13,9 +13,8 @@ Fixer, CurrentSessionState, PythonLinter, - DirectFsAccess, ) -from databricks.labs.ucx.source_code.linters.directfs import DirectFsAccessNode, DIRECT_FS_ACCESS_PATTERNS +from databricks.labs.ucx.source_code.linters.directfs import DIRECT_FS_ACCESS_PATTERNS from databricks.labs.ucx.source_code.python.python_infer import InferredValue from databricks.labs.ucx.source_code.queries import FromTableSqlLinter from 
databricks.labs.ucx.source_code.python.python_ast import Tree, TreeHelper diff --git a/tests/unit/source_code/linters/test_directfs.py b/tests/unit/source_code/linters/test_directfs.py index 083592701e..70b933ea00 100644 --- a/tests/unit/source_code/linters/test_directfs.py +++ b/tests/unit/source_code/linters/test_directfs.py @@ -3,7 +3,6 @@ from databricks.labs.ucx.source_code.base import Deprecation, Advice, CurrentSessionState, Failure from databricks.labs.ucx.source_code.linters.directfs import ( DIRECT_FS_ACCESS_PATTERNS, - DirectFsAccessSqlLinter, DirectFsAccessPyLinter, DirectFsAccessSqlLinter, ) From 5b980df967483f95fe0c1443473c27378d65a79e Mon Sep 17 00:00:00 2001 From: Eric Vergnaud Date: Fri, 6 Sep 2024 12:53:33 +0200 Subject: [PATCH 42/80] store DFSAs for paths and queries in dedicated tables --- .../labs/ucx/contexts/application.py | 8 ++++---- .../source_code/directfs_access_crawler.py | 19 ++++++++++++++++--- src/databricks/labs/ucx/source_code/jobs.py | 8 ++++---- tests/unit/source_code/conftest.py | 6 +++--- .../test_directfs_access_crawler.py | 4 ++-- tests/unit/source_code/test_jobs.py | 4 ++-- 6 files changed, 31 insertions(+), 18 deletions(-) diff --git a/src/databricks/labs/ucx/contexts/application.py b/src/databricks/labs/ucx/contexts/application.py index aaf3beada0..373bbbc82d 100644 --- a/src/databricks/labs/ucx/contexts/application.py +++ b/src/databricks/labs/ucx/contexts/application.py @@ -15,7 +15,7 @@ from databricks.labs.ucx.recon.metadata_retriever import DatabricksTableMetadataRetriever from databricks.labs.ucx.recon.migration_recon import MigrationRecon from databricks.labs.ucx.recon.schema_comparator import StandardSchemaComparator -from databricks.labs.ucx.source_code.directfs_access_crawler import DirectFsAccessCrawler +from databricks.labs.ucx.source_code.directfs_access_crawler import DirectFsAccessCrawlers from databricks.labs.ucx.source_code.python_libraries import PythonLibraryResolver from databricks.sdk import AccountClient, WorkspaceClient, core from databricks.sdk.errors import ResourceDoesNotExist @@ -426,13 +426,13 @@ def workflow_linter(self): self.dependency_resolver, self.path_lookup, MigrationIndex([]), # TODO: bring back self.tables_migrator.index() - self.directfs_access_crawler, + self.directfs_access_crawlers, self.config.include_job_ids, ) @cached_property - def directfs_access_crawler(self): - return DirectFsAccessCrawler(self.sql_backend, self.inventory_database) + def directfs_access_crawlers(self): + return DirectFsAccessCrawlers(self.sql_backend, self.inventory_database) @cached_property def redash(self): diff --git a/src/databricks/labs/ucx/source_code/directfs_access_crawler.py b/src/databricks/labs/ucx/source_code/directfs_access_crawler.py index 63b99a2f75..8512e8da08 100644 --- a/src/databricks/labs/ucx/source_code/directfs_access_crawler.py +++ b/src/databricks/labs/ucx/source_code/directfs_access_crawler.py @@ -10,9 +10,9 @@ logger = logging.getLogger(__name__) -class DirectFsAccessCrawler(CrawlerBase): +class _DirectFsAccessCrawler(CrawlerBase): - def __init__(self, backend: SqlBackend, schema: str): + def __init__(self, backend: SqlBackend, schema: str, table: str): """ Initializes a DFSACrawler instance. @@ -20,7 +20,7 @@ def __init__(self, backend: SqlBackend, schema: str): backend (SqlBackend): The SQL Execution Backend abstraction (either REST API or Spark) schema: The schema name for the inventory persistence. 
""" - super().__init__(backend, "hive_metastore", schema, "direct_file_system_access", DirectFsAccess) + super().__init__(backend, "hive_metastore", schema, table, DirectFsAccess) def append(self, dfsas: Sequence[DirectFsAccess]): try: @@ -31,3 +31,16 @@ def append(self, dfsas: Sequence[DirectFsAccess]): def snapshot(self) -> Iterable[DirectFsAccess]: sql = f"SELECT * FROM {self.full_name}" yield from self._backend.fetch(sql) + + +class DirectFsAccessCrawlers: + + def __init__(self, backend: SqlBackend, schema: str): + self._backend = backend + self._schema = schema + + def for_paths(self) -> _DirectFsAccessCrawler: + return _DirectFsAccessCrawler(self._backend, self._schema, "direct_file_system_access_in_paths") + + def for_queries(self) -> _DirectFsAccessCrawler: + return _DirectFsAccessCrawler(self._backend, self._schema, "direct_file_system_access_in_queries") diff --git a/src/databricks/labs/ucx/source_code/jobs.py b/src/databricks/labs/ucx/source_code/jobs.py index 47633b7145..b4f080e662 100644 --- a/src/databricks/labs/ucx/source_code/jobs.py +++ b/src/databricks/labs/ucx/source_code/jobs.py @@ -28,7 +28,7 @@ guess_encoding, DirectFsAccess, ) -from databricks.labs.ucx.source_code.directfs_access_crawler import DirectFsAccessCrawler +from databricks.labs.ucx.source_code.directfs_access_crawler import DirectFsAccessCrawlers from databricks.labs.ucx.source_code.graph import ( Dependency, DependencyGraph, @@ -330,14 +330,14 @@ def __init__( resolver: DependencyResolver, path_lookup: PathLookup, migration_index: MigrationIndex, - directfs_crawler: DirectFsAccessCrawler, + directfs_crawlers: DirectFsAccessCrawlers, include_job_ids: list[int] | None = None, ): self._ws = ws self._resolver = resolver self._path_lookup = path_lookup self._migration_index = migration_index - self._directfs_crawler = directfs_crawler + self._directfs_crawlers = directfs_crawlers self._include_job_ids = include_job_ids def refresh_report(self, sql_backend: SqlBackend, inventory_database: str): @@ -426,7 +426,7 @@ def _lint_task(self, task: jobs.Task, job: jobs.Job, linted_paths: set[Path]) -> yield from walker collector = DfsaCollector(graph, set(), self._path_lookup, session_state) dfsas = list(dfsa for dfsa in collector) - self._directfs_crawler.append(dfsas) + self._directfs_crawlers.for_paths().append(dfsas) class LintingWalker(DependencyGraphWalker[LocatedAdvice]): diff --git a/tests/unit/source_code/conftest.py b/tests/unit/source_code/conftest.py index 46884c251c..a4942c24ce 100644 --- a/tests/unit/source_code/conftest.py +++ b/tests/unit/source_code/conftest.py @@ -6,7 +6,7 @@ MigrationStatus, ) from databricks.labs.ucx.hive_metastore.migration_status import MigrationIndex -from databricks.labs.ucx.source_code.directfs_access_crawler import DirectFsAccessCrawler +from databricks.labs.ucx.source_code.directfs_access_crawler import DirectFsAccessCrawlers from databricks.labs.ucx.source_code.graph import DependencyResolver from databricks.labs.ucx.source_code.known import KnownList from databricks.labs.ucx.source_code.linters.files import ImportFileResolver, FileLoader @@ -61,5 +61,5 @@ def simple_dependency_resolver(mock_path_lookup: PathLookup) -> DependencyResolv @pytest.fixture -def mock_dfsa_crawler() -> DirectFsAccessCrawler: - return DirectFsAccessCrawler(MockBackend(), "schema") +def mock_dfsa_crawlers() -> DirectFsAccessCrawlers: + return DirectFsAccessCrawlers(MockBackend(), "schema") diff --git a/tests/unit/source_code/test_directfs_access_crawler.py 
b/tests/unit/source_code/test_directfs_access_crawler.py index 1cc6da6be6..1964c13238 100644 --- a/tests/unit/source_code/test_directfs_access_crawler.py +++ b/tests/unit/source_code/test_directfs_access_crawler.py @@ -1,12 +1,12 @@ from databricks.labs.lsql.backends import MockBackend from databricks.labs.ucx.source_code.base import DirectFsAccess -from databricks.labs.ucx.source_code.directfs_access_crawler import DirectFsAccessCrawler +from databricks.labs.ucx.source_code.directfs_access_crawler import DirectFsAccessCrawlers def test_crawler_appends_dfsas(): backend = MockBackend() - crawler = DirectFsAccessCrawler(backend, "schema") + crawler = DirectFsAccessCrawlers(backend, "schema").for_paths() dfsas = list( DirectFsAccess(source_type="SOURCE", source_id="ID", path=path, is_read=False, is_write=False) for path in ("a", "b", "c") diff --git a/tests/unit/source_code/test_jobs.py b/tests/unit/source_code/test_jobs.py index e21af86cfa..7dff7a206b 100644 --- a/tests/unit/source_code/test_jobs.py +++ b/tests/unit/source_code/test_jobs.py @@ -230,12 +230,12 @@ def test_workflow_task_container_builds_dependency_graph_spark_python_task( def test_workflow_linter_lint_job_logs_problems( - dependency_resolver, mock_path_lookup, empty_index, mock_dfsa_crawler, caplog + dependency_resolver, mock_path_lookup, empty_index, mock_dfsa_crawlers, caplog ): expected_message = "Found job problems:\nUNKNOWN:-1 [library-install-failed] 'pip --disable-pip-version-check install unknown-library" ws = create_autospec(WorkspaceClient) - linter = WorkflowLinter(ws, dependency_resolver, mock_path_lookup, empty_index, mock_dfsa_crawler) + linter = WorkflowLinter(ws, dependency_resolver, mock_path_lookup, empty_index, mock_dfsa_crawlers) libraries = [compute.Library(pypi=compute.PythonPyPiLibrary(package="unknown-library-name"))] task = jobs.Task(task_key="test-task", libraries=libraries) From cdcc3e1e4faeac428dfa9531a92e411fa7dcb383 Mon Sep 17 00:00:00 2001 From: Eric Vergnaud Date: Fri, 6 Sep 2024 14:40:03 +0200 Subject: [PATCH 43/80] support lineage when walking dependency graph --- src/databricks/labs/ucx/source_code/graph.py | 3 +++ tests/unit/source_code/test_graph.py | 28 ++++++++++++++++++-- 2 files changed, 29 insertions(+), 2 deletions(-) diff --git a/src/databricks/labs/ucx/source_code/graph.py b/src/databricks/labs/ucx/source_code/graph.py index 4841fe904c..f2f5f9b9bc 100644 --- a/src/databricks/labs/ucx/source_code/graph.py +++ b/src/databricks/labs/ucx/source_code/graph.py @@ -593,6 +593,7 @@ def __init__(self, graph: DependencyGraph, walked_paths: set[Path], path_lookup: self._graph = graph self._walked_paths = walked_paths self._path_lookup = path_lookup + self._lineage: list[Dependency] = [] def __iter__(self) -> Iterator[T]: for dependency in self._graph.root_dependencies: @@ -604,6 +605,7 @@ def __iter__(self) -> Iterator[T]: def _iter_one(self, dependency: Dependency, graph: DependencyGraph, root_path: Path) -> Iterable[T]: if dependency.path in self._walked_paths: return + self._lineage.append(dependency) self._walked_paths.add(dependency.path) self._log_walk_one(dependency) if dependency.path.is_file() or is_a_notebook(dependency.path): @@ -616,6 +618,7 @@ def _iter_one(self, dependency: Dependency, graph: DependencyGraph, root_path: P child_graph = maybe_graph.graph for child_dependency in child_graph.local_dependencies: yield from self._iter_one(child_dependency, child_graph, root_path) + self._lineage.pop() def _log_walk_one(self, dependency: Dependency): logger.debug(f'Analyzing 
dependency: {dependency}') diff --git a/tests/unit/source_code/test_graph.py b/tests/unit/source_code/test_graph.py index 06bc24cdbb..a9f620b7bc 100644 --- a/tests/unit/source_code/test_graph.py +++ b/tests/unit/source_code/test_graph.py @@ -1,11 +1,15 @@ from pathlib import Path +from typing import Iterable import pytest from databricks.labs.ucx.source_code.base import CurrentSessionState from databricks.labs.ucx.source_code.linters.files import FileLoader, ImportFileResolver, FolderLoader -from databricks.labs.ucx.source_code.graph import Dependency, DependencyGraph, DependencyResolver, InheritedContext +from databricks.labs.ucx.source_code.graph import Dependency, DependencyGraph, DependencyResolver, InheritedContext, \ + DependencyGraphWalker, T from databricks.labs.ucx.source_code.notebooks.loaders import NotebookResolver, NotebookLoader +from databricks.labs.ucx.source_code.path_lookup import PathLookup +from databricks.labs.ucx.source_code.python.python_ast import Tree from databricks.labs.ucx.source_code.python_libraries import PythonLibraryResolver from databricks.labs.ucx.source_code.known import KnownList @@ -61,7 +65,6 @@ def build_inherited_context(self, root: Path, leaf: Path) -> InheritedContext: @pytest.fixture() def dependency_graph_factory(mock_path_lookup, simple_dependency_resolver): - def new_test_dependency_graph(dependency: Dependency) -> _TestDependencyGraph: return _TestDependencyGraph( dependency, None, simple_dependency_resolver, mock_path_lookup, CurrentSessionState() ) @@ -181,3 +184,24 @@ def test_graph_builds_inherited_context(mock_path_lookup, simple_dependency_reso assert inference_context.tree is not None assert inference_context.tree.has_global("some_table_name") assert not inference_context.tree.has_global("other_table_name") + + +def test_graph_walker_captures_lineage(mock_path_lookup, simple_dependency_resolver): + grand_parent = mock_path_lookup.cwd / "functional/grand_parent_that_magic_runs_parent_that_magic_runs_child.py" + child = mock_path_lookup.cwd / "functional/_child_that_uses_value_from_parent.py" + root_dependency = Dependency(NotebookLoader(), grand_parent) + root_graph = DependencyGraph(root_dependency, None, simple_dependency_resolver, mock_path_lookup, + CurrentSessionState()) + container = root_dependency.load(mock_path_lookup) + container.build_dependency_graph(root_graph) + + class _TestWalker(DependencyGraphWalker): + def _process_dependency(self, dependency: Dependency, path_lookup: PathLookup, inherited_tree: Tree | None) -> Iterable[None]: + if dependency.path.as_posix().endswith(grand_parent.as_posix()): + assert len(self._lineage) == 1 + if dependency.path.as_posix().endswith(child.as_posix()): + assert len(self._lineage) == 3 # there's a parent between grand_parent and child + return [] + + walker = _TestWalker(root_graph, set(), mock_path_lookup) + _ = list(_ for _ in walker) From 1515342918fd2095a29250814e48044c73b89773 Mon Sep 17 00:00:00 2001 From: Eric Vergnaud Date: Fri, 6 Sep 2024 16:14:40 +0200 Subject: [PATCH 44/80] store dfsa lineage --- src/databricks/labs/ucx/source_code/base.py | 34 +++++++++++-------- src/databricks/labs/ucx/source_code/graph.py | 14 ++++++-- src/databricks/labs/ucx/source_code/jobs.py | 20 ++++------- .../labs/ucx/source_code/linters/directfs.py | 2 ++ tests/integration/source_code/test_jobs.py | 10 ++++-- .../test_directfs_access_crawler.py | 4 ++- tests/unit/source_code/test_graph.py | 22 ++++++++---- 7 files changed, 65 insertions(+), 41 deletions(-) diff --git
a/src/databricks/labs/ucx/source_code/base.py b/src/databricks/labs/ucx/source_code/base.py index 19c1035d35..8ce91c24fc 100644 --- a/src/databricks/labs/ucx/source_code/base.py +++ b/src/databricks/labs/ucx/source_code/base.py @@ -16,7 +16,6 @@ from databricks.labs.blueprint.paths import WorkspacePath -from databricks.labs.ucx.framework.utils import escape_sql_identifier from databricks.labs.ucx.source_code.python.python_ast import Tree # Code mapping between LSP, PyLint, and our own diagnostics: @@ -340,26 +339,31 @@ def is_a_notebook(path: Path, content: str | None = None) -> bool: @dataclass class DirectFsAccess: - """A DFSA is a record describing a Direct File System Access""" + """A record describing a Direct File System Access""" UNKNOWN = "unknown" source_type: str source_id: str + source_lineage: str path: str is_read: bool is_write: bool - @property - def key(self) -> str: - return f"{self.source_type}.{self.source_id}.{self.path}".lower() # TODO for now - - @property - def safe_sql_key(self) -> str: - return escape_sql_identifier(self.key) - - def __hash__(self) -> int: - return hash(self.key) - - def __eq__(self, other) -> bool: - return isinstance(other, DirectFsAccess) and self.key == other.key + def replace( + self, + source_type: str | None = None, + source_id: str | None = None, + source_lineage: str | None = None, + path: str | None = None, + is_read: bool | None = None, + is_write: bool | None = None, + ): + return DirectFsAccess( + source_type=source_type or self.source_type, + source_id=source_id or self.source_id, + source_lineage=source_lineage or self.source_lineage, + path=path or self.path, + is_read=is_read or self.is_read, + is_write=is_write or self.is_write, + ) diff --git a/src/databricks/labs/ucx/source_code/graph.py b/src/databricks/labs/ucx/source_code/graph.py index f2f5f9b9bc..0f4804db67 100644 --- a/src/databricks/labs/ucx/source_code/graph.py +++ b/src/databricks/labs/ucx/source_code/graph.py @@ -289,12 +289,13 @@ class DependencyGraphContext: session_state: CurrentSessionState -class Dependency(abc.ABC): +class Dependency: - def __init__(self, loader: DependencyLoader, path: Path, inherits_context=True): + def __init__(self, loader: DependencyLoader, path: Path, inherits_context=True, lineage_str: str | None = None): self._loader = loader self._path = path self._inherits_context = inherits_context + self._lineage_str = lineage_str or '"' + self._path.as_posix() + '"' @property def path(self) -> Path: @@ -316,6 +317,10 @@ def load(self, path_lookup: PathLookup) -> SourceContainer | None: def __repr__(self): return f"Dependency<{self.path}>" + @property + def lineage_str(self): + return self._lineage_str + class SourceContainer(abc.ABC): @@ -627,3 +632,8 @@ def _log_walk_one(self, dependency: Dependency): def _process_dependency( self, dependency: Dependency, path_lookup: PathLookup, inherited_tree: Tree | None ) -> Iterable[T]: ... 
+ + @property + def lineage_str(self): + parts = [dependency.lineage_str for dependency in self._lineage] + return "->".join(parts) diff --git a/src/databricks/labs/ucx/source_code/jobs.py b/src/databricks/labs/ucx/source_code/jobs.py index b4f080e662..00e1e1cdf5 100644 --- a/src/databricks/labs/ucx/source_code/jobs.py +++ b/src/databricks/labs/ucx/source_code/jobs.py @@ -68,8 +68,10 @@ def as_message(self) -> str: class WorkflowTask(Dependency): def __init__(self, ws: WorkspaceClient, task: jobs.Task, job: jobs.Job): + # concat job and task for lineage, see DependencyGraphWalker.lineage_str + lineage_str = f'"job:{job.job_id}"->"task:{task.task_key}"' loader = WrappingLoader(WorkflowTaskContainer(ws, task, job)) - super().__init__(loader, Path(f'/jobs/{task.task_key}'), False) + super().__init__(loader, Path(f'/jobs/{task.task_key}'), inherits_context=False, lineage_str=lineage_str) self._task = task self._job = job @@ -424,7 +426,7 @@ def _lint_task(self, task: jobs.Task, job: jobs.Job, linted_paths: set[Path]) -> graph, linted_paths, self._path_lookup, task.task_key, session_state, self._migration_index ) yield from walker - collector = DfsaCollector(graph, set(), self._path_lookup, session_state) + collector = DfsaCollectorWalker(graph, set(), self._path_lookup, session_state) dfsas = list(dfsa for dfsa in collector) self._directfs_crawlers.for_paths().append(dfsas) @@ -458,7 +460,7 @@ def _process_dependency( yield LocatedAdvice(advice, dependency.path) -class DfsaCollector(DependencyGraphWalker[DirectFsAccess]): +class DfsaCollectorWalker(DependencyGraphWalker[DirectFsAccess]): def __init__( self, @@ -490,13 +492,7 @@ def _collect_from_notebook( notebook = Notebook.parse(path, source, language.language) for cell in notebook.cells: for dfsa in self._collect_from_source(cell.original_code, cell.language, path, inherited_tree): - yield DirectFsAccess( - source_type="NOTEBOOK", - source_id=str(path), - path=dfsa.path, - is_read=dfsa.is_read, - is_write=dfsa.is_write, - ) + yield dfsa.replace(source_type="NOTEBOOK", source_id=str(path), source_lineage=self.lineage_str) if cell.language is CellLanguage.PYTHON: if inherited_tree is None: inherited_tree = Tree.new_module() @@ -515,9 +511,7 @@ def _collect_from_source( logger.warning(f"Language {language.name} not supported yet!") return for dfsa in iterable: - yield DirectFsAccess( - source_type="FILE", source_id=str(path), path=dfsa.path, is_read=dfsa.is_read, is_write=dfsa.is_write - ) + yield dfsa.replace(source_type="FILE", source_id=str(path), source_lineage=self.lineage_str) def _collect_from_python(self, source: str, inherited_tree: Tree | None) -> Iterable[DirectFsAccess]: linter = DirectFsAccessPyLinter(self._session_state, prevent_spark_duplicates=False) diff --git a/src/databricks/labs/ucx/source_code/linters/directfs.py b/src/databricks/labs/ucx/source_code/linters/directfs.py index 1043b8b6fc..51f409624c 100644 --- a/src/databricks/labs/ucx/source_code/linters/directfs.py +++ b/src/databricks/labs/ucx/source_code/linters/directfs.py @@ -109,6 +109,7 @@ def _check_str_constant(self, source_node, inferred: InferredValue): dfsa = DirectFsAccess( source_type=DirectFsAccess.UNKNOWN, source_id=DirectFsAccess.UNKNOWN, + source_lineage=DirectFsAccess.UNKNOWN, path=value, is_read=True, is_write=False, @@ -206,6 +207,7 @@ def _collect_dfsa_from_node(cls, expression: SqlExpression, path: str) -> Iterab yield DirectFsAccess( source_type=DirectFsAccess.UNKNOWN, source_id=DirectFsAccess.UNKNOWN, + source_lineage=DirectFsAccess.UNKNOWN, 
path=path, is_read=is_read, is_write=is_write, diff --git a/tests/integration/source_code/test_jobs.py b/tests/integration/source_code/test_jobs.py index 2c989753ef..b9470c4a59 100644 --- a/tests/integration/source_code/test_jobs.py +++ b/tests/integration/source_code/test_jobs.py @@ -20,7 +20,7 @@ from databricks.labs.ucx.hive_metastore.migration_status import MigrationIndex from databricks.labs.ucx.mixins.fixtures import get_purge_suffix, factory -from databricks.labs.ucx.source_code.base import CurrentSessionState +from databricks.labs.ucx.source_code.base import CurrentSessionState, DirectFsAccess from databricks.labs.ucx.source_code.graph import Dependency from databricks.labs.ucx.source_code.known import UNKNOWN, KnownList from databricks.labs.ucx.source_code.linters.files import LocalCodeLinter, FileLoader, FolderLoader @@ -154,8 +154,12 @@ def test_job_linter_some_notebook_graph_with_problems(simple_ctx, ws, make_job, last_messages = caplog.messages[-1].split("\n") assert all(any(message.endswith(expected) for message in last_messages) for expected in expected_messages) - dfsas = simple_ctx.directfs_access_crawler.snapshot() - assert len(list(dfsas)) == 2 + dfsas = list(simple_ctx.directfs_access_crawler.snapshot()) + assert len(dfsas) == 2 + for dfsa in dfsas: + assert dfsa.source_type != DirectFsAccess.UNKNOWN + assert dfsa.source_id != DirectFsAccess.UNKNOWN + assert dfsa.source_lineage != DirectFsAccess.UNKNOWN def test_workflow_linter_lints_job_with_import_pypi_library( diff --git a/tests/unit/source_code/test_directfs_access_crawler.py b/tests/unit/source_code/test_directfs_access_crawler.py index 1964c13238..2a9459c511 100644 --- a/tests/unit/source_code/test_directfs_access_crawler.py +++ b/tests/unit/source_code/test_directfs_access_crawler.py @@ -8,7 +8,9 @@ def test_crawler_appends_dfsas(): backend = MockBackend() crawler = DirectFsAccessCrawlers(backend, "schema").for_paths() dfsas = list( - DirectFsAccess(source_type="SOURCE", source_id="ID", path=path, is_read=False, is_write=False) + DirectFsAccess( + source_type="SOURCE", source_id="ID", source_lineage="LINEAGE", path=path, is_read=False, is_write=False + ) for path in ("a", "b", "c") ) crawler.append(dfsas) diff --git a/tests/unit/source_code/test_graph.py b/tests/unit/source_code/test_graph.py index a9f620b7bc..0dd78b4d8e 100644 --- a/tests/unit/source_code/test_graph.py +++ b/tests/unit/source_code/test_graph.py @@ -1,12 +1,17 @@ +from collections.abc import Iterable from pathlib import Path -from typing import Iterable import pytest from databricks.labs.ucx.source_code.base import CurrentSessionState from databricks.labs.ucx.source_code.linters.files import FileLoader, ImportFileResolver, FolderLoader -from databricks.labs.ucx.source_code.graph import Dependency, DependencyGraph, DependencyResolver, InheritedContext, \ - DependencyGraphWalker, T +from databricks.labs.ucx.source_code.graph import ( + Dependency, + DependencyGraph, + DependencyResolver, + InheritedContext, + DependencyGraphWalker, +) from databricks.labs.ucx.source_code.notebooks.loaders import NotebookResolver, NotebookLoader from databricks.labs.ucx.source_code.path_lookup import PathLookup from databricks.labs.ucx.source_code.python.python_ast import Tree @@ -190,17 +195,20 @@ def test_graph_walker_captures_lineage(mock_path_lookup, simple_dependency_resol grand_parent = mock_path_lookup.cwd / "functional/grand_parent_that_magic_runs_parent_that_magic_runs_child.py" child = mock_path_lookup.cwd / 
"functional/_child_that_uses_value_from_parent.py" root_dependency = Dependency(NotebookLoader(), grand_parent) - root_graph = DependencyGraph(root_dependency, None, simple_dependency_resolver, mock_path_lookup, - CurrentSessionState()) + root_graph = DependencyGraph( + root_dependency, None, simple_dependency_resolver, mock_path_lookup, CurrentSessionState() + ) container = root_dependency.load(mock_path_lookup) container.build_dependency_graph(root_graph) class _TestWalker(DependencyGraphWalker): - def _process_dependency(self, dependency: Dependency, path_lookup: PathLookup, inherited_tree: Tree | None) -> Iterable[None]: + def _process_dependency( + self, dependency: Dependency, path_lookup: PathLookup, inherited_tree: Tree | None + ) -> Iterable[None]: if dependency.path.as_posix().endswith(grand_parent.as_posix()): assert len(self._lineage) == 1 if dependency.path.as_posix().endswith(child.as_posix()): - assert len(self._lineage) == 3 # there's a parent between grand_parent and child + assert len(self._lineage) == 3 # there's a parent between grand_parent and child return [] walker = _TestWalker(root_graph, set(), mock_path_lookup) From 1c88c97304bcff06852a319468f9c0bae2af8a99 Mon Sep 17 00:00:00 2001 From: Eric Vergnaud Date: Fri, 6 Sep 2024 16:28:12 +0200 Subject: [PATCH 45/80] fix merge issues --- .../labs/ucx/source_code/directfs_access_crawler.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/src/databricks/labs/ucx/source_code/directfs_access_crawler.py b/src/databricks/labs/ucx/source_code/directfs_access_crawler.py index 8512e8da08..09f442cd04 100644 --- a/src/databricks/labs/ucx/source_code/directfs_access_crawler.py +++ b/src/databricks/labs/ucx/source_code/directfs_access_crawler.py @@ -1,7 +1,7 @@ import logging from collections.abc import Sequence, Iterable -from databricks.labs.ucx.framework.crawlers import CrawlerBase +from databricks.labs.ucx.framework.crawlers import CrawlerBase, Result from databricks.labs.lsql.backends import SqlBackend from databricks.sdk.errors import DatabricksError @@ -28,10 +28,13 @@ def append(self, dfsas: Sequence[DirectFsAccess]): except DatabricksError as e: logger.error("Failed to store DFSAs", exc_info=e) - def snapshot(self) -> Iterable[DirectFsAccess]: + def _try_fetch(self) -> Iterable[DirectFsAccess]: sql = f"SELECT * FROM {self.full_name}" yield from self._backend.fetch(sql) + def _crawl(self) -> Iterable[Result]: + return [] + class DirectFsAccessCrawlers: From ef261a6cd7101447d773e6eacf8b766493d198aa Mon Sep 17 00:00:00 2001 From: Eric Vergnaud Date: Fri, 6 Sep 2024 16:49:03 +0200 Subject: [PATCH 46/80] capture and store source_timestamp --- src/databricks/labs/ucx/source_code/base.py | 3 +++ src/databricks/labs/ucx/source_code/jobs.py | 14 ++++++++++++-- .../labs/ucx/source_code/linters/directfs.py | 2 ++ tests/integration/source_code/test_jobs.py | 3 ++- .../source_code/test_directfs_access_crawler.py | 8 +++++++- 5 files changed, 26 insertions(+), 4 deletions(-) diff --git a/src/databricks/labs/ucx/source_code/base.py b/src/databricks/labs/ucx/source_code/base.py index 8ce91c24fc..b2b369d6fa 100644 --- a/src/databricks/labs/ucx/source_code/base.py +++ b/src/databricks/labs/ucx/source_code/base.py @@ -346,6 +346,7 @@ class DirectFsAccess: source_type: str source_id: str source_lineage: str + source_timestamp: int path: str is_read: bool is_write: bool @@ -355,6 +356,7 @@ def replace( source_type: str | None = None, source_id: str | None = None, source_lineage: str | None = None, + source_timestamp: int 
| None = None, path: str | None = None, is_read: bool | None = None, is_write: bool | None = None, @@ -363,6 +365,7 @@ def replace( source_type=source_type or self.source_type, source_id=source_id or self.source_id, source_lineage=source_lineage or self.source_lineage, + source_timestamp=source_timestamp or self.source_timestamp, path=path or self.path, is_read=is_read or self.is_read, is_write=is_write or self.is_write, diff --git a/src/databricks/labs/ucx/source_code/jobs.py b/src/databricks/labs/ucx/source_code/jobs.py index 00e1e1cdf5..6d74f9380b 100644 --- a/src/databricks/labs/ucx/source_code/jobs.py +++ b/src/databricks/labs/ucx/source_code/jobs.py @@ -491,8 +491,13 @@ def _collect_from_notebook( ) -> Iterable[DirectFsAccess]: notebook = Notebook.parse(path, source, language.language) for cell in notebook.cells: + src_timestamp = int(path.stat().st_mtime) + src_id = str(path) + src_lineage = self.lineage_str for dfsa in self._collect_from_source(cell.original_code, cell.language, path, inherited_tree): - yield dfsa.replace(source_type="NOTEBOOK", source_id=str(path), source_lineage=self.lineage_str) + yield dfsa.replace( + source_type="NOTEBOOK", source_id=src_id, source_lineage=src_lineage, source_timestamp=src_timestamp + ) if cell.language is CellLanguage.PYTHON: if inherited_tree is None: inherited_tree = Tree.new_module() @@ -510,8 +515,13 @@ def _collect_from_source( if iterable is None: logger.warning(f"Language {language.name} not supported yet!") return + src_id = str(path) + src_lineage = self.lineage_str + src_timestamp = int(path.stat().st_mtime) for dfsa in iterable: - yield dfsa.replace(source_type="FILE", source_id=str(path), source_lineage=self.lineage_str) + yield dfsa.replace( + source_type="FILE", source_id=src_id, source_lineage=src_lineage, source_timestamp=src_timestamp + ) def _collect_from_python(self, source: str, inherited_tree: Tree | None) -> Iterable[DirectFsAccess]: linter = DirectFsAccessPyLinter(self._session_state, prevent_spark_duplicates=False) diff --git a/src/databricks/labs/ucx/source_code/linters/directfs.py b/src/databricks/labs/ucx/source_code/linters/directfs.py index 51f409624c..cf8a347a19 100644 --- a/src/databricks/labs/ucx/source_code/linters/directfs.py +++ b/src/databricks/labs/ucx/source_code/linters/directfs.py @@ -110,6 +110,7 @@ def _check_str_constant(self, source_node, inferred: InferredValue): source_type=DirectFsAccess.UNKNOWN, source_id=DirectFsAccess.UNKNOWN, source_lineage=DirectFsAccess.UNKNOWN, + source_timestamp=-1, path=value, is_read=True, is_write=False, @@ -208,6 +209,7 @@ def _collect_dfsa_from_node(cls, expression: SqlExpression, path: str) -> Iterab source_type=DirectFsAccess.UNKNOWN, source_id=DirectFsAccess.UNKNOWN, source_lineage=DirectFsAccess.UNKNOWN, + source_timestamp=-1, path=path, is_read=is_read, is_write=is_write, diff --git a/tests/integration/source_code/test_jobs.py b/tests/integration/source_code/test_jobs.py index b9470c4a59..248e452acc 100644 --- a/tests/integration/source_code/test_jobs.py +++ b/tests/integration/source_code/test_jobs.py @@ -154,12 +154,13 @@ def test_job_linter_some_notebook_graph_with_problems(simple_ctx, ws, make_job, last_messages = caplog.messages[-1].split("\n") assert all(any(message.endswith(expected) for message in last_messages) for expected in expected_messages) - dfsas = list(simple_ctx.directfs_access_crawler.snapshot()) + dfsas = list(simple_ctx.directfs_access_crawlers.for_paths().snapshot()) assert len(dfsas) == 2 for dfsa in dfsas: assert dfsa.source_type != 
DirectFsAccess.UNKNOWN assert dfsa.source_id != DirectFsAccess.UNKNOWN assert dfsa.source_lineage != DirectFsAccess.UNKNOWN + assert dfsa.source_timestamp != -1 def test_workflow_linter_lints_job_with_import_pypi_library( diff --git a/tests/unit/source_code/test_directfs_access_crawler.py b/tests/unit/source_code/test_directfs_access_crawler.py index 2a9459c511..e34efad717 100644 --- a/tests/unit/source_code/test_directfs_access_crawler.py +++ b/tests/unit/source_code/test_directfs_access_crawler.py @@ -9,7 +9,13 @@ def test_crawler_appends_dfsas(): crawler = DirectFsAccessCrawlers(backend, "schema").for_paths() dfsas = list( DirectFsAccess( - source_type="SOURCE", source_id="ID", source_lineage="LINEAGE", path=path, is_read=False, is_write=False + source_type="SOURCE", + source_id="ID", + source_lineage="LINEAGE", + source_timestamp=7452, + path=path, + is_read=False, + is_write=False, ) for path in ("a", "b", "c") ) From d951286621f92c0de9a447da46539534ba38e8bc Mon Sep 17 00:00:00 2001 From: Eric Vergnaud Date: Fri, 6 Sep 2024 17:03:15 +0200 Subject: [PATCH 47/80] simplify --- src/databricks/labs/ucx/source_code/base.py | 20 +++++++++---------- .../labs/ucx/source_code/linters/directfs.py | 8 -------- tests/integration/source_code/test_jobs.py | 3 +++ 3 files changed, 13 insertions(+), 18 deletions(-) diff --git a/src/databricks/labs/ucx/source_code/base.py b/src/databricks/labs/ucx/source_code/base.py index b2b369d6fa..ed5a0d11c1 100644 --- a/src/databricks/labs/ucx/source_code/base.py +++ b/src/databricks/labs/ucx/source_code/base.py @@ -343,30 +343,30 @@ class DirectFsAccess: UNKNOWN = "unknown" - source_type: str - source_id: str - source_lineage: str - source_timestamp: int path: str is_read: bool is_write: bool + source_type: str = UNKNOWN + source_id: str = UNKNOWN + source_lineage: str = UNKNOWN + source_timestamp: int = -1 def replace( self, + path: str | None = None, + is_read: bool | None = None, + is_write: bool | None = None, source_type: str | None = None, source_id: str | None = None, source_lineage: str | None = None, source_timestamp: int | None = None, - path: str | None = None, - is_read: bool | None = None, - is_write: bool | None = None, ): return DirectFsAccess( + path=path or self.path, + is_read=is_read or self.is_read, + is_write=is_write or self.is_write, source_type=source_type or self.source_type, source_id=source_id or self.source_id, source_lineage=source_lineage or self.source_lineage, source_timestamp=source_timestamp or self.source_timestamp, - path=path or self.path, - is_read=is_read or self.is_read, - is_write=is_write or self.is_write, ) diff --git a/src/databricks/labs/ucx/source_code/linters/directfs.py b/src/databricks/labs/ucx/source_code/linters/directfs.py index cf8a347a19..4a8c01fa67 100644 --- a/src/databricks/labs/ucx/source_code/linters/directfs.py +++ b/src/databricks/labs/ucx/source_code/linters/directfs.py @@ -107,10 +107,6 @@ def _check_str_constant(self, source_node, inferred: InferredValue): # since we're normally filtering out spark calls, we're dealing with dfsas we know little about # notable we don't know is_read or is_write dfsa = DirectFsAccess( - source_type=DirectFsAccess.UNKNOWN, - source_id=DirectFsAccess.UNKNOWN, - source_lineage=DirectFsAccess.UNKNOWN, - source_timestamp=-1, path=value, is_read=True, is_write=False, @@ -206,10 +202,6 @@ def _collect_dfsa_from_node(cls, expression: SqlExpression, path: str) -> Iterab is_read = cls._is_read(expression) is_write = cls._is_write(expression) yield DirectFsAccess( - 
source_type=DirectFsAccess.UNKNOWN, - source_id=DirectFsAccess.UNKNOWN, - source_lineage=DirectFsAccess.UNKNOWN, - source_timestamp=-1, path=path, is_read=is_read, is_write=is_write, diff --git a/tests/integration/source_code/test_jobs.py b/tests/integration/source_code/test_jobs.py index 248e452acc..21b9e6ead9 100644 --- a/tests/integration/source_code/test_jobs.py +++ b/tests/integration/source_code/test_jobs.py @@ -156,7 +156,10 @@ def test_job_linter_some_notebook_graph_with_problems(simple_ctx, ws, make_job, dfsas = list(simple_ctx.directfs_access_crawlers.for_paths().snapshot()) assert len(dfsas) == 2 + task_keys = set(task.task_key for task in j.settings.tasks) for dfsa in dfsas: + assert dfsa.job_id == j.job_id + assert dfsa.task_key in task_keys assert dfsa.source_type != DirectFsAccess.UNKNOWN assert dfsa.source_id != DirectFsAccess.UNKNOWN assert dfsa.source_lineage != DirectFsAccess.UNKNOWN From ea74ba68b7d50dcaba8123ac384679602f2126a7 Mon Sep 17 00:00:00 2001 From: Eric Vergnaud Date: Fri, 6 Sep 2024 17:18:20 +0200 Subject: [PATCH 48/80] capture and store job/task infos --- src/databricks/labs/ucx/source_code/base.py | 10 ++++++++++ src/databricks/labs/ucx/source_code/jobs.py | 6 +++++- tests/integration/source_code/test_jobs.py | 5 +++-- 3 files changed, 18 insertions(+), 3 deletions(-) diff --git a/src/databricks/labs/ucx/source_code/base.py b/src/databricks/labs/ucx/source_code/base.py index ed5a0d11c1..ac29971ce2 100644 --- a/src/databricks/labs/ucx/source_code/base.py +++ b/src/databricks/labs/ucx/source_code/base.py @@ -350,7 +350,11 @@ class DirectFsAccess: source_id: str = UNKNOWN source_lineage: str = UNKNOWN source_timestamp: int = -1 + job_id: int = -1 + job_name: str = UNKNOWN + task_key: str = UNKNOWN + # pylint: disable=too-many-arguments def replace( self, path: str | None = None, @@ -360,6 +364,9 @@ def replace( source_id: str | None = None, source_lineage: str | None = None, source_timestamp: int | None = None, + job_id: int | None = None, + job_name: str | None = None, + task_key: str | None = None, ): return DirectFsAccess( path=path or self.path, @@ -369,4 +376,7 @@ def replace( source_id=source_id or self.source_id, source_lineage=source_lineage or self.source_lineage, source_timestamp=source_timestamp or self.source_timestamp, + job_id=job_id or self.job_id, + job_name=job_name or self.job_name, + task_key=task_key or self.task_key, ) diff --git a/src/databricks/labs/ucx/source_code/jobs.py b/src/databricks/labs/ucx/source_code/jobs.py index 6d74f9380b..36f5954dcc 100644 --- a/src/databricks/labs/ucx/source_code/jobs.py +++ b/src/databricks/labs/ucx/source_code/jobs.py @@ -427,7 +427,11 @@ def _lint_task(self, task: jobs.Task, job: jobs.Job, linted_paths: set[Path]) -> ) yield from walker collector = DfsaCollectorWalker(graph, set(), self._path_lookup, session_state) - dfsas = list(dfsa for dfsa in collector) + dfsas: list[DirectFsAccess] = [] + job_name = "" if job.settings is None else job.settings.name + for dfsa in collector: + dfsa = dfsa.replace(job_id=job.job_id, job_name=job_name, task_key=task.task_key) + dfsas.append(dfsa) self._directfs_crawlers.for_paths().append(dfsas) diff --git a/tests/integration/source_code/test_jobs.py b/tests/integration/source_code/test_jobs.py index 21b9e6ead9..c99328a7e2 100644 --- a/tests/integration/source_code/test_jobs.py +++ b/tests/integration/source_code/test_jobs.py @@ -158,12 +158,13 @@ def test_job_linter_some_notebook_graph_with_problems(simple_ctx, ws, make_job, assert len(dfsas) == 2 task_keys = 
set(task.task_key for task in j.settings.tasks) for dfsa in dfsas: - assert dfsa.job_id == j.job_id - assert dfsa.task_key in task_keys assert dfsa.source_type != DirectFsAccess.UNKNOWN assert dfsa.source_id != DirectFsAccess.UNKNOWN assert dfsa.source_lineage != DirectFsAccess.UNKNOWN assert dfsa.source_timestamp != -1 + assert dfsa.job_id == j.job_id + assert dfsa.job_name == j.job_name + assert dfsa.task_key in task_keys def test_workflow_linter_lints_job_with_import_pypi_library( From e75e07d9d85c58227daa616fe7b7f0fa959b7635 Mon Sep 17 00:00:00 2001 From: Eric Vergnaud Date: Fri, 6 Sep 2024 17:23:37 +0200 Subject: [PATCH 49/80] simplify --- src/databricks/labs/ucx/source_code/base.py | 34 ++++++++++++++------- src/databricks/labs/ucx/source_code/jobs.py | 6 ++-- 2 files changed, 26 insertions(+), 14 deletions(-) diff --git a/src/databricks/labs/ucx/source_code/base.py b/src/databricks/labs/ucx/source_code/base.py index ac29971ce2..56ec62ce66 100644 --- a/src/databricks/labs/ucx/source_code/base.py +++ b/src/databricks/labs/ucx/source_code/base.py @@ -354,28 +354,40 @@ class DirectFsAccess: job_name: str = UNKNOWN task_key: str = UNKNOWN - # pylint: disable=too-many-arguments - def replace( + def replace_source( self, - path: str | None = None, - is_read: bool | None = None, - is_write: bool | None = None, source_type: str | None = None, source_id: str | None = None, source_lineage: str | None = None, source_timestamp: int | None = None, - job_id: int | None = None, - job_name: str | None = None, - task_key: str | None = None, ): return DirectFsAccess( - path=path or self.path, - is_read=is_read or self.is_read, - is_write=is_write or self.is_write, + path=self.path, + is_read=self.is_read, + is_write=self.is_write, source_type=source_type or self.source_type, source_id=source_id or self.source_id, source_lineage=source_lineage or self.source_lineage, source_timestamp=source_timestamp or self.source_timestamp, + job_id=self.job_id, + job_name=self.job_name, + task_key=self.task_key, + ) + + def replace_job_infos( + self, + job_id: int | None = None, + job_name: str | None = None, + task_key: str | None = None, + ): + return DirectFsAccess( + path=self.path, + is_read=self.is_read, + is_write=self.is_write, + source_type=self.source_type, + source_id=self.source_id, + source_lineage=self.source_lineage, + source_timestamp=self.source_timestamp, job_id=job_id or self.job_id, job_name=job_name or self.job_name, task_key=task_key or self.task_key, diff --git a/src/databricks/labs/ucx/source_code/jobs.py b/src/databricks/labs/ucx/source_code/jobs.py index 36f5954dcc..f598ad81ac 100644 --- a/src/databricks/labs/ucx/source_code/jobs.py +++ b/src/databricks/labs/ucx/source_code/jobs.py @@ -430,7 +430,7 @@ def _lint_task(self, task: jobs.Task, job: jobs.Job, linted_paths: set[Path]) -> dfsas: list[DirectFsAccess] = [] job_name = "" if job.settings is None else job.settings.name for dfsa in collector: - dfsa = dfsa.replace(job_id=job.job_id, job_name=job_name, task_key=task.task_key) + dfsa = dfsa.replace_job_infos(job_id=job.job_id, job_name=job_name, task_key=task.task_key) dfsas.append(dfsa) self._directfs_crawlers.for_paths().append(dfsas) @@ -499,7 +499,7 @@ def _collect_from_notebook( src_id = str(path) src_lineage = self.lineage_str for dfsa in self._collect_from_source(cell.original_code, cell.language, path, inherited_tree): - yield dfsa.replace( + yield dfsa.replace_source( source_type="NOTEBOOK", source_id=src_id, source_lineage=src_lineage, source_timestamp=src_timestamp ) if 
cell.language is CellLanguage.PYTHON: @@ -523,7 +523,7 @@ def _collect_from_source( src_lineage = self.lineage_str src_timestamp = int(path.stat().st_mtime) for dfsa in iterable: - yield dfsa.replace( + yield dfsa.replace_source( source_type="FILE", source_id=src_id, source_lineage=src_lineage, source_timestamp=src_timestamp ) From e62fd185dbc509f0b039602934aa3f4b11345437 Mon Sep 17 00:00:00 2001 From: Eric Vergnaud Date: Fri, 6 Sep 2024 17:50:21 +0200 Subject: [PATCH 50/80] capture and store assessment start/stop, also drop source_type --- src/databricks/labs/ucx/source_code/base.py | 31 ++++++++++++++----- src/databricks/labs/ucx/source_code/jobs.py | 15 ++++----- tests/integration/source_code/test_jobs.py | 2 ++ .../test_directfs_access_crawler.py | 12 ++++--- 4 files changed, 42 insertions(+), 18 deletions(-) diff --git a/src/databricks/labs/ucx/source_code/base.py b/src/databricks/labs/ucx/source_code/base.py index 56ec62ce66..c37937e59f 100644 --- a/src/databricks/labs/ucx/source_code/base.py +++ b/src/databricks/labs/ucx/source_code/base.py @@ -346,17 +346,17 @@ class DirectFsAccess: path: str is_read: bool is_write: bool - source_type: str = UNKNOWN source_id: str = UNKNOWN - source_lineage: str = UNKNOWN source_timestamp: int = -1 + source_lineage: str = UNKNOWN job_id: int = -1 job_name: str = UNKNOWN task_key: str = UNKNOWN + assessment_start_timestamp: int = -1 + assessment_end_timestamp: int = -1 def replace_source( self, - source_type: str | None = None, source_id: str | None = None, source_lineage: str | None = None, source_timestamp: int | None = None, @@ -365,13 +365,14 @@ def replace_source( path=self.path, is_read=self.is_read, is_write=self.is_write, - source_type=source_type or self.source_type, source_id=source_id or self.source_id, - source_lineage=source_lineage or self.source_lineage, source_timestamp=source_timestamp or self.source_timestamp, + source_lineage=source_lineage or self.source_lineage, job_id=self.job_id, job_name=self.job_name, task_key=self.task_key, + assessment_start_timestamp=self.assessment_start_timestamp, + assessment_end_timestamp=self.assessment_start_timestamp, ) def replace_job_infos( @@ -384,11 +385,27 @@ def replace_job_infos( path=self.path, is_read=self.is_read, is_write=self.is_write, - source_type=self.source_type, source_id=self.source_id, - source_lineage=self.source_lineage, source_timestamp=self.source_timestamp, + source_lineage=self.source_lineage, job_id=job_id or self.job_id, job_name=job_name or self.job_name, task_key=task_key or self.task_key, + assessment_start_timestamp=self.assessment_start_timestamp, + assessment_end_timestamp=self.assessment_start_timestamp, + ) + + def replace_assessment_infos(self, assessment_start: int | None = None, assessment_end: int | None = None): + return DirectFsAccess( + path=self.path, + is_read=self.is_read, + is_write=self.is_write, + source_id=self.source_id, + source_timestamp=self.source_timestamp, + source_lineage=self.source_lineage, + job_id=self.job_id, + job_name=self.job_name, + task_key=self.task_key, + assessment_start_timestamp=assessment_start or self.assessment_start_timestamp, + assessment_end_timestamp=assessment_end or self.assessment_start_timestamp, ) diff --git a/src/databricks/labs/ucx/source_code/jobs.py b/src/databricks/labs/ucx/source_code/jobs.py index f598ad81ac..1840d478a2 100644 --- a/src/databricks/labs/ucx/source_code/jobs.py +++ b/src/databricks/labs/ucx/source_code/jobs.py @@ -3,6 +3,7 @@ import logging import shutil import tempfile +import time from 
collections.abc import Generator, Iterable from contextlib import contextmanager from dataclasses import dataclass @@ -426,11 +427,15 @@ def _lint_task(self, task: jobs.Task, job: jobs.Job, linted_paths: set[Path]) -> graph, linted_paths, self._path_lookup, task.task_key, session_state, self._migration_index ) yield from walker + assessment_start = int(time.mktime(time.gmtime())) collector = DfsaCollectorWalker(graph, set(), self._path_lookup, session_state) + assessment_end = int(time.mktime(time.gmtime())) dfsas: list[DirectFsAccess] = [] - job_name = "" if job.settings is None else job.settings.name + assert job.settings is not None # as already done in _lint_job + job_name = job.settings.name for dfsa in collector: dfsa = dfsa.replace_job_infos(job_id=job.job_id, job_name=job_name, task_key=task.task_key) + dfsa = dfsa.replace_assessment_infos(assessment_start=assessment_start, assessment_end=assessment_end) dfsas.append(dfsa) self._directfs_crawlers.for_paths().append(dfsas) @@ -499,9 +504,7 @@ def _collect_from_notebook( src_id = str(path) src_lineage = self.lineage_str for dfsa in self._collect_from_source(cell.original_code, cell.language, path, inherited_tree): - yield dfsa.replace_source( - source_type="NOTEBOOK", source_id=src_id, source_lineage=src_lineage, source_timestamp=src_timestamp - ) + yield dfsa.replace_source(source_id=src_id, source_lineage=src_lineage, source_timestamp=src_timestamp) if cell.language is CellLanguage.PYTHON: if inherited_tree is None: inherited_tree = Tree.new_module() @@ -523,9 +526,7 @@ def _collect_from_source( src_lineage = self.lineage_str src_timestamp = int(path.stat().st_mtime) for dfsa in iterable: - yield dfsa.replace_source( - source_type="FILE", source_id=src_id, source_lineage=src_lineage, source_timestamp=src_timestamp - ) + yield dfsa.replace_source(source_id=src_id, source_lineage=src_lineage, source_timestamp=src_timestamp) def _collect_from_python(self, source: str, inherited_tree: Tree | None) -> Iterable[DirectFsAccess]: linter = DirectFsAccessPyLinter(self._session_state, prevent_spark_duplicates=False) diff --git a/tests/integration/source_code/test_jobs.py b/tests/integration/source_code/test_jobs.py index c99328a7e2..e1073110b7 100644 --- a/tests/integration/source_code/test_jobs.py +++ b/tests/integration/source_code/test_jobs.py @@ -165,6 +165,8 @@ def test_job_linter_some_notebook_graph_with_problems(simple_ctx, ws, make_job, assert dfsa.job_id == j.job_id assert dfsa.job_name == j.job_name assert dfsa.task_key in task_keys + assert dfsa.assessment_start_timestamp != -1 + assert dfsa.assessment_end_timestamp != -1 def test_workflow_linter_lints_job_with_import_pypi_library( diff --git a/tests/unit/source_code/test_directfs_access_crawler.py b/tests/unit/source_code/test_directfs_access_crawler.py index e34efad717..cd38435e66 100644 --- a/tests/unit/source_code/test_directfs_access_crawler.py +++ b/tests/unit/source_code/test_directfs_access_crawler.py @@ -9,13 +9,17 @@ def test_crawler_appends_dfsas(): crawler = DirectFsAccessCrawlers(backend, "schema").for_paths() dfsas = list( DirectFsAccess( - source_type="SOURCE", - source_id="ID", - source_lineage="LINEAGE", - source_timestamp=7452, path=path, is_read=False, is_write=False, + source_id="ID", + source_timestamp=7452, + source_lineage="LINEAGE", + job_id=222, + job_name="JOB", + task_key="TASK", + assessment_start_timestamp=123, + assessment_end_timestamp=234, ) for path in ("a", "b", "c") ) From b58f47dd2c9c84cf25de580fa06e65386d058689 Mon Sep 17 00:00:00 2001 From: 
Eric Vergnaud Date: Fri, 6 Sep 2024 18:03:30 +0200 Subject: [PATCH 51/80] drop mock_dfsa_crawlers --- tests/unit/source_code/conftest.py | 5 ----- tests/unit/source_code/test_jobs.py | 6 ++++-- 2 files changed, 4 insertions(+), 7 deletions(-) diff --git a/tests/unit/source_code/conftest.py b/tests/unit/source_code/conftest.py index a4942c24ce..a5be3047ee 100644 --- a/tests/unit/source_code/conftest.py +++ b/tests/unit/source_code/conftest.py @@ -58,8 +58,3 @@ def simple_dependency_resolver(mock_path_lookup: PathLookup) -> DependencyResolv notebook_resolver = NotebookResolver(NotebookLoader()) import_resolver = ImportFileResolver(FileLoader(), allow_list) return DependencyResolver(library_resolver, notebook_resolver, import_resolver, import_resolver, mock_path_lookup) - - -@pytest.fixture -def mock_dfsa_crawlers() -> DirectFsAccessCrawlers: - return DirectFsAccessCrawlers(MockBackend(), "schema") diff --git a/tests/unit/source_code/test_jobs.py b/tests/unit/source_code/test_jobs.py index 7dff7a206b..8b9714a326 100644 --- a/tests/unit/source_code/test_jobs.py +++ b/tests/unit/source_code/test_jobs.py @@ -10,6 +10,7 @@ from databricks.labs.blueprint.paths import DBFSPath, WorkspacePath from databricks.labs.ucx.source_code.base import CurrentSessionState +from databricks.labs.ucx.source_code.directfs_access_crawler import DirectFsAccessCrawlers from databricks.labs.ucx.source_code.python_libraries import PythonLibraryResolver from databricks.labs.ucx.source_code.known import KnownList from databricks.sdk import WorkspaceClient @@ -230,12 +231,13 @@ def test_workflow_task_container_builds_dependency_graph_spark_python_task( def test_workflow_linter_lint_job_logs_problems( - dependency_resolver, mock_path_lookup, empty_index, mock_dfsa_crawlers, caplog + dependency_resolver, mock_path_lookup, empty_index, caplog ): expected_message = "Found job problems:\nUNKNOWN:-1 [library-install-failed] 'pip --disable-pip-version-check install unknown-library" ws = create_autospec(WorkspaceClient) - linter = WorkflowLinter(ws, dependency_resolver, mock_path_lookup, empty_index, mock_dfsa_crawlers) + crawlers = create_autospec(DirectFsAccessCrawlers) + linter = WorkflowLinter(ws, dependency_resolver, mock_path_lookup, empty_index, crawlers) libraries = [compute.Library(pypi=compute.PythonPyPiLibrary(package="unknown-library-name"))] task = jobs.Task(task_key="test-task", libraries=libraries) From 5a5d4ffc8c7334618c2b07e02359c2a14bcb4367 Mon Sep 17 00:00:00 2001 From: Eric Vergnaud Date: Fri, 6 Sep 2024 18:04:15 +0200 Subject: [PATCH 52/80] rename _backend -> _sql_backend --- .../labs/ucx/source_code/directfs_access_crawler.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/databricks/labs/ucx/source_code/directfs_access_crawler.py b/src/databricks/labs/ucx/source_code/directfs_access_crawler.py index 09f442cd04..59423d453f 100644 --- a/src/databricks/labs/ucx/source_code/directfs_access_crawler.py +++ b/src/databricks/labs/ucx/source_code/directfs_access_crawler.py @@ -38,12 +38,12 @@ def _crawl(self) -> Iterable[Result]: class DirectFsAccessCrawlers: - def __init__(self, backend: SqlBackend, schema: str): - self._backend = backend + def __init__(self, sql_backend: SqlBackend, schema: str): + self._sql_backend = sql_backend self._schema = schema def for_paths(self) -> _DirectFsAccessCrawler: - return _DirectFsAccessCrawler(self._backend, self._schema, "direct_file_system_access_in_paths") + return _DirectFsAccessCrawler(self._sql_backend, self._schema, 
"direct_file_system_access_in_paths") def for_queries(self) -> _DirectFsAccessCrawler: - return _DirectFsAccessCrawler(self._backend, self._schema, "direct_file_system_access_in_queries") + return _DirectFsAccessCrawler(self._sql_backend, self._schema, "direct_file_system_access_in_queries") From 404f6cdd6a55ecb1cc0112039004af295c716334 Mon Sep 17 00:00:00 2001 From: Eric Vergnaud Date: Fri, 6 Sep 2024 18:05:20 +0200 Subject: [PATCH 53/80] rename _backend -> _sql_backend --- src/databricks/labs/ucx/source_code/directfs_access_crawler.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/databricks/labs/ucx/source_code/directfs_access_crawler.py b/src/databricks/labs/ucx/source_code/directfs_access_crawler.py index 59423d453f..a05c2079db 100644 --- a/src/databricks/labs/ucx/source_code/directfs_access_crawler.py +++ b/src/databricks/labs/ucx/source_code/directfs_access_crawler.py @@ -17,7 +17,7 @@ def __init__(self, backend: SqlBackend, schema: str, table: str): Initializes a DFSACrawler instance. Args: - backend (SqlBackend): The SQL Execution Backend abstraction (either REST API or Spark) + sql_backend (SqlBackend): The SQL Execution Backend abstraction (either REST API or Spark) schema: The schema name for the inventory persistence. """ super().__init__(backend, "hive_metastore", schema, table, DirectFsAccess) From cb6e45d409a29efaf249caedf2638a59991b1bf5 Mon Sep 17 00:00:00 2001 From: Eric Vergnaud Date: Fri, 6 Sep 2024 18:14:38 +0200 Subject: [PATCH 54/80] hdfs is irrelevant, replace with dbfs --- .../samples/functional/file-access/select_format.sql | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/unit/source_code/samples/functional/file-access/select_format.sql b/tests/unit/source_code/samples/functional/file-access/select_format.sql index d64358a23d..d2a16bbe4c 100644 --- a/tests/unit/source_code/samples/functional/file-access/select_format.sql +++ b/tests/unit/source_code/samples/functional/file-access/select_format.sql @@ -1,3 +1,3 @@ -- Databricks notebook source --- ucx[direct-filesystem-access-in-sql-query:+0:0:+0:1024] The use of direct filesystem references is deprecated: hdfs://examples/src/main/resources/users.parquet -SELECT * FROM parquet.`hdfs://examples/src/main/resources/users.parquet` +-- ucx[direct-filesystem-access-in-sql-query:+0:0:+0:1024] The use of direct filesystem references is deprecated: dbfs://examples/src/main/resources/users.parquet +SELECT * FROM parquet.`dbfs://examples/src/main/resources/users.parquet` From 733deccbdab5e35ed165ed1f5345d41dacd73c30 Mon Sep 17 00:00:00 2001 From: Eric Vergnaud Date: Mon, 9 Sep 2024 10:45:32 +0200 Subject: [PATCH 55/80] drop mock of DirectFsAccessCrawlers --- tests/unit/source_code/conftest.py | 2 -- tests/unit/source_code/test_jobs.py | 10 ++++------ 2 files changed, 4 insertions(+), 8 deletions(-) diff --git a/tests/unit/source_code/conftest.py b/tests/unit/source_code/conftest.py index a5be3047ee..4b86de4f13 100644 --- a/tests/unit/source_code/conftest.py +++ b/tests/unit/source_code/conftest.py @@ -1,12 +1,10 @@ import pytest -from databricks.labs.lsql.backends import MockBackend from databricks.labs.ucx.hive_metastore.migration_status import ( MigrationStatus, ) from databricks.labs.ucx.hive_metastore.migration_status import MigrationIndex -from databricks.labs.ucx.source_code.directfs_access_crawler import DirectFsAccessCrawlers from databricks.labs.ucx.source_code.graph import DependencyResolver from databricks.labs.ucx.source_code.known import KnownList from 
databricks.labs.ucx.source_code.linters.files import ImportFileResolver, FileLoader diff --git a/tests/unit/source_code/test_jobs.py b/tests/unit/source_code/test_jobs.py index 8b9714a326..ba8b9287a8 100644 --- a/tests/unit/source_code/test_jobs.py +++ b/tests/unit/source_code/test_jobs.py @@ -230,14 +230,13 @@ def test_workflow_task_container_builds_dependency_graph_spark_python_task( assert registered_notebooks == [expected_path_instance] -def test_workflow_linter_lint_job_logs_problems( - dependency_resolver, mock_path_lookup, empty_index, caplog -): +def test_workflow_linter_lint_job_logs_problems(dependency_resolver, mock_path_lookup, empty_index, caplog): expected_message = "Found job problems:\nUNKNOWN:-1 [library-install-failed] 'pip --disable-pip-version-check install unknown-library" ws = create_autospec(WorkspaceClient) - crawlers = create_autospec(DirectFsAccessCrawlers) - linter = WorkflowLinter(ws, dependency_resolver, mock_path_lookup, empty_index, crawlers) + # pylint: disable=mock-no-usage + dfsas = create_autospec(DirectFsAccessCrawlers) + linter = WorkflowLinter(ws, dependency_resolver, mock_path_lookup, empty_index, dfsas) libraries = [compute.Library(pypi=compute.PythonPyPiLibrary(package="unknown-library-name"))] task = jobs.Task(task_key="test-task", libraries=libraries) @@ -245,7 +244,6 @@ def test_workflow_linter_lint_job_logs_problems( job = jobs.Job(job_id=1234, settings=settings) ws.jobs.get.return_value = job - with caplog.at_level(logging.WARNING, logger="databricks.labs.ucx.source_code.jobs"): linter.lint_job(1234) From 6338af6680f64e0b6bd2f85e93e5f8890b003ba3 Mon Sep 17 00:00:00 2001 From: Eric Vergnaud Date: Mon, 9 Sep 2024 10:46:05 +0200 Subject: [PATCH 56/80] gather and store dfsas from refresh_report --- src/databricks/labs/ucx/source_code/jobs.py | 72 ++++++++++++++------- 1 file changed, 48 insertions(+), 24 deletions(-) diff --git a/src/databricks/labs/ucx/source_code/jobs.py b/src/databricks/labs/ucx/source_code/jobs.py index 1840d478a2..d59dc394f9 100644 --- a/src/databricks/labs/ucx/source_code/jobs.py +++ b/src/databricks/labs/ucx/source_code/jobs.py @@ -1,5 +1,4 @@ import functools -import itertools import logging import shutil import tempfile @@ -353,42 +352,51 @@ def refresh_report(self, sql_backend: SqlBackend, inventory_database: str): continue tasks.append(functools.partial(self.lint_job, job.job_id)) logger.info(f"Running {tasks} linting tasks in parallel...") - job_problems, errors = Threads.gather('linting workflows', tasks) - job_problems_flattened = list(itertools.chain(*job_problems)) - logger.info(f"Saving {len(job_problems_flattened)} linting problems...") + job_results, errors = Threads.gather('linting workflows', tasks) + job_problems: list[JobProblem] = [] + job_dfsas: list[DirectFsAccess] = [] + for problems, dfsas in job_results: + job_problems.extend(problems) + job_dfsas.extend(dfsas) + logger.info(f"Saving {len(job_problems)} linting problems...") sql_backend.save_table( f'{inventory_database}.workflow_problems', - job_problems_flattened, + job_problems, JobProblem, mode='overwrite', ) + self._directfs_crawlers.for_paths().append(job_dfsas) if len(errors) > 0: raise ManyError(errors) - def lint_job(self, job_id: int) -> list[JobProblem]: + def lint_job(self, job_id: int) -> tuple[list[JobProblem], list[DirectFsAccess]]: try: job = self._ws.jobs.get(job_id) except NotFound: logger.warning(f'Could not find job: {job_id}') - return [] + return ([], []) - problems = self._lint_job(job) + problems, dfsas = self._lint_job(job) if 
len(problems) > 0: problem_messages = "\n".join([problem.as_message() for problem in problems]) logger.warning(f"Found job problems:\n{problem_messages}") - return problems + return problems, dfsas _UNKNOWN = Path('') - def _lint_job(self, job: jobs.Job) -> list[JobProblem]: + def _lint_job(self, job: jobs.Job) -> tuple[list[JobProblem], list[DirectFsAccess]]: problems: list[JobProblem] = [] + dfsas: list[DirectFsAccess] = [] assert job.job_id is not None assert job.settings is not None assert job.settings.name is not None assert job.settings.tasks is not None linted_paths: set[Path] = set() for task in job.settings.tasks: - for advice in self._lint_task(task, job, linted_paths): + graph, advices, session_state = self._build_task_dependency_graph(task, job) + if not advices: + advices = self._lint_task(task, graph, session_state, linted_paths) + for advice in advices: absolute_path = advice.path.absolute().as_posix() if advice.path != self._UNKNOWN else 'UNKNOWN' job_problem = JobProblem( job_id=job.job_id, @@ -403,9 +411,17 @@ def _lint_job(self, job: jobs.Job) -> list[JobProblem]: end_col=advice.advice.end_col, ) problems.append(job_problem) - return problems - - def _lint_task(self, task: jobs.Task, job: jobs.Job, linted_paths: set[Path]) -> Iterable[LocatedAdvice]: + assessment_start = int(time.mktime(time.gmtime())) + task_dfsas = self._collect_task_dfsas(task, job, graph, session_state) + assessment_end = int(time.mktime(time.gmtime())) + for dfsa in task_dfsas: + dfsa = dfsa.replace_assessment_infos(assessment_start=assessment_start, assessment_end=assessment_end) + dfsas.append(dfsa) + return problems, dfsas + + def _build_task_dependency_graph( + self, task: jobs.Task, job: jobs.Job + ) -> tuple[DependencyGraph, Iterable[LocatedAdvice], CurrentSessionState]: root_dependency: Dependency = WorkflowTask(self._ws, task, job) # we can load it without further preparation since the WorkflowTask is merely a wrapper container = root_dependency.load(self._path_lookup) @@ -418,25 +434,33 @@ def _lint_task(self, task: jobs.Task, job: jobs.Job, linted_paths: set[Path]) -> ) graph = DependencyGraph(root_dependency, None, self._resolver, self._path_lookup, session_state) problems = container.build_dependency_graph(graph) - if problems: - for problem in problems: - source_path = self._UNKNOWN if problem.is_path_missing() else problem.source_path - yield LocatedAdvice(problem.as_advisory(), source_path) - return + located_advices: list[LocatedAdvice] = [] + for problem in problems: + source_path = self._UNKNOWN if problem.is_path_missing() else problem.source_path + located_advices.append(LocatedAdvice(problem.as_advisory(), source_path)) + return graph, located_advices, session_state + + def _lint_task( + self, + task: jobs.Task, + graph: DependencyGraph, + session_state: CurrentSessionState, + linted_paths: set[Path], + ) -> Iterable[LocatedAdvice]: walker = LintingWalker( graph, linted_paths, self._path_lookup, task.task_key, session_state, self._migration_index ) yield from walker - assessment_start = int(time.mktime(time.gmtime())) + + def _collect_task_dfsas( + self, task: jobs.Task, job: jobs.Job, graph: DependencyGraph, session_state: CurrentSessionState + ) -> Iterable[DirectFsAccess]: collector = DfsaCollectorWalker(graph, set(), self._path_lookup, session_state) - assessment_end = int(time.mktime(time.gmtime())) dfsas: list[DirectFsAccess] = [] assert job.settings is not None # as already done in _lint_job job_name = job.settings.name for dfsa in collector: - dfsa = 
dfsa.replace_job_infos(job_id=job.job_id, job_name=job_name, task_key=task.task_key) - dfsa = dfsa.replace_assessment_infos(assessment_start=assessment_start, assessment_end=assessment_end) - dfsas.append(dfsa) + yield dfsa.replace_job_infos(job_id=job.job_id, job_name=job_name, task_key=task.task_key) self._directfs_crawlers.for_paths().append(dfsas) From 166e34ca62877faeeeab1a22d7ea1fd524331cbf Mon Sep 17 00:00:00 2001 From: Eric Vergnaud Date: Mon, 9 Sep 2024 12:20:34 +0200 Subject: [PATCH 57/80] prevent pylint warning --- tests/unit/source_code/test_jobs.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/unit/source_code/test_jobs.py b/tests/unit/source_code/test_jobs.py index ba8b9287a8..e2293bd375 100644 --- a/tests/unit/source_code/test_jobs.py +++ b/tests/unit/source_code/test_jobs.py @@ -234,7 +234,6 @@ def test_workflow_linter_lint_job_logs_problems(dependency_resolver, mock_path_l expected_message = "Found job problems:\nUNKNOWN:-1 [library-install-failed] 'pip --disable-pip-version-check install unknown-library" ws = create_autospec(WorkspaceClient) - # pylint: disable=mock-no-usage dfsas = create_autospec(DirectFsAccessCrawlers) linter = WorkflowLinter(ws, dependency_resolver, mock_path_lookup, empty_index, dfsas) @@ -247,6 +246,7 @@ def test_workflow_linter_lint_job_logs_problems(dependency_resolver, mock_path_l with caplog.at_level(logging.WARNING, logger="databricks.labs.ucx.source_code.jobs"): linter.lint_job(1234) + dfsas.assert_not_called() assert any(message.startswith(expected_message) for message in caplog.messages) From 024931063c4c7f1f5092122dad6efb4d56ae81d2 Mon Sep 17 00:00:00 2001 From: Eric Vergnaud Date: Mon, 9 Sep 2024 17:03:21 +0200 Subject: [PATCH 58/80] fix failing tests --- tests/integration/source_code/test_jobs.py | 43 +++++++++++----------- 1 file changed, 21 insertions(+), 22 deletions(-) diff --git a/tests/integration/source_code/test_jobs.py b/tests/integration/source_code/test_jobs.py index e1073110b7..d63941a93e 100644 --- a/tests/integration/source_code/test_jobs.py +++ b/tests/integration/source_code/test_jobs.py @@ -69,7 +69,7 @@ def test_linter_from_context(simple_ctx, make_job, make_notebook): def test_job_linter_no_problems(simple_ctx, make_job): j = make_job() - problems = simple_ctx.workflow_linter.lint_job(j.job_id) + problems, _dfsas = simple_ctx.workflow_linter.lint_job(j.job_id) assert len(problems) == 0 @@ -92,7 +92,7 @@ def test_job_task_linter_library_not_installed_cluster( ) j = make_job(tasks=[task]) - problems = simple_ctx.workflow_linter.lint_job(j.job_id) + problems, _dfsas = simple_ctx.workflow_linter.lint_job(j.job_id) assert len([problem for problem in problems if problem.message == "Could not locate import: greenlet"]) == 1 @@ -116,7 +116,7 @@ def test_job_task_linter_library_installed_cluster( ) j = make_job(tasks=[task]) - problems = simple_ctx.workflow_linter.lint_job(j.job_id) + problems, _dfsas = simple_ctx.workflow_linter.lint_job(j.job_id) assert len([problem for problem in problems if problem.message == "Could not locate import: greenlet"]) == 0 @@ -145,7 +145,7 @@ def test_job_linter_some_notebook_graph_with_problems(simple_ctx, ws, make_job, some_file.write_text('display(spark.read.parquet("/mnt/foo/bar"))') with caplog.at_level(logging.WARNING, logger="databricks.labs.ucx.source_code.jobs"): - problems = simple_ctx.workflow_linter.lint_job(j.job_id) + problems, dfsas = simple_ctx.workflow_linter.lint_job(j.job_id) root = Path(entrypoint.as_posix()) messages = {replace(p, 
path=Path(p.path).relative_to(root)).as_message() for p in problems} @@ -154,7 +154,6 @@ def test_job_linter_some_notebook_graph_with_problems(simple_ctx, ws, make_job, last_messages = caplog.messages[-1].split("\n") assert all(any(message.endswith(expected) for message in last_messages) for expected in expected_messages) - dfsas = list(simple_ctx.directfs_access_crawlers.for_paths().snapshot()) assert len(dfsas) == 2 task_keys = set(task.task_key for task in j.settings.tasks) for dfsa in dfsas: @@ -187,14 +186,14 @@ def test_workflow_linter_lints_job_with_import_pypi_library( make_notebook(path=notebook, content=b"import greenlet") job_without_pytest_library = make_job(notebook_path=notebook) - problems = simple_ctx.workflow_linter.lint_job(job_without_pytest_library.job_id) + problems, _dfsas = simple_ctx.workflow_linter.lint_job(job_without_pytest_library.job_id) assert len([problem for problem in problems if problem.message == "Could not locate import: greenlet"]) > 0 library = compute.Library(pypi=compute.PythonPyPiLibrary(package="greenlet")) job_with_pytest_library = make_job(notebook_path=notebook, libraries=[library]) - problems = simple_ctx.workflow_linter.lint_job(job_with_pytest_library.job_id) + problems, _dfsas = simple_ctx.workflow_linter.lint_job(job_with_pytest_library.job_id) assert len([problem for problem in problems if problem.message == "Could not locate import: greenlet"]) == 0 @@ -305,7 +304,7 @@ def test_workflow_linter_lints_job_with_workspace_requirements_dependency( notebook = make_notebook(path=f"{entrypoint}/notebook.ipynb", content=python_code.encode("utf-8")) job_with_pytest_library = make_job(notebook_path=notebook, libraries=[library]) - problems = simple_ctx.workflow_linter.lint_job(job_with_pytest_library.job_id) + problems, _dfsas = simple_ctx.workflow_linter.lint_job(job_with_pytest_library.job_id) messages = tuple(problem.message for problem in problems) expected_messages = ( "ERROR: Could not find a version that satisfies the requirement a_package_that_does_not_exist", @@ -339,7 +338,7 @@ def test_workflow_linter_lints_job_with_dbfs_requirements_dependency( notebook = make_notebook(path=f"{entrypoint}/notebook.ipynb", content=python_code.encode("utf-8")) job_with_pytest_library = make_job(notebook_path=notebook, libraries=[library]) - problems = simple_ctx.workflow_linter.lint_job(job_with_pytest_library.job_id) + problems, _dfsas = simple_ctx.workflow_linter.lint_job(job_with_pytest_library.job_id) messages = tuple(problem.message for problem in problems) expected_messages = ( "ERROR: Could not find a version that satisfies the requirement a_package_that_does_not_exist", @@ -371,7 +370,7 @@ def test_workflow_linter_lints_job_with_workspace_egg_dependency( notebook = make_notebook(path=f"{entrypoint}/notebook.ipynb", content=b"import thingy\n") job_with_egg_dependency = make_job(notebook_path=notebook, libraries=[library]) - problems = simple_ctx.workflow_linter.lint_job(job_with_egg_dependency.job_id) + problems, _dfsas = simple_ctx.workflow_linter.lint_job(job_with_egg_dependency.job_id) assert not [problem for problem in problems if problem.message == expected_problem_message] @@ -396,7 +395,7 @@ def test_workflow_linter_lints_job_with_dbfs_egg_dependency( notebook = make_notebook(path=f"{entrypoint}/notebook.ipynb", content=b"import thingy\n") job_with_egg_dependency = make_job(notebook_path=notebook, libraries=[library]) - problems = simple_ctx.workflow_linter.lint_job(job_with_egg_dependency.job_id) + problems,_dfsas = 
simple_ctx.workflow_linter.lint_job(job_with_egg_dependency.job_id) assert not [problem for problem in problems if problem.message == expected_problem_message] @@ -414,7 +413,7 @@ def test_workflow_linter_lints_job_with_missing_library(simple_ctx, make_job, ma notebook = make_notebook(path=f"{make_directory()}/notebook.ipynb", content=b"import databricks.labs.ucx") job_without_ucx_library = make_job(notebook_path=notebook) - problems = simple_ctx.workflow_linter.lint_job(job_without_ucx_library.job_id) + problems, _dfsas = simple_ctx.workflow_linter.lint_job(job_without_ucx_library.job_id) assert len([problem for problem in problems if problem.message == expected_problem_message]) > 0 allow_list.module_compatibility.assert_called_once_with("databricks.labs.ucx") @@ -435,7 +434,7 @@ def test_workflow_linter_lints_job_with_wheel_dependency(simple_ctx, make_job, m notebook = make_notebook(path=f"{make_directory()}/notebook.ipynb", content=b"import databricks.labs.ucx") job_with_ucx_library = make_job(notebook_path=notebook, libraries=[library]) - problems = simple_ctx.workflow_linter.lint_job(job_with_ucx_library.job_id) + problems, _dfsas = simple_ctx.workflow_linter.lint_job(job_with_ucx_library.job_id) assert len([problem for problem in problems if problem.message == expected_problem_message]) == 0 @@ -463,7 +462,7 @@ def test_job_spark_python_task_linter_happy_path( ) j = make_job(tasks=[task]) - problems = simple_ctx.workflow_linter.lint_job(j.job_id) + problems, _dfsas = simple_ctx.workflow_linter.lint_job(j.job_id) assert len([problem for problem in problems if problem.message == "Could not locate import: greenlet"]) == 0 @@ -484,7 +483,7 @@ def test_job_spark_python_task_linter_unhappy_path( ) j = make_job(tasks=[task]) - problems = simple_ctx.workflow_linter.lint_job(j.job_id) + problems, _dfsas = simple_ctx.workflow_linter.lint_job(j.job_id) assert len([problem for problem in problems if problem.message == "Could not locate import: greenlet"]) == 1 @@ -516,7 +515,7 @@ def test_workflow_linter_lints_python_wheel_task(simple_ctx, ws, make_job, make_ ) job_with_ucx_library = make_job(tasks=[task]) - problems = simple_ctx.workflow_linter.lint_job(job_with_ucx_library.job_id) + problems, _dfsas = simple_ctx.workflow_linter.lint_job(job_with_ucx_library.job_id) assert len([problem for problem in problems if problem.code == "library-dist-info-not-found"]) == 0 assert len([problem for problem in problems if problem.code == "library-entrypoint-not-found"]) == 0 @@ -542,7 +541,7 @@ def test_job_spark_python_task_workspace_linter_happy_path( ) j = make_job(tasks=[task]) - problems = simple_ctx.workflow_linter.lint_job(j.job_id) + problems, _dfsas = simple_ctx.workflow_linter.lint_job(j.job_id) assert not [problem for problem in problems if problem.message == "Could not locate import: greenlet"] @@ -565,7 +564,7 @@ def test_job_spark_python_task_dbfs_linter_happy_path( ) j = make_job(tasks=[task]) - problems = simple_ctx.workflow_linter.lint_job(j.job_id) + problems, _dfsas = simple_ctx.workflow_linter.lint_job(j.job_id) assert not [problem for problem in problems if problem.message == "Could not locate import: greenlet"] @@ -593,7 +592,7 @@ def test_job_spark_python_task_linter_notebook_handling( ) j = make_job(tasks=[task]) - problems = simple_ctx.workflow_linter.lint_job(j.job_id) + problems, _dfsas = simple_ctx.workflow_linter.lint_job(j.job_id) # The notebook being linted has 'import greenlet' in a cell that should be ignored, but will trigger this problem if processed. 
assert not [problem for problem in problems if problem.message == "Could not locate import: greenlet"] @@ -618,7 +617,7 @@ def test_job_dlt_task_linter_unhappy_path( ) j = make_job(tasks=[task]) - problems = simple_ctx.workflow_linter.lint_job(j.job_id) + problems, _dfsas = simple_ctx.workflow_linter.lint_job(j.job_id) assert len([problem for problem in problems if problem.message == "Could not locate import: greenlet"]) == 1 @@ -643,7 +642,7 @@ def test_job_dlt_task_linter_happy_path( ) j = make_job(tasks=[task]) - problems = simple_ctx.workflow_linter.lint_job(j.job_id) + problems, _dfsas = simple_ctx.workflow_linter.lint_job(j.job_id) assert len([problem for problem in problems if problem.message == "Could not locate import: greenlet"]) == 0 @@ -657,7 +656,7 @@ def test_job_dependency_problem_egg_dbr14plus(make_job, make_directory, make_not j = make_job(libraries=[library]) - problems = simple_ctx.workflow_linter.lint_job(j.job_id) + problems, _dfsas = simple_ctx.workflow_linter.lint_job(j.job_id) assert ( len( [ From b605b777beb73b2fe3172c7236a043e230e81a14 Mon Sep 17 00:00:00 2001 From: Eric Vergnaud Date: Mon, 9 Sep 2024 17:15:38 +0200 Subject: [PATCH 59/80] formatting --- tests/integration/source_code/test_jobs.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/integration/source_code/test_jobs.py b/tests/integration/source_code/test_jobs.py index d63941a93e..db53bbe9c7 100644 --- a/tests/integration/source_code/test_jobs.py +++ b/tests/integration/source_code/test_jobs.py @@ -395,7 +395,7 @@ def test_workflow_linter_lints_job_with_dbfs_egg_dependency( notebook = make_notebook(path=f"{entrypoint}/notebook.ipynb", content=b"import thingy\n") job_with_egg_dependency = make_job(notebook_path=notebook, libraries=[library]) - problems,_dfsas = simple_ctx.workflow_linter.lint_job(job_with_egg_dependency.job_id) + problems, _dfsas = simple_ctx.workflow_linter.lint_job(job_with_egg_dependency.job_id) assert not [problem for problem in problems if problem.message == expected_problem_message] From 7f9fa061ea714d2915e38500d3ae5a27fe12a5d1 Mon Sep 17 00:00:00 2001 From: Eric Vergnaud Date: Mon, 9 Sep 2024 17:27:09 +0200 Subject: [PATCH 60/80] fix failing tests --- src/databricks/labs/ucx/source_code/jobs.py | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/src/databricks/labs/ucx/source_code/jobs.py b/src/databricks/labs/ucx/source_code/jobs.py index d59dc394f9..a42b499a11 100644 --- a/src/databricks/labs/ucx/source_code/jobs.py +++ b/src/databricks/labs/ucx/source_code/jobs.py @@ -11,7 +11,7 @@ from urllib import parse from databricks.labs.blueprint.parallel import ManyError, Threads -from databricks.labs.blueprint.paths import DBFSPath +from databricks.labs.blueprint.paths import DBFSPath, WorkspacePath from databricks.labs.lsql.backends import SqlBackend from databricks.sdk import WorkspaceClient from databricks.sdk.errors import NotFound @@ -524,7 +524,12 @@ def _collect_from_notebook( ) -> Iterable[DirectFsAccess]: notebook = Notebook.parse(path, source, language.language) for cell in notebook.cells: - src_timestamp = int(path.stat().st_mtime) + if isinstance(path, WorkspacePath): + # TODO add modified_at property in lsql, see https://github.com/databrickslabs/lsql/issues/268 + # pylint: disable=protected-access + src_timestamp = path._object_info.modified_at + else: + src_timestamp = int(path.stat().st_mtime) src_id = str(path) src_lineage = self.lineage_str for dfsa in self._collect_from_source(cell.original_code, cell.language, 
path, inherited_tree): @@ -548,7 +553,12 @@ def _collect_from_source( return src_id = str(path) src_lineage = self.lineage_str - src_timestamp = int(path.stat().st_mtime) + if isinstance(path, WorkspacePath): + # TODO add modified_at property in lsql, see https://github.com/databrickslabs/lsql/issues/268 + # pylint: disable=protected-access + src_timestamp = path._object_info.modified_at + else: + src_timestamp = int(path.stat().st_mtime) for dfsa in iterable: yield dfsa.replace_source(source_id=src_id, source_lineage=src_lineage, source_timestamp=src_timestamp) From f7108cb35fa7ed04ef38f474ad81da066b5df659 Mon Sep 17 00:00:00 2001 From: Eric Vergnaud Date: Mon, 9 Sep 2024 17:46:45 +0200 Subject: [PATCH 61/80] fix failing tests --- src/databricks/labs/ucx/mixins/fixtures.py | 2 +- src/databricks/labs/ucx/source_code/jobs.py | 2 +- tests/integration/source_code/test_jobs.py | 3 +-- 3 files changed, 3 insertions(+), 4 deletions(-) diff --git a/src/databricks/labs/ucx/mixins/fixtures.py b/src/databricks/labs/ucx/mixins/fixtures.py index abc6556fec..f4de9e0351 100644 --- a/src/databricks/labs/ucx/mixins/fixtures.py +++ b/src/databricks/labs/ucx/mixins/fixtures.py @@ -831,7 +831,7 @@ def create(notebook_path: str | Path | None = None, **kwargs): job = ws.jobs.create(**kwargs) logger.info(f"Job: {ws.config.host}#job/{job.job_id}") - return job + return ws.jobs.get(job.job_id) yield from factory("job", create, lambda item: ws.jobs.delete(item.job_id)) diff --git a/src/databricks/labs/ucx/source_code/jobs.py b/src/databricks/labs/ucx/source_code/jobs.py index a42b499a11..b09ff7088a 100644 --- a/src/databricks/labs/ucx/source_code/jobs.py +++ b/src/databricks/labs/ucx/source_code/jobs.py @@ -554,7 +554,7 @@ def _collect_from_source( src_id = str(path) src_lineage = self.lineage_str if isinstance(path, WorkspacePath): - # TODO add modified_at property in lsql, see https://github.com/databrickslabs/lsql/issues/268 + # TODO add stats method in blueprint, see https://github.com/databrickslabs/blueprint/issues/142 # pylint: disable=protected-access src_timestamp = path._object_info.modified_at else: diff --git a/tests/integration/source_code/test_jobs.py b/tests/integration/source_code/test_jobs.py index db53bbe9c7..11fde9dfe8 100644 --- a/tests/integration/source_code/test_jobs.py +++ b/tests/integration/source_code/test_jobs.py @@ -157,12 +157,11 @@ def test_job_linter_some_notebook_graph_with_problems(simple_ctx, ws, make_job, assert len(dfsas) == 2 task_keys = set(task.task_key for task in j.settings.tasks) for dfsa in dfsas: - assert dfsa.source_type != DirectFsAccess.UNKNOWN assert dfsa.source_id != DirectFsAccess.UNKNOWN assert dfsa.source_lineage != DirectFsAccess.UNKNOWN assert dfsa.source_timestamp != -1 assert dfsa.job_id == j.job_id - assert dfsa.job_name == j.job_name + assert dfsa.job_name == j.settings.name assert dfsa.task_key in task_keys assert dfsa.assessment_start_timestamp != -1 assert dfsa.assessment_end_timestamp != -1 From abcab8773be2b0ca466fa15503bcbc4539f385af Mon Sep 17 00:00:00 2001 From: Eric Vergnaud Date: Mon, 9 Sep 2024 17:54:43 +0200 Subject: [PATCH 62/80] fix failing tests --- src/databricks/labs/ucx/source_code/jobs.py | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/src/databricks/labs/ucx/source_code/jobs.py b/src/databricks/labs/ucx/source_code/jobs.py index b09ff7088a..6cb3daf4b1 100644 --- a/src/databricks/labs/ucx/source_code/jobs.py +++ b/src/databricks/labs/ucx/source_code/jobs.py @@ -527,7 +527,11 @@ def _collect_from_notebook( 
if isinstance(path, WorkspacePath): # TODO add modified_at property in lsql, see https://github.com/databrickslabs/lsql/issues/268 # pylint: disable=protected-access - src_timestamp = path._object_info.modified_at + src_timestamp = path._object_info.modified_at or -1 + elif isinstance(path, DBFSPath): + # TODO add stats method in blueprint, see https://github.com/databrickslabs/blueprint/issues/143 + # pylint: disable=protected-access + src_timestamp = path._file_info.modification_time or -1 else: src_timestamp = int(path.stat().st_mtime) src_id = str(path) @@ -556,7 +560,11 @@ def _collect_from_source( if isinstance(path, WorkspacePath): # TODO add stats method in blueprint, see https://github.com/databrickslabs/blueprint/issues/142 # pylint: disable=protected-access - src_timestamp = path._object_info.modified_at + src_timestamp = path._object_info.modified_at or -1 + elif isinstance(path, DBFSPath): + # TODO add stats method in blueprint, see https://github.com/databrickslabs/blueprint/issues/143 + # pylint: disable=protected-access + src_timestamp = path._file_info.modification_time or -1 else: src_timestamp = int(path.stat().st_mtime) for dfsa in iterable: From eb603e4e65e3c5eb837477dfae3af9fbe7b775be Mon Sep 17 00:00:00 2001 From: Eric Vergnaud Date: Tue, 10 Sep 2024 09:37:25 +0200 Subject: [PATCH 63/80] catch infinite recursion --- src/databricks/labs/ucx/source_code/graph.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/src/databricks/labs/ucx/source_code/graph.py b/src/databricks/labs/ucx/source_code/graph.py index 0f4804db67..1ed9e2a360 100644 --- a/src/databricks/labs/ucx/source_code/graph.py +++ b/src/databricks/labs/ucx/source_code/graph.py @@ -203,6 +203,12 @@ def root_relative_names(self) -> set[str]: # when visit_node returns True it interrupts the visit def visit(self, visit_node: Callable[[DependencyGraph], bool | None], visited: set[Path] | None) -> bool: + try: + return self._visit(visit_node, visited) + except RecursionError as e: + return False + + def _visit(self, visit_node: Callable[[DependencyGraph], bool | None], visited: set[Path] | None) -> bool: """provide visited set if you want to ensure nodes are only visited once""" if visited is not None: path = self.dependency.path From 837fe6e9473d9f2a4b58d11e4577abffbe9db44f Mon Sep 17 00:00:00 2001 From: Eric Vergnaud Date: Tue, 10 Sep 2024 09:37:44 +0200 Subject: [PATCH 64/80] drop legacy code --- src/databricks/labs/ucx/source_code/jobs.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/databricks/labs/ucx/source_code/jobs.py b/src/databricks/labs/ucx/source_code/jobs.py index 6cb3daf4b1..a43de3af16 100644 --- a/src/databricks/labs/ucx/source_code/jobs.py +++ b/src/databricks/labs/ucx/source_code/jobs.py @@ -456,12 +456,10 @@ def _collect_task_dfsas( self, task: jobs.Task, job: jobs.Job, graph: DependencyGraph, session_state: CurrentSessionState ) -> Iterable[DirectFsAccess]: collector = DfsaCollectorWalker(graph, set(), self._path_lookup, session_state) - dfsas: list[DirectFsAccess] = [] assert job.settings is not None # as already done in _lint_job job_name = job.settings.name for dfsa in collector: yield dfsa.replace_job_infos(job_id=job.job_id, job_name=job_name, task_key=task.task_key) - self._directfs_crawlers.for_paths().append(dfsas) class LintingWalker(DependencyGraphWalker[LocatedAdvice]): From e80b6f034bbf30dbb4b75b655623433d008086c7 Mon Sep 17 00:00:00 2001 From: Eric Vergnaud Date: Tue, 10 Sep 2024 11:54:44 +0200 Subject: [PATCH 65/80] Revert "catch infinite recursion" This reverts 
commit eb603e4e65e3c5eb837477dfae3af9fbe7b775be. --- src/databricks/labs/ucx/source_code/graph.py | 6 ------ 1 file changed, 6 deletions(-) diff --git a/src/databricks/labs/ucx/source_code/graph.py b/src/databricks/labs/ucx/source_code/graph.py index 1ed9e2a360..0f4804db67 100644 --- a/src/databricks/labs/ucx/source_code/graph.py +++ b/src/databricks/labs/ucx/source_code/graph.py @@ -203,12 +203,6 @@ def root_relative_names(self) -> set[str]: # when visit_node returns True it interrupts the visit def visit(self, visit_node: Callable[[DependencyGraph], bool | None], visited: set[Path] | None) -> bool: - try: - return self._visit(visit_node, visited) - except RecursionError as e: - return False - - def _visit(self, visit_node: Callable[[DependencyGraph], bool | None], visited: set[Path] | None) -> bool: """provide visited set if you want to ensure nodes are only visited once""" if visited is not None: path = self.dependency.path From f8cdc22030750424fa0b0988eadb8d7d6d3370c7 Mon Sep 17 00:00:00 2001 From: Eric Vergnaud Date: Tue, 10 Sep 2024 16:50:44 +0200 Subject: [PATCH 66/80] Use structured lineage for DependencyGraph (#2556) ## Changes Use structured lineage for DependencyGraph ### Linked issues Resolves #2550 ### Functionality None ### Tests - [x] added unit tests --------- Co-authored-by: Eric Vergnaud --- src/databricks/labs/ucx/source_code/graph.py | 32 ++++++++++++---- src/databricks/labs/ucx/source_code/jobs.py | 40 +++++++++++--------- tests/unit/source_code/test_jobs.py | 32 ++++++++++++++-- 3 files changed, 77 insertions(+), 27 deletions(-) diff --git a/src/databricks/labs/ucx/source_code/graph.py b/src/databricks/labs/ucx/source_code/graph.py index 0f4804db67..6ad90965d6 100644 --- a/src/databricks/labs/ucx/source_code/graph.py +++ b/src/databricks/labs/ucx/source_code/graph.py @@ -1,6 +1,8 @@ from __future__ import annotations import abc +import itertools +import json import logging from dataclasses import dataclass from pathlib import Path @@ -291,11 +293,10 @@ class DependencyGraphContext: class Dependency: - def __init__(self, loader: DependencyLoader, path: Path, inherits_context=True, lineage_str: str | None = None): + def __init__(self, loader: DependencyLoader, path: Path, inherits_context=True): self._loader = loader self._path = path self._inherits_context = inherits_context - self._lineage_str = lineage_str or '"' + self._path.as_posix() + '"' @property def path(self) -> Path: @@ -318,8 +319,8 @@ def __repr__(self): return f"Dependency<{self.path}>" @property - def lineage_str(self): - return self._lineage_str + def lineage(self) -> list[LineageAtom]: + return [LineageAtom("path", str(self.path))] class SourceContainer(abc.ABC): @@ -589,6 +590,23 @@ def finalize(self) -> InheritedContext: return InheritedContext(tree, self.found) +@dataclass +class LineageAtom: + + @staticmethod + def atoms_to_json_string(atoms: list[LineageAtom]): + json_lists = list(lineage.as_objects() for lineage in atoms) + json_obj = list(itertools.chain(*json_lists)) + return json.dumps(json_obj) + + object_type: str + object_id: str + other: dict[str, str] | None = None + + def as_objects(self) -> list[dict[str, str]]: + return [{"object_type": self.object_type, "object_id": self.object_id, **(self.other or {})}] + + T = TypeVar("T") @@ -634,6 +652,6 @@ def _process_dependency( ) -> Iterable[T]: ... 
@property - def lineage_str(self): - parts = [dependency.lineage_str for dependency in self._lineage] - return "->".join(parts) + def lineage(self) -> list[LineageAtom]: + lineages = [dependency.lineage for dependency in self._lineage] + return list(itertools.chain(*lineages)) diff --git a/src/databricks/labs/ucx/source_code/jobs.py b/src/databricks/labs/ucx/source_code/jobs.py index a43de3af16..8cf02441a5 100644 --- a/src/databricks/labs/ucx/source_code/jobs.py +++ b/src/databricks/labs/ucx/source_code/jobs.py @@ -37,6 +37,7 @@ SourceContainer, WrappingLoader, DependencyGraphWalker, + LineageAtom, ) from databricks.labs.ucx.source_code.linters.context import LinterContext from databricks.labs.ucx.source_code.linters.directfs import DirectFsAccessPyLinter, DirectFsAccessSqlLinter @@ -68,10 +69,8 @@ def as_message(self) -> str: class WorkflowTask(Dependency): def __init__(self, ws: WorkspaceClient, task: jobs.Task, job: jobs.Job): - # concat job and task for lineage, see DependencyGraphWalker.lineage_str - lineage_str = f'"job:{job.job_id}"->"task:{task.task_key}"' loader = WrappingLoader(WorkflowTaskContainer(ws, task, job)) - super().__init__(loader, Path(f'/jobs/{task.task_key}'), inherits_context=False, lineage_str=lineage_str) + super().__init__(loader, Path(f'/jobs/{task.task_key}'), inherits_context=False) self._task = task self._job = job @@ -81,6 +80,13 @@ def load(self, path_lookup: PathLookup) -> SourceContainer | None: def __repr__(self): return f'WorkflowTask<{self._task.task_key} of {self._job.settings.name}>' + @property + def lineage(self) -> list[LineageAtom]: + job_name = ("" if self._job.settings is None else self._job.settings.name) or "unknown job" + job_lineage = LineageAtom("job", str(self._job.job_id), {"name": job_name}) + task_lineage = LineageAtom("task", self._task.task_key) + return [job_lineage, task_lineage] + class WorkflowTaskContainer(SourceContainer): def __init__(self, ws: WorkspaceClient, task: jobs.Task, job: jobs.Job): @@ -521,19 +527,19 @@ def _collect_from_notebook( self, source: str, language: CellLanguage, path: Path, inherited_tree: Tree | None ) -> Iterable[DirectFsAccess]: notebook = Notebook.parse(path, source, language.language) + if isinstance(path, WorkspacePath): + # TODO add stats method in blueprint, see https://github.com/databrickslabs/blueprint/issues/142 + # pylint: disable=protected-access + src_timestamp = path._object_info.modified_at or -1 + elif isinstance(path, DBFSPath): + # TODO add stats method in blueprint, see https://github.com/databrickslabs/blueprint/issues/143 + # pylint: disable=protected-access + src_timestamp = path._file_info.modification_time or -1 + else: + src_timestamp = int(path.stat().st_mtime) + src_id = str(path) + src_lineage = LineageAtom.atoms_to_json_string(self.lineage) for cell in notebook.cells: - if isinstance(path, WorkspacePath): - # TODO add modified_at property in lsql, see https://github.com/databrickslabs/lsql/issues/268 - # pylint: disable=protected-access - src_timestamp = path._object_info.modified_at or -1 - elif isinstance(path, DBFSPath): - # TODO add stats method in blueprint, see https://github.com/databrickslabs/blueprint/issues/143 - # pylint: disable=protected-access - src_timestamp = path._file_info.modification_time or -1 - else: - src_timestamp = int(path.stat().st_mtime) - src_id = str(path) - src_lineage = self.lineage_str for dfsa in self._collect_from_source(cell.original_code, cell.language, path, inherited_tree): yield dfsa.replace_source(source_id=src_id, 
source_lineage=src_lineage, source_timestamp=src_timestamp) if cell.language is CellLanguage.PYTHON: @@ -553,8 +559,6 @@ def _collect_from_source( if iterable is None: logger.warning(f"Language {language.name} not supported yet!") return - src_id = str(path) - src_lineage = self.lineage_str if isinstance(path, WorkspacePath): # TODO add stats method in blueprint, see https://github.com/databrickslabs/blueprint/issues/142 # pylint: disable=protected-access @@ -565,6 +569,8 @@ def _collect_from_source( src_timestamp = path._file_info.modification_time or -1 else: src_timestamp = int(path.stat().st_mtime) + src_id = str(path) + src_lineage = LineageAtom.atoms_to_json_string(self.lineage) for dfsa in iterable: yield dfsa.replace_source(source_id=src_id, source_lineage=src_lineage, source_timestamp=src_timestamp) diff --git a/tests/unit/source_code/test_jobs.py b/tests/unit/source_code/test_jobs.py index e2293bd375..7b441764b2 100644 --- a/tests/unit/source_code/test_jobs.py +++ b/tests/unit/source_code/test_jobs.py @@ -1,11 +1,12 @@ import io +import itertools import logging import textwrap from pathlib import Path from unittest.mock import create_autospec import pytest -from databricks.sdk.service.jobs import Job, SparkPythonTask +from databricks.sdk.service.jobs import Job, SparkPythonTask, JobSettings, Task from databricks.sdk.service.pipelines import NotebookLibrary, GetPipelineResponse, PipelineLibrary, FileLibrary from databricks.labs.blueprint.paths import DBFSPath, WorkspacePath @@ -18,8 +19,13 @@ from databricks.sdk.service.workspace import ExportFormat from databricks.labs.ucx.source_code.linters.files import FileLoader, ImportFileResolver -from databricks.labs.ucx.source_code.graph import Dependency, DependencyGraph, DependencyResolver -from databricks.labs.ucx.source_code.jobs import JobProblem, WorkflowLinter, WorkflowTaskContainer +from databricks.labs.ucx.source_code.graph import ( + Dependency, + DependencyGraph, + DependencyResolver, + LineageAtom, +) +from databricks.labs.ucx.source_code.jobs import JobProblem, WorkflowLinter, WorkflowTaskContainer, WorkflowTask from databricks.labs.ucx.source_code.notebooks.loaders import NotebookResolver, NotebookLoader @@ -512,3 +518,23 @@ def test_xxx(graph): assert workflow_task_container.spark_conf == {"spark.databricks.cluster.profile": "singleNode"} ws.assert_not_called() + + +def test_full_lineage_is_converted_to_json(): + ws = create_autospec(WorkspaceClient) + ws.assert_not_called() + task = Task(task_key="task-key") + settings = JobSettings(name="job-name") + job = create_autospec(jobs.Job) + job.job_id = "job-id" + job.settings = settings + wtask = WorkflowTask(ws, task, job) + full_lineage = list(itertools.chain(wtask.lineage, [LineageAtom("path", "abc"), LineageAtom("path", "xyz")])) + json_str = LineageAtom.atoms_to_json_string(full_lineage) + job.assert_not_called() + assert json_str == ( + '[{"object_type": "job", "object_id": "job-id", "name": "job-name"}, ' + '{"object_type": "task", "object_id": "task-key"}, ' + '{"object_type": "path", "object_id": "abc"}, ' + '{"object_type": "path", "object_id": "xyz"}]' + ) From 8cb4ac04ae9c05df94d5d1ef71a1af8d77e5a1df Mon Sep 17 00:00:00 2001 From: Eric Vergnaud Date: Wed, 11 Sep 2024 16:54:30 +0200 Subject: [PATCH 67/80] fix merge issues --- src/databricks/labs/ucx/source_code/linters/pyspark.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/databricks/labs/ucx/source_code/linters/pyspark.py b/src/databricks/labs/ucx/source_code/linters/pyspark.py 
index c442af9fd6..86cc274b87 100644 --- a/src/databricks/labs/ucx/source_code/linters/pyspark.py +++ b/src/databricks/labs/ucx/source_code/linters/pyspark.py @@ -238,10 +238,10 @@ def __init__(self, dfsa_matchers_only: bool): # see https://spark.apache.org/docs/latest/api/python/reference/pyspark.sql/api/pyspark.sql.SparkSession.html # spark.sql is handled by a dedicated linter - spark_session_matchers = [SparkCallMatcher("table", 1, 1, 0)] + spark_session_matchers: list[_TableNameMatcher] = [SparkCallMatcher("table", 1, 1, 0)] # see https://spark.apache.org/docs/latest/api/python/reference/pyspark.sql/api/pyspark.sql.Catalog.html - spark_catalog_matchers = [ + spark_catalog_matchers: list[_TableNameMatcher] = [ SparkCallMatcher("cacheTable", 1, 2, 0, "tableName"), SparkCallMatcher("createTable", 1, 1000, 0, "tableName"), SparkCallMatcher("createExternalTable", 1, 1000, 0, "tableName"), @@ -256,7 +256,7 @@ def __init__(self, dfsa_matchers_only: bool): ] # see https://spark.apache.org/docs/latest/api/python/reference/pyspark.sql/api/pyspark.sql.DataFrame.html - spark_dataframe_matchers = [ + spark_dataframe_matchers: list[_TableNameMatcher] = [ SparkCallMatcher("writeTo", 1, 1, 0), ] @@ -270,12 +270,12 @@ def __init__(self, dfsa_matchers_only: bool): # nothing to migrate in Window, see https://spark.apache.org/docs/latest/api/python/reference/pyspark.sql/api/pyspark.sql.Window.html # see https://spark.apache.org/docs/latest/api/python/reference/pyspark.sql/api/pyspark.sql.DataFrameReader.html - spark_dataframereader_matchers = [ + spark_dataframereader_matchers: list[_TableNameMatcher] = [ SparkCallMatcher("table", 1, 1, 0), # TODO good example of collision, see spark_session_calls ] # see https://spark.apache.org/docs/latest/api/python/reference/pyspark.sql/api/pyspark.sql.DataFrameWriter.html - spark_dataframewriter_matchers = [ + spark_dataframewriter_matchers: list[_TableNameMatcher] = [ SparkCallMatcher("insertInto", 1, 2, 0, "tableName"), # TODO jdbc: could the url be a databricks url, raise warning ? 
SparkCallMatcher("saveAsTable", 1, 4, 0, "name"), From 185d06094977330abdb7118d5634a54017361b6b Mon Sep 17 00:00:00 2001 From: Eric Vergnaud Date: Thu, 12 Sep 2024 09:33:49 +0200 Subject: [PATCH 68/80] Update src/databricks/labs/ucx/source_code/base.py Co-authored-by: Serge Smertin <259697+nfx@users.noreply.github.com> --- src/databricks/labs/ucx/source_code/base.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/databricks/labs/ucx/source_code/base.py b/src/databricks/labs/ucx/source_code/base.py index a99b3a5a86..1ee2917d21 100644 --- a/src/databricks/labs/ucx/source_code/base.py +++ b/src/databricks/labs/ucx/source_code/base.py @@ -349,13 +349,13 @@ class DirectFsAccess: is_read: bool is_write: bool source_id: str = UNKNOWN - source_timestamp: int = -1 - source_lineage: str = UNKNOWN + source_timestamp: datetime.datetime + source_lineage: list[LineageAtom] job_id: int = -1 job_name: str = UNKNOWN task_key: str = UNKNOWN - assessment_start_timestamp: int = -1 - assessment_end_timestamp: int = -1 + assessment_start_timestamp: datetime.datetime + assessment_end_timestamp: datetime.datetime def replace_source( self, From 1742415787038a27098ca14912fc183a0009eebd Mon Sep 17 00:00:00 2001 From: Eric Vergnaud Date: Thu, 12 Sep 2024 10:36:47 +0200 Subject: [PATCH 69/80] refactor DirectFsAccess --- .../labs/ucx/contexts/application.py | 2 +- src/databricks/labs/ucx/source_code/base.py | 74 ---------- .../labs/ucx/source_code/directfs_access.py | 136 ++++++++++++++++++ .../source_code/directfs_access_crawler.py | 49 ------- src/databricks/labs/ucx/source_code/graph.py | 23 +-- src/databricks/labs/ucx/source_code/jobs.py | 28 ++-- .../labs/ucx/source_code/linters/directfs.py | 2 +- tests/integration/source_code/test_jobs.py | 3 +- ...ess_crawler.py => test_directfs_access.py} | 13 +- tests/unit/source_code/test_jobs.py | 28 +--- 10 files changed, 165 insertions(+), 193 deletions(-) create mode 100644 src/databricks/labs/ucx/source_code/directfs_access.py delete mode 100644 src/databricks/labs/ucx/source_code/directfs_access_crawler.py rename tests/unit/source_code/{test_directfs_access_crawler.py => test_directfs_access.py} (60%) diff --git a/src/databricks/labs/ucx/contexts/application.py b/src/databricks/labs/ucx/contexts/application.py index 6ba45ef469..6dc9a649d1 100644 --- a/src/databricks/labs/ucx/contexts/application.py +++ b/src/databricks/labs/ucx/contexts/application.py @@ -15,7 +15,7 @@ from databricks.labs.ucx.recon.metadata_retriever import DatabricksTableMetadataRetriever from databricks.labs.ucx.recon.migration_recon import MigrationRecon from databricks.labs.ucx.recon.schema_comparator import StandardSchemaComparator -from databricks.labs.ucx.source_code.directfs_access_crawler import DirectFsAccessCrawlers +from databricks.labs.ucx.source_code.directfs_access import DirectFsAccessCrawlers from databricks.labs.ucx.source_code.python_libraries import PythonLibraryResolver from databricks.sdk import AccountClient, WorkspaceClient, core from databricks.sdk.errors import ResourceDoesNotExist diff --git a/src/databricks/labs/ucx/source_code/base.py b/src/databricks/labs/ucx/source_code/base.py index 1ee2917d21..1a21264d5d 100644 --- a/src/databricks/labs/ucx/source_code/base.py +++ b/src/databricks/labs/ucx/source_code/base.py @@ -337,77 +337,3 @@ def is_a_notebook(path: Path, content: str | None = None) -> bool: logger.warning(f"Could not read file {path}") return False return file_header == magic_header - - -@dataclass -class DirectFsAccess: - """A record 
describing a Direct File System Access""" - - UNKNOWN = "unknown" - - path: str - is_read: bool - is_write: bool - source_id: str = UNKNOWN - source_timestamp: datetime.datetime - source_lineage: list[LineageAtom] - job_id: int = -1 - job_name: str = UNKNOWN - task_key: str = UNKNOWN - assessment_start_timestamp: datetime.datetime - assessment_end_timestamp: datetime.datetime - - def replace_source( - self, - source_id: str | None = None, - source_lineage: str | None = None, - source_timestamp: int | None = None, - ): - return DirectFsAccess( - path=self.path, - is_read=self.is_read, - is_write=self.is_write, - source_id=source_id or self.source_id, - source_timestamp=source_timestamp or self.source_timestamp, - source_lineage=source_lineage or self.source_lineage, - job_id=self.job_id, - job_name=self.job_name, - task_key=self.task_key, - assessment_start_timestamp=self.assessment_start_timestamp, - assessment_end_timestamp=self.assessment_start_timestamp, - ) - - def replace_job_infos( - self, - job_id: int | None = None, - job_name: str | None = None, - task_key: str | None = None, - ): - return DirectFsAccess( - path=self.path, - is_read=self.is_read, - is_write=self.is_write, - source_id=self.source_id, - source_timestamp=self.source_timestamp, - source_lineage=self.source_lineage, - job_id=job_id or self.job_id, - job_name=job_name or self.job_name, - task_key=task_key or self.task_key, - assessment_start_timestamp=self.assessment_start_timestamp, - assessment_end_timestamp=self.assessment_start_timestamp, - ) - - def replace_assessment_infos(self, assessment_start: int | None = None, assessment_end: int | None = None): - return DirectFsAccess( - path=self.path, - is_read=self.is_read, - is_write=self.is_write, - source_id=self.source_id, - source_timestamp=self.source_timestamp, - source_lineage=self.source_lineage, - job_id=self.job_id, - job_name=self.job_name, - task_key=self.task_key, - assessment_start_timestamp=assessment_start or self.assessment_start_timestamp, - assessment_end_timestamp=assessment_end or self.assessment_start_timestamp, - ) diff --git a/src/databricks/labs/ucx/source_code/directfs_access.py b/src/databricks/labs/ucx/source_code/directfs_access.py new file mode 100644 index 0000000000..954d5094a2 --- /dev/null +++ b/src/databricks/labs/ucx/source_code/directfs_access.py @@ -0,0 +1,136 @@ +from __future__ import annotations + + +import logging +from collections.abc import Sequence, Iterable +from dataclasses import dataclass, field +from datetime import datetime + +from databricks.labs.ucx.framework.crawlers import CrawlerBase, Result +from databricks.labs.lsql.backends import SqlBackend +from databricks.sdk.errors import DatabricksError + +logger = logging.getLogger(__name__) + + +@dataclass +class LineageAtom: + + object_type: str + object_id: str + other: dict[str, str] | None = None + + +@dataclass +class DirectFsAccess: + """A record describing a Direct File System Access""" + + UNKNOWN = "unknown" + + path: str + is_read: bool + is_write: bool + source_id: str = UNKNOWN + source_timestamp: datetime = datetime.fromtimestamp(-1) + source_lineage: list[LineageAtom] = field(default_factory=list) + job_id: int = -1 + job_name: str = UNKNOWN + task_key: str = UNKNOWN + assessment_start_timestamp: datetime = datetime.fromtimestamp(-1) + assessment_end_timestamp: datetime = datetime.fromtimestamp(-1) + + def replace_source( + self, + source_id: str | None = None, + source_lineage: list[LineageAtom] | None = None, + source_timestamp: datetime | None = None, + ): 
+ return DirectFsAccess( + path=self.path, + is_read=self.is_read, + is_write=self.is_write, + source_id=source_id or self.source_id, + source_timestamp=source_timestamp or self.source_timestamp, + source_lineage=source_lineage or self.source_lineage, + job_id=self.job_id, + job_name=self.job_name, + task_key=self.task_key, + assessment_start_timestamp=self.assessment_start_timestamp, + assessment_end_timestamp=self.assessment_start_timestamp, + ) + + def replace_job_infos( + self, + job_id: int | None = None, + job_name: str | None = None, + task_key: str | None = None, + ): + return DirectFsAccess( + path=self.path, + is_read=self.is_read, + is_write=self.is_write, + source_id=self.source_id, + source_timestamp=self.source_timestamp, + source_lineage=self.source_lineage, + job_id=job_id or self.job_id, + job_name=job_name or self.job_name, + task_key=task_key or self.task_key, + assessment_start_timestamp=self.assessment_start_timestamp, + assessment_end_timestamp=self.assessment_start_timestamp, + ) + + def replace_assessment_infos( + self, assessment_start: datetime | None = None, assessment_end: datetime | None = None + ): + return DirectFsAccess( + path=self.path, + is_read=self.is_read, + is_write=self.is_write, + source_id=self.source_id, + source_timestamp=self.source_timestamp, + source_lineage=self.source_lineage, + job_id=self.job_id, + job_name=self.job_name, + task_key=self.task_key, + assessment_start_timestamp=assessment_start or self.assessment_start_timestamp, + assessment_end_timestamp=assessment_end or self.assessment_start_timestamp, + ) + + +class _DirectFsAccessCrawler(CrawlerBase): + + def __init__(self, backend: SqlBackend, schema: str, table: str): + """ + Initializes a DFSACrawler instance. + + Args: + sql_backend (SqlBackend): The SQL Execution Backend abstraction (either REST API or Spark) + schema: The schema name for the inventory persistence. 
+ """ + super().__init__(backend, "hive_metastore", schema, table, DirectFsAccess) + + def append(self, dfsas: Sequence[DirectFsAccess]): + try: + self._append_records(dfsas) + except DatabricksError as e: + logger.error("Failed to store DFSAs", exc_info=e) + + def _try_fetch(self) -> Iterable[DirectFsAccess]: + sql = f"SELECT * FROM {self.full_name}" + yield from self._backend.fetch(sql) + + def _crawl(self) -> Iterable[Result]: + return [] + + +class DirectFsAccessCrawlers: + + def __init__(self, sql_backend: SqlBackend, schema: str): + self._sql_backend = sql_backend + self._schema = schema + + def for_paths(self) -> _DirectFsAccessCrawler: + return _DirectFsAccessCrawler(self._sql_backend, self._schema, "direct_file_system_access_in_paths") + + def for_queries(self) -> _DirectFsAccessCrawler: + return _DirectFsAccessCrawler(self._sql_backend, self._schema, "direct_file_system_access_in_queries") diff --git a/src/databricks/labs/ucx/source_code/directfs_access_crawler.py b/src/databricks/labs/ucx/source_code/directfs_access_crawler.py deleted file mode 100644 index a05c2079db..0000000000 --- a/src/databricks/labs/ucx/source_code/directfs_access_crawler.py +++ /dev/null @@ -1,49 +0,0 @@ -import logging -from collections.abc import Sequence, Iterable - -from databricks.labs.ucx.framework.crawlers import CrawlerBase, Result -from databricks.labs.lsql.backends import SqlBackend -from databricks.sdk.errors import DatabricksError - -from databricks.labs.ucx.source_code.base import DirectFsAccess - -logger = logging.getLogger(__name__) - - -class _DirectFsAccessCrawler(CrawlerBase): - - def __init__(self, backend: SqlBackend, schema: str, table: str): - """ - Initializes a DFSACrawler instance. - - Args: - sql_backend (SqlBackend): The SQL Execution Backend abstraction (either REST API or Spark) - schema: The schema name for the inventory persistence. 
- """ - super().__init__(backend, "hive_metastore", schema, table, DirectFsAccess) - - def append(self, dfsas: Sequence[DirectFsAccess]): - try: - self._append_records(dfsas) - except DatabricksError as e: - logger.error("Failed to store DFSAs", exc_info=e) - - def _try_fetch(self) -> Iterable[DirectFsAccess]: - sql = f"SELECT * FROM {self.full_name}" - yield from self._backend.fetch(sql) - - def _crawl(self) -> Iterable[Result]: - return [] - - -class DirectFsAccessCrawlers: - - def __init__(self, sql_backend: SqlBackend, schema: str): - self._sql_backend = sql_backend - self._schema = schema - - def for_paths(self) -> _DirectFsAccessCrawler: - return _DirectFsAccessCrawler(self._sql_backend, self._schema, "direct_file_system_access_in_paths") - - def for_queries(self) -> _DirectFsAccessCrawler: - return _DirectFsAccessCrawler(self._sql_backend, self._schema, "direct_file_system_access_in_queries") diff --git a/src/databricks/labs/ucx/source_code/graph.py b/src/databricks/labs/ucx/source_code/graph.py index e316125c93..e044b7e26a 100644 --- a/src/databricks/labs/ucx/source_code/graph.py +++ b/src/databricks/labs/ucx/source_code/graph.py @@ -2,7 +2,6 @@ import abc import itertools -import json import logging from dataclasses import dataclass from pathlib import Path @@ -13,6 +12,7 @@ NodeNG, ) from databricks.labs.ucx.source_code.base import Advisory, CurrentSessionState, is_a_notebook +from databricks.labs.ucx.source_code.directfs_access import LineageAtom from databricks.labs.ucx.source_code.python.python_ast import Tree from databricks.labs.ucx.source_code.path_lookup import PathLookup @@ -605,23 +605,6 @@ def finalize(self) -> InheritedContext: return InheritedContext(tree, self.found) -@dataclass -class LineageAtom: - - @staticmethod - def atoms_to_json_string(atoms: list[LineageAtom]): - json_lists = list(lineage.as_objects() for lineage in atoms) - json_obj = list(itertools.chain(*json_lists)) - return json.dumps(json_obj) - - object_type: str - object_id: str - other: dict[str, str] | None = None - - def as_objects(self) -> list[dict[str, str]]: - return [{"object_type": self.object_type, "object_id": self.object_id, **(self.other or {})}] - - T = TypeVar("T") @@ -668,5 +651,5 @@ def _process_dependency( @property def lineage(self) -> list[LineageAtom]: - lineages = [dependency.lineage for dependency in self._lineage] - return list(itertools.chain(*lineages)) + lists: list[list[LineageAtom]] = [dependency.lineage for dependency in self._lineage] + return list(itertools.chain(*lists)) diff --git a/src/databricks/labs/ucx/source_code/jobs.py b/src/databricks/labs/ucx/source_code/jobs.py index 8cf02441a5..144cf0304a 100644 --- a/src/databricks/labs/ucx/source_code/jobs.py +++ b/src/databricks/labs/ucx/source_code/jobs.py @@ -2,10 +2,10 @@ import logging import shutil import tempfile -import time from collections.abc import Generator, Iterable from contextlib import contextmanager from dataclasses import dataclass +from datetime import datetime from importlib import metadata from pathlib import Path from urllib import parse @@ -26,9 +26,8 @@ is_a_notebook, file_language, guess_encoding, - DirectFsAccess, ) -from databricks.labs.ucx.source_code.directfs_access_crawler import DirectFsAccessCrawlers +from databricks.labs.ucx.source_code.directfs_access import DirectFsAccess, LineageAtom, DirectFsAccessCrawlers from databricks.labs.ucx.source_code.graph import ( Dependency, DependencyGraph, @@ -37,7 +36,6 @@ SourceContainer, WrappingLoader, DependencyGraphWalker, - LineageAtom, ) from 
databricks.labs.ucx.source_code.linters.context import LinterContext from databricks.labs.ucx.source_code.linters.directfs import DirectFsAccessPyLinter, DirectFsAccessSqlLinter @@ -417,9 +415,9 @@ def _lint_job(self, job: jobs.Job) -> tuple[list[JobProblem], list[DirectFsAcces end_col=advice.advice.end_col, ) problems.append(job_problem) - assessment_start = int(time.mktime(time.gmtime())) + assessment_start = datetime.now() task_dfsas = self._collect_task_dfsas(task, job, graph, session_state) - assessment_end = int(time.mktime(time.gmtime())) + assessment_end = datetime.now() for dfsa in task_dfsas: dfsa = dfsa.replace_assessment_infos(assessment_start=assessment_start, assessment_end=assessment_end) dfsas.append(dfsa) @@ -530,18 +528,17 @@ def _collect_from_notebook( if isinstance(path, WorkspacePath): # TODO add stats method in blueprint, see https://github.com/databrickslabs/blueprint/issues/142 # pylint: disable=protected-access - src_timestamp = path._object_info.modified_at or -1 + src_timestamp = datetime.fromtimestamp(path._object_info.modified_at or -1) elif isinstance(path, DBFSPath): # TODO add stats method in blueprint, see https://github.com/databrickslabs/blueprint/issues/143 # pylint: disable=protected-access - src_timestamp = path._file_info.modification_time or -1 + src_timestamp = datetime.fromtimestamp(path._file_info.modification_time or -1) else: - src_timestamp = int(path.stat().st_mtime) + src_timestamp = datetime.fromtimestamp(path.stat().st_mtime) src_id = str(path) - src_lineage = LineageAtom.atoms_to_json_string(self.lineage) for cell in notebook.cells: for dfsa in self._collect_from_source(cell.original_code, cell.language, path, inherited_tree): - yield dfsa.replace_source(source_id=src_id, source_lineage=src_lineage, source_timestamp=src_timestamp) + yield dfsa.replace_source(source_id=src_id, source_lineage=self.lineage, source_timestamp=src_timestamp) if cell.language is CellLanguage.PYTHON: if inherited_tree is None: inherited_tree = Tree.new_module() @@ -562,17 +559,16 @@ def _collect_from_source( if isinstance(path, WorkspacePath): # TODO add stats method in blueprint, see https://github.com/databrickslabs/blueprint/issues/142 # pylint: disable=protected-access - src_timestamp = path._object_info.modified_at or -1 + src_timestamp = datetime.fromtimestamp(path._object_info.modified_at or -1) elif isinstance(path, DBFSPath): # TODO add stats method in blueprint, see https://github.com/databrickslabs/blueprint/issues/143 # pylint: disable=protected-access - src_timestamp = path._file_info.modification_time or -1 + src_timestamp = datetime.fromtimestamp(path._file_info.modification_time or -1) else: - src_timestamp = int(path.stat().st_mtime) + src_timestamp = datetime.fromtimestamp(path.stat().st_mtime) src_id = str(path) - src_lineage = LineageAtom.atoms_to_json_string(self.lineage) for dfsa in iterable: - yield dfsa.replace_source(source_id=src_id, source_lineage=src_lineage, source_timestamp=src_timestamp) + yield dfsa.replace_source(source_id=src_id, source_lineage=self.lineage, source_timestamp=src_timestamp) def _collect_from_python(self, source: str, inherited_tree: Tree | None) -> Iterable[DirectFsAccess]: linter = DirectFsAccessPyLinter(self._session_state, prevent_spark_duplicates=False) diff --git a/src/databricks/labs/ucx/source_code/linters/directfs.py b/src/databricks/labs/ucx/source_code/linters/directfs.py index 4a8c01fa67..2eb14b03db 100644 --- a/src/databricks/labs/ucx/source_code/linters/directfs.py +++ 
b/src/databricks/labs/ucx/source_code/linters/directfs.py @@ -13,8 +13,8 @@ CurrentSessionState, PythonLinter, SqlLinter, - DirectFsAccess, ) +from databricks.labs.ucx.source_code.directfs_access import DirectFsAccess from databricks.labs.ucx.source_code.python.python_ast import Tree, TreeVisitor from databricks.labs.ucx.source_code.python.python_infer import InferredValue diff --git a/tests/integration/source_code/test_jobs.py b/tests/integration/source_code/test_jobs.py index 11fde9dfe8..c741322748 100644 --- a/tests/integration/source_code/test_jobs.py +++ b/tests/integration/source_code/test_jobs.py @@ -20,7 +20,8 @@ from databricks.labs.ucx.hive_metastore.migration_status import MigrationIndex from databricks.labs.ucx.mixins.fixtures import get_purge_suffix, factory -from databricks.labs.ucx.source_code.base import CurrentSessionState, DirectFsAccess +from databricks.labs.ucx.source_code.base import CurrentSessionState +from databricks.labs.ucx.source_code.directfs_access import DirectFsAccess from databricks.labs.ucx.source_code.graph import Dependency from databricks.labs.ucx.source_code.known import UNKNOWN, KnownList from databricks.labs.ucx.source_code.linters.files import LocalCodeLinter, FileLoader, FolderLoader diff --git a/tests/unit/source_code/test_directfs_access_crawler.py b/tests/unit/source_code/test_directfs_access.py similarity index 60% rename from tests/unit/source_code/test_directfs_access_crawler.py rename to tests/unit/source_code/test_directfs_access.py index cd38435e66..6adfff7cc8 100644 --- a/tests/unit/source_code/test_directfs_access_crawler.py +++ b/tests/unit/source_code/test_directfs_access.py @@ -1,7 +1,8 @@ +from datetime import datetime + from databricks.labs.lsql.backends import MockBackend -from databricks.labs.ucx.source_code.base import DirectFsAccess -from databricks.labs.ucx.source_code.directfs_access_crawler import DirectFsAccessCrawlers +from databricks.labs.ucx.source_code.directfs_access import DirectFsAccess, DirectFsAccessCrawlers, LineageAtom def test_crawler_appends_dfsas(): @@ -13,13 +14,13 @@ def test_crawler_appends_dfsas(): is_read=False, is_write=False, source_id="ID", - source_timestamp=7452, - source_lineage="LINEAGE", + source_timestamp=datetime.now(), + source_lineage=[LineageAtom(object_type="LINEAGE", object_id="ID")], job_id=222, job_name="JOB", task_key="TASK", - assessment_start_timestamp=123, - assessment_end_timestamp=234, + assessment_start_timestamp=datetime.now(), + assessment_end_timestamp=datetime.now(), ) for path in ("a", "b", "c") ) diff --git a/tests/unit/source_code/test_jobs.py b/tests/unit/source_code/test_jobs.py index 7b441764b2..9dc67f07a3 100644 --- a/tests/unit/source_code/test_jobs.py +++ b/tests/unit/source_code/test_jobs.py @@ -1,17 +1,16 @@ import io -import itertools import logging import textwrap from pathlib import Path from unittest.mock import create_autospec import pytest -from databricks.sdk.service.jobs import Job, SparkPythonTask, JobSettings, Task +from databricks.sdk.service.jobs import Job, SparkPythonTask from databricks.sdk.service.pipelines import NotebookLibrary, GetPipelineResponse, PipelineLibrary, FileLibrary from databricks.labs.blueprint.paths import DBFSPath, WorkspacePath from databricks.labs.ucx.source_code.base import CurrentSessionState -from databricks.labs.ucx.source_code.directfs_access_crawler import DirectFsAccessCrawlers +from databricks.labs.ucx.source_code.directfs_access import DirectFsAccessCrawlers from databricks.labs.ucx.source_code.python_libraries import 
PythonLibraryResolver from databricks.labs.ucx.source_code.known import KnownList from databricks.sdk import WorkspaceClient @@ -23,9 +22,8 @@ Dependency, DependencyGraph, DependencyResolver, - LineageAtom, ) -from databricks.labs.ucx.source_code.jobs import JobProblem, WorkflowLinter, WorkflowTaskContainer, WorkflowTask +from databricks.labs.ucx.source_code.jobs import JobProblem, WorkflowLinter, WorkflowTaskContainer from databricks.labs.ucx.source_code.notebooks.loaders import NotebookResolver, NotebookLoader @@ -518,23 +516,3 @@ def test_xxx(graph): assert workflow_task_container.spark_conf == {"spark.databricks.cluster.profile": "singleNode"} ws.assert_not_called() - - -def test_full_lineage_is_converted_to_json(): - ws = create_autospec(WorkspaceClient) - ws.assert_not_called() - task = Task(task_key="task-key") - settings = JobSettings(name="job-name") - job = create_autospec(jobs.Job) - job.job_id = "job-id" - job.settings = settings - wtask = WorkflowTask(ws, task, job) - full_lineage = list(itertools.chain(wtask.lineage, [LineageAtom("path", "abc"), LineageAtom("path", "xyz")])) - json_str = LineageAtom.atoms_to_json_string(full_lineage) - job.assert_not_called() - assert json_str == ( - '[{"object_type": "job", "object_id": "job-id", "name": "job-name"}, ' - '{"object_type": "task", "object_id": "task-key"}, ' - '{"object_type": "path", "object_id": "abc"}, ' - '{"object_type": "path", "object_id": "xyz"}]' - ) From c5987bb982fb67e92e908c9976a26f1eae701e67 Mon Sep 17 00:00:00 2001 From: Eric Vergnaud Date: Thu, 12 Sep 2024 10:41:39 +0200 Subject: [PATCH 70/80] add view --- src/databricks/labs/ucx/queries/views/direct_fs_access.sql | 7 +++++++ 1 file changed, 7 insertions(+) create mode 100644 src/databricks/labs/ucx/queries/views/direct_fs_access.sql diff --git a/src/databricks/labs/ucx/queries/views/direct_fs_access.sql b/src/databricks/labs/ucx/queries/views/direct_fs_access.sql new file mode 100644 index 0000000000..86ce1d26c8 --- /dev/null +++ b/src/databricks/labs/ucx/queries/views/direct_fs_access.sql @@ -0,0 +1,7 @@ +SELECT + * +FROM direct_file_system_access_in_paths +UNION +SELECT + * +FROM direct_file_system_access_in_queries From b6fbeb47777d57259ae19d3f3bb31a53b353f07b Mon Sep 17 00:00:00 2001 From: Eric Vergnaud Date: Thu, 12 Sep 2024 12:30:15 +0200 Subject: [PATCH 71/80] fix failing tests --- .../labs/ucx/source_code/directfs_access.py | 6 +-- src/databricks/labs/ucx/source_code/jobs.py | 39 +++++++++---------- 2 files changed, 21 insertions(+), 24 deletions(-) diff --git a/src/databricks/labs/ucx/source_code/directfs_access.py b/src/databricks/labs/ucx/source_code/directfs_access.py index 954d5094a2..76041a2688 100644 --- a/src/databricks/labs/ucx/source_code/directfs_access.py +++ b/src/databricks/labs/ucx/source_code/directfs_access.py @@ -31,13 +31,13 @@ class DirectFsAccess: is_read: bool is_write: bool source_id: str = UNKNOWN - source_timestamp: datetime = datetime.fromtimestamp(-1) + source_timestamp: datetime = datetime.fromtimestamp(0) source_lineage: list[LineageAtom] = field(default_factory=list) job_id: int = -1 job_name: str = UNKNOWN task_key: str = UNKNOWN - assessment_start_timestamp: datetime = datetime.fromtimestamp(-1) - assessment_end_timestamp: datetime = datetime.fromtimestamp(-1) + assessment_start_timestamp: datetime = datetime.fromtimestamp(0) + assessment_end_timestamp: datetime = datetime.fromtimestamp(0) def replace_source( self, diff --git a/src/databricks/labs/ucx/source_code/jobs.py b/src/databricks/labs/ucx/source_code/jobs.py index 
144cf0304a..88029616fa 100644 --- a/src/databricks/labs/ucx/source_code/jobs.py +++ b/src/databricks/labs/ucx/source_code/jobs.py @@ -5,7 +5,7 @@ from collections.abc import Generator, Iterable from contextlib import contextmanager from dataclasses import dataclass -from datetime import datetime +from datetime import datetime, timezone from importlib import metadata from pathlib import Path from urllib import parse @@ -495,6 +495,21 @@ def _process_dependency( yield LocatedAdvice(advice, dependency.path) +def _get_path_modified_datetime(path: Path) -> datetime: + unix_time = 0.0 + if isinstance(path, WorkspacePath): + # TODO add stats method in blueprint, see https://github.com/databrickslabs/blueprint/issues/142 + # pylint: disable=protected-access + unix_time += float(path._object_info.modified_at) / 1000.0 or 0.0 + elif isinstance(path, DBFSPath): + # TODO add stats method in blueprint, see https://github.com/databrickslabs/blueprint/issues/143 + # pylint: disable=protected-access + unix_time += float(path._file_info.modification_time) / 1000.0 or 0.0 + else: + unix_time = path.stat().st_mtime + return datetime.fromtimestamp(unix_time, timezone.utc) + + class DfsaCollectorWalker(DependencyGraphWalker[DirectFsAccess]): def __init__( @@ -525,16 +540,7 @@ def _collect_from_notebook( self, source: str, language: CellLanguage, path: Path, inherited_tree: Tree | None ) -> Iterable[DirectFsAccess]: notebook = Notebook.parse(path, source, language.language) - if isinstance(path, WorkspacePath): - # TODO add stats method in blueprint, see https://github.com/databrickslabs/blueprint/issues/142 - # pylint: disable=protected-access - src_timestamp = datetime.fromtimestamp(path._object_info.modified_at or -1) - elif isinstance(path, DBFSPath): - # TODO add stats method in blueprint, see https://github.com/databrickslabs/blueprint/issues/143 - # pylint: disable=protected-access - src_timestamp = datetime.fromtimestamp(path._file_info.modification_time or -1) - else: - src_timestamp = datetime.fromtimestamp(path.stat().st_mtime) + src_timestamp = _get_path_modified_datetime(path) src_id = str(path) for cell in notebook.cells: for dfsa in self._collect_from_source(cell.original_code, cell.language, path, inherited_tree): @@ -556,16 +562,7 @@ def _collect_from_source( if iterable is None: logger.warning(f"Language {language.name} not supported yet!") return - if isinstance(path, WorkspacePath): - # TODO add stats method in blueprint, see https://github.com/databrickslabs/blueprint/issues/142 - # pylint: disable=protected-access - src_timestamp = datetime.fromtimestamp(path._object_info.modified_at or -1) - elif isinstance(path, DBFSPath): - # TODO add stats method in blueprint, see https://github.com/databrickslabs/blueprint/issues/143 - # pylint: disable=protected-access - src_timestamp = datetime.fromtimestamp(path._file_info.modification_time or -1) - else: - src_timestamp = datetime.fromtimestamp(path.stat().st_mtime) + src_timestamp = _get_path_modified_datetime(path) src_id = str(path) for dfsa in iterable: yield dfsa.replace_source(source_id=src_id, source_lineage=self.lineage, source_timestamp=src_timestamp) From cc194b605230713fb2ca0ab4e4626630fbebcd88 Mon Sep 17 00:00:00 2001 From: Eric Vergnaud Date: Thu, 12 Sep 2024 12:36:13 +0200 Subject: [PATCH 72/80] formatting --- src/databricks/labs/ucx/source_code/jobs.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/databricks/labs/ucx/source_code/jobs.py b/src/databricks/labs/ucx/source_code/jobs.py index 
88029616fa..50256f02b4 100644 --- a/src/databricks/labs/ucx/source_code/jobs.py +++ b/src/databricks/labs/ucx/source_code/jobs.py @@ -500,11 +500,11 @@ def _get_path_modified_datetime(path: Path) -> datetime: if isinstance(path, WorkspacePath): # TODO add stats method in blueprint, see https://github.com/databrickslabs/blueprint/issues/142 # pylint: disable=protected-access - unix_time += float(path._object_info.modified_at) / 1000.0 or 0.0 + unix_time += float(path._object_info.modified_at) / 1000.0 if path._object_info.modified_at else 0.0 elif isinstance(path, DBFSPath): # TODO add stats method in blueprint, see https://github.com/databrickslabs/blueprint/issues/143 # pylint: disable=protected-access - unix_time += float(path._file_info.modification_time) / 1000.0 or 0.0 + unix_time += float(path._file_info.modification_time) / 1000.0 if path._file_info.modification_time else 0.0 else: unix_time = path.stat().st_mtime return datetime.fromtimestamp(unix_time, timezone.utc) From a5ada24144a21e6be5f51577db3846c715dc719b Mon Sep 17 00:00:00 2001 From: Eric Vergnaud Date: Thu, 12 Sep 2024 15:09:57 +0200 Subject: [PATCH 73/80] install added table --- src/databricks/labs/ucx/install.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/databricks/labs/ucx/install.py b/src/databricks/labs/ucx/install.py index 42981c768f..b1631deb6b 100644 --- a/src/databricks/labs/ucx/install.py +++ b/src/databricks/labs/ucx/install.py @@ -75,6 +75,7 @@ from databricks.labs.ucx.installer.workflows import WorkflowsDeployment from databricks.labs.ucx.recon.migration_recon import ReconResult from databricks.labs.ucx.runtime import Workflows +from databricks.labs.ucx.source_code.directfs_access import DirectFsAccess from databricks.labs.ucx.source_code.jobs import JobProblem from databricks.labs.ucx.workspace_access.base import Permissions from databricks.labs.ucx.workspace_access.generic import WorkspaceObjectInfo @@ -120,6 +121,7 @@ def deploy_schema(sql_backend: SqlBackend, inventory_schema: str): functools.partial(table, "udfs", Udf), functools.partial(table, "logs", LogRecord), functools.partial(table, "recon_results", ReconResult), + functools.partial(table, "direct_file_system_access_in_paths", DirectFsAccess), # direct_file_system_access_in_queries will be added in upcoming PR ], ) deployer.deploy_view("grant_detail", "queries/views/grant_detail.sql") @@ -128,6 +130,7 @@ def deploy_schema(sql_backend: SqlBackend, inventory_schema: str): deployer.deploy_view("misc_patterns", "queries/views/misc_patterns.sql") deployer.deploy_view("code_patterns", "queries/views/code_patterns.sql") deployer.deploy_view("reconciliation_results", "queries/views/reconciliation_results.sql") + # direct_file_system_access view will be added in upcoming PR def extract_major_minor(version_string): From 2ce8e42163e1e9b2e086de8c4e5258b70c62eecb Mon Sep 17 00:00:00 2001 From: Eric Vergnaud Date: Thu, 12 Sep 2024 17:34:32 +0200 Subject: [PATCH 74/80] address verbal comments from @asnare --- src/databricks/labs/ucx/source_code/directfs_access.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/databricks/labs/ucx/source_code/directfs_access.py b/src/databricks/labs/ucx/source_code/directfs_access.py index 81c59bb485..104cb634d1 100644 --- a/src/databricks/labs/ucx/source_code/directfs_access.py +++ b/src/databricks/labs/ucx/source_code/directfs_access.py @@ -111,6 +111,7 @@ def __init__(self, backend: SqlBackend, schema: str, table: str): def append(self, dfsas: Sequence[DirectFsAccess]): try: + # TODO until 
we historize data, we append all DFSAs self._update_snapshot(dfsas, mode="append") except DatabricksError as e: logger.error("Failed to store DFSAs", exc_info=e) @@ -120,7 +121,7 @@ def _try_fetch(self) -> Iterable[DirectFsAccess]: yield from self._backend.fetch(sql) def _crawl(self) -> Iterable[DirectFsAccess]: - return [] + raise NotImplementedError() class DirectFsAccessCrawlers: From 145fbae075524af485b45db6dbbe8ba9a4265c96 Mon Sep 17 00:00:00 2001 From: Eric Vergnaud Date: Mon, 16 Sep 2024 10:33:37 +0200 Subject: [PATCH 75/80] rename table and drop unused view --- src/databricks/labs/ucx/install.py | 4 ++-- src/databricks/labs/ucx/queries/views/direct_fs_access.sql | 7 ------- src/databricks/labs/ucx/source_code/directfs_access.py | 4 ++-- 3 files changed, 4 insertions(+), 11 deletions(-) delete mode 100644 src/databricks/labs/ucx/queries/views/direct_fs_access.sql diff --git a/src/databricks/labs/ucx/install.py b/src/databricks/labs/ucx/install.py index 0888aee1b2..0529c1b94d 100644 --- a/src/databricks/labs/ucx/install.py +++ b/src/databricks/labs/ucx/install.py @@ -122,8 +122,8 @@ def deploy_schema(sql_backend: SqlBackend, inventory_schema: str): functools.partial(table, "logs", LogRecord), functools.partial(table, "recon_results", ReconResult), functools.partial( - table, "direct_file_system_access_in_paths", DirectFsAccess - ), # direct_file_system_access_in_queries will be added in upcoming PR + table, "directfs_in_paths", DirectFsAccess + ), # directfs_in_queries will be added in upcoming PR ], ) deployer.deploy_view("grant_detail", "queries/views/grant_detail.sql") diff --git a/src/databricks/labs/ucx/queries/views/direct_fs_access.sql b/src/databricks/labs/ucx/queries/views/direct_fs_access.sql deleted file mode 100644 index 86ce1d26c8..0000000000 --- a/src/databricks/labs/ucx/queries/views/direct_fs_access.sql +++ /dev/null @@ -1,7 +0,0 @@ -SELECT - * -FROM direct_file_system_access_in_paths -UNION -SELECT - * -FROM direct_file_system_access_in_queries diff --git a/src/databricks/labs/ucx/source_code/directfs_access.py b/src/databricks/labs/ucx/source_code/directfs_access.py index 104cb634d1..8d8abf8f90 100644 --- a/src/databricks/labs/ucx/source_code/directfs_access.py +++ b/src/databricks/labs/ucx/source_code/directfs_access.py @@ -131,7 +131,7 @@ def __init__(self, sql_backend: SqlBackend, schema: str): self._schema = schema def for_paths(self) -> _DirectFsAccessCrawler: - return _DirectFsAccessCrawler(self._sql_backend, self._schema, "direct_file_system_access_in_paths") + return _DirectFsAccessCrawler(self._sql_backend, self._schema, "directfs_in_paths") def for_queries(self) -> _DirectFsAccessCrawler: - return _DirectFsAccessCrawler(self._sql_backend, self._schema, "direct_file_system_access_in_queries") + return _DirectFsAccessCrawler(self._sql_backend, self._schema, "directfs_in_queries") From 810e3566f04aca022afabe6c6de8e718b40622c9 Mon Sep 17 00:00:00 2001 From: Eric Vergnaud Date: Mon, 16 Sep 2024 10:36:39 +0200 Subject: [PATCH 76/80] rename method that is not yet in line with new crawler design --- src/databricks/labs/ucx/source_code/directfs_access.py | 2 +- src/databricks/labs/ucx/source_code/jobs.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/databricks/labs/ucx/source_code/directfs_access.py b/src/databricks/labs/ucx/source_code/directfs_access.py index 8d8abf8f90..841bb837a8 100644 --- a/src/databricks/labs/ucx/source_code/directfs_access.py +++ b/src/databricks/labs/ucx/source_code/directfs_access.py @@ -109,7 +109,7 @@ def 
__init__(self, backend: SqlBackend, schema: str, table: str): """ super().__init__(backend, "hive_metastore", schema, table, DirectFsAccess) - def append(self, dfsas: Sequence[DirectFsAccess]): + def dump_all(self, dfsas: Sequence[DirectFsAccess]): try: # TODO until we historize data, we append all DFSAs self._update_snapshot(dfsas, mode="append") diff --git a/src/databricks/labs/ucx/source_code/jobs.py b/src/databricks/labs/ucx/source_code/jobs.py index a58048aabe..de931cad2b 100644 --- a/src/databricks/labs/ucx/source_code/jobs.py +++ b/src/databricks/labs/ucx/source_code/jobs.py @@ -369,7 +369,7 @@ def refresh_report(self, sql_backend: SqlBackend, inventory_database: str): JobProblem, mode='overwrite', ) - self._directfs_crawlers.for_paths().append(job_dfsas) + self._directfs_crawlers.for_paths().dump_all(job_dfsas) if len(errors) > 0: raise ManyError(errors) From 9f24f9795acfadb53b5d44b15c6a7af8825a5067 Mon Sep 17 00:00:00 2001 From: Eric Vergnaud Date: Mon, 16 Sep 2024 10:42:07 +0200 Subject: [PATCH 77/80] rename method that is not yet in line with new crawler design --- tests/unit/source_code/test_directfs_access.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/unit/source_code/test_directfs_access.py b/tests/unit/source_code/test_directfs_access.py index 6adfff7cc8..7807e05f24 100644 --- a/tests/unit/source_code/test_directfs_access.py +++ b/tests/unit/source_code/test_directfs_access.py @@ -24,6 +24,6 @@ def test_crawler_appends_dfsas(): ) for path in ("a", "b", "c") ) - crawler.append(dfsas) + crawler.dump_all(dfsas) rows = backend.rows_written_for(crawler.full_name, "append") assert len(rows) == 3 From b93b565907d40fc57c2d546741e438905031cbb0 Mon Sep 17 00:00:00 2001 From: Eric Vergnaud Date: Mon, 16 Sep 2024 11:18:43 +0200 Subject: [PATCH 78/80] Update src/databricks/labs/ucx/source_code/jobs.py Co-authored-by: Andrew Snare --- src/databricks/labs/ucx/source_code/jobs.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/databricks/labs/ucx/source_code/jobs.py b/src/databricks/labs/ucx/source_code/jobs.py index de931cad2b..82610f8dbb 100644 --- a/src/databricks/labs/ucx/source_code/jobs.py +++ b/src/databricks/labs/ucx/source_code/jobs.py @@ -80,7 +80,7 @@ def __repr__(self): @property def lineage(self) -> list[LineageAtom]: - job_name = ("" if self._job.settings is None else self._job.settings.name) or "unknown job" + job_name = (None if self._job.settings is None else self._job.settings.name) or "unknown job" job_lineage = LineageAtom("job", str(self._job.job_id), {"name": job_name}) task_lineage = LineageAtom("task", self._task.task_key) return [job_lineage, task_lineage] From 3f846a85a7ba362ccf2eafc253de1f1fa6248a4c Mon Sep 17 00:00:00 2001 From: Eric Vergnaud Date: Mon, 16 Sep 2024 11:24:23 +0200 Subject: [PATCH 79/80] document design decision --- src/databricks/labs/ucx/source_code/directfs_access.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/databricks/labs/ucx/source_code/directfs_access.py b/src/databricks/labs/ucx/source_code/directfs_access.py index 841bb837a8..342cbd7104 100644 --- a/src/databricks/labs/ucx/source_code/directfs_access.py +++ b/src/databricks/labs/ucx/source_code/directfs_access.py @@ -110,6 +110,10 @@ def __init__(self, backend: SqlBackend, schema: str, table: str): super().__init__(backend, "hive_metastore", schema, table, DirectFsAccess) def dump_all(self, dfsas: Sequence[DirectFsAccess]): + """This crawler doesn't follow the pull model because the fetcher fetches data for 2 crawlers, not 
just one + It's not **bad** because all records are pushed at once. + Providing a multi-entity crawler is out-of-scope of this PR + """ try: # TODO until we historize data, we append all DFSAs self._update_snapshot(dfsas, mode="append") From 67f9d6846cac05b52657c68f0230d4fb00a80f12 Mon Sep 17 00:00:00 2001 From: Eric Vergnaud Date: Mon, 16 Sep 2024 11:34:19 +0200 Subject: [PATCH 80/80] simplify --- src/databricks/labs/ucx/source_code/jobs.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/src/databricks/labs/ucx/source_code/jobs.py b/src/databricks/labs/ucx/source_code/jobs.py index de931cad2b..4f98e24bec 100644 --- a/src/databricks/labs/ucx/source_code/jobs.py +++ b/src/databricks/labs/ucx/source_code/jobs.py @@ -496,15 +496,14 @@ def _process_dependency( def _get_path_modified_datetime(path: Path) -> datetime: - unix_time = 0.0 if isinstance(path, WorkspacePath): # TODO add stats method in blueprint, see https://github.com/databrickslabs/blueprint/issues/142 # pylint: disable=protected-access - unix_time += float(path._object_info.modified_at) / 1000.0 if path._object_info.modified_at else 0.0 + unix_time = float(path._object_info.modified_at) / 1000.0 if path._object_info.modified_at else 0.0 elif isinstance(path, DBFSPath): # TODO add stats method in blueprint, see https://github.com/databrickslabs/blueprint/issues/143 # pylint: disable=protected-access - unix_time += float(path._file_info.modification_time) / 1000.0 if path._file_info.modification_time else 0.0 + unix_time = float(path._file_info.modification_time) / 1000.0 if path._file_info.modification_time else 0.0 else: unix_time = path.stat().st_mtime return datetime.fromtimestamp(unix_time, timezone.utc)
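
Note on the timestamp handling converged on by the "fix failing tests" and "simplify" patches above: workspace and DBFS object metadata report modification times in milliseconds since the Unix epoch, while Path.stat() reports seconds, and the helper normalises both into a timezone-aware UTC datetime. The conversion is plain standard-library Python; the sketch below restates the pattern outside the UCX codebase (the helper name and the sample value are illustrative, not taken from the patches):

from datetime import datetime, timezone

def millis_to_utc_datetime(millis: int | None) -> datetime:
    # Workspace/DBFS metadata carries modification times in milliseconds since
    # the Unix epoch; a missing value falls back to the epoch itself.
    unix_time = float(millis) / 1000.0 if millis else 0.0
    return datetime.fromtimestamp(unix_time, timezone.utc)

# Local files already report seconds, so they can be passed straight through:
#   datetime.fromtimestamp(path.stat().st_mtime, timezone.utc)
print(millis_to_utc_datetime(1726480459000))  # an aware datetime in UTC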
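
Note on the lineage refactoring earlier in the series: LineageAtom moves out of graph.py into directfs_access.py, and DirectFsAccess now carries source_lineage as a list of atoms instead of a pre-serialised JSON string. The sketch below is a minimal reconstruction of that data shape based on the dataclass removed from graph.py and the job/task lineage built by WorkflowTask.lineage; the lineage_to_json helper is only an illustration of how a flat JSON form could still be derived downstream and is not part of the patches:

import json
from dataclasses import dataclass

@dataclass
class LineageAtom:
    object_type: str
    object_id: str
    other: dict[str, str] | None = None

# A job -> task chain, mirroring what WorkflowTask.lineage produces above.
lineage = [
    LineageAtom("job", "1234", {"name": "job-name"}),
    LineageAtom("task", "task-key"),
]

def lineage_to_json(atoms: list[LineageAtom]) -> str:
    # Illustrative only: the patches pass the list around as-is rather than
    # serialising it eagerly at collection time.
    objects = [
        {"object_type": a.object_type, "object_id": a.object_id, **(a.other or {})}
        for a in atoms
    ]
    return json.dumps(objects)

print(lineage_to_json(lineage))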